In [1]:
#dependencies
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import db_password
import matplotlib.pyplot as plt
import psycopg2

In [41]:
# GET Tabled input
# creating database engine
db_name = 'Company_Stocks_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)
# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);
# sort the dataframe by ticker column
stock_df.sort_values(by=["ticker"])
# Print the DataFrame
stock_df.columns.to_list()
# ticker, date_val, region, open_val, high_val, low_val, close_val, volume_weight

['ticker',
 'date_val',
 'company_name',
 'company_url',
 'employee_count',
 'revenue',
 'sector',
 'city_name',
 'state_name',
 'region',
 'country_code',
 'latitude',
 'longitude',
 'open_val',
 'high_val',
 'low_val',
 'close_val',
 'volume',
 'volume_weight',
 'number_of_transactions',
 'percent_change']

In [3]:
# x = stock_df['date_val']
# y = stock_df['close_val']
# plt.plot(x, y)
# plt.show()

In [4]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-08'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'
# iteration controls
day_range_of_iter = 3

# Convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop throw-aways 
stock_df.drop(["longitude", "latitude", "company_name", "company_url","date_val"], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change,date
501,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.410,111.71,106.850,111.05,102310329.0,109.6319,602679.0,2.435200,2022-03-08
502,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.890,109.07,103.070,106.46,102557375.0,105.3382,639388.0,2.231610,2022-03-09
503,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.130,108.19,104.080,104.29,87584432.0,105.9691,542478.0,3.551281,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,443.800,453.11,438.930,450.87,2905656.0,447.8637,67082.0,1.593060,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,444.680,447.65,433.010,438.95,2686310.0,437.7568,66371.0,1.288567,2022-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50368,ZM,1k-5k,100m-200m,Technology,San Jose,CA,W,US,105.835,106.90,101.055,103.33,5030777.0,103.3206,88819.0,2.366892,2022-03-09
50369,ZM,1k-5k,100m-200m,Technology,San Jose,CA,W,US,103.480,103.49,97.900,98.12,6454629.0,99.6973,104681.0,5.179745,2022-03-10
50871,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,203.840,213.57,199.120,212.35,3050554.0,209.3268,45960.0,4.174843,2022-03-08
50872,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,212.130,213.51,204.870,208.41,2305091.0,208.7971,40754.0,1.753642,2022-03-09


In [5]:
# drop fields that will not be used to represent a period of time
stock_df.drop(columns = ['number_of_transactions', 'city_name', 'state_name', 'number_of_transactions', 'percent_change'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,date
501,AMD,5k-10k,over-1b,Technology,W,US,108.41,111.71,106.85,111.05,102310329.0,109.6319,2022-03-08
502,AMD,5k-10k,over-1b,Technology,W,US,108.89,109.07,103.07,106.46,102557375.0,105.3382,2022-03-09
503,AMD,5k-10k,over-1b,Technology,W,US,108.13,108.19,104.08,104.29,87584432.0,105.9691,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,W,US,443.8,453.11,438.93,450.87,2905656.0,447.8637,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,W,US,444.68,447.65,433.01,438.95,2686310.0,437.7568,2022-03-09
1008,ADBE,over-10k,1m-10m,Technology,W,US,439.66,440.61,415.43,416.38,4434498.0,422.5279,2022-03-10
1321,ABNB,5k-10k,200m-1b,Technology,W,US,145.43,150.99,145.33,148.31,7023908.0,148.5454,2022-03-08
1322,ABNB,5k-10k,200m-1b,Technology,W,US,145.22,152.89,144.4092,151.8,5302511.0,149.8916,2022-03-09
1323,ABNB,5k-10k,200m-1b,Technology,W,US,154.13,155.0,145.0,145.14,4577255.0,147.8527,2022-03-10
1825,ALGN,over-10k,200m-1b,Technology,SW,US,433.22,444.84,426.31,442.05,694358.0,438.9188,2022-03-08


In [6]:
# unique days in df
unique_days = len(pd.unique(stock_df['date']))
print("unique number of days(number of days in df):", unique_days)

# unique stocks in df
unique_stocks = len(pd.unique(stock_df['ticker']))
print(unique_stocks)

# interation sets
iteration_sets = (unique_days - day_range_of_iter + 1)
print("iteration_sets: ", iteration_sets)

# total records captured
length_of_df = len(stock_df)
print(length_of_df)


unique number of days(number of days in df): 3
102
iteration_sets:  1
306


In [7]:
# sort dataframe by date
sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
next_date_stock_df = sort_date_stock_df

# get beginning dataframe records
b = 0 
# ending record for beginning df
ending_records = iteration_sets * unique_stocks

# starting record for end
x = (unique_days - iteration_sets) * unique_stocks
max_records = unique_days * unique_stocks

begin_df = pd.DataFrame()
end_df = pd.DataFrame()
               
for rec in sort_date_stock_df.iterrows():
    
    if b < ending_records:
        new_begin_df = sort_date_stock_df.iloc[b]
        begin_df = begin_df.append(new_begin_df,ignore_index=False)
    
    if x < max_records: 
        new_end_df = next_date_stock_df.iloc[x]
        end_df = end_df.append(new_end_df,ignore_index=False)
    b=b+1
    x=x+1
   
begin_df.reset_index(drop=True,inplace=True)
begin_df 

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,date
0,AAPL,over-10k,over-1b,Technology,W,US,161.475,163.41,159.41,162.95,91445405.0,161.9446,2022-03-08
1,ABNB,5k-10k,200m-1b,Technology,W,US,145.43,150.99,145.33,148.31,7023908.0,148.5454,2022-03-08
2,ADBE,over-10k,1m-10m,Technology,W,US,443.8,453.11,438.93,450.87,2905656.0,447.8637,2022-03-08
3,ADI,over-10k,over-1b,Technology,SE,US,153.09,155.06,151.3,153.62,3046254.0,153.5888,2022-03-08
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,210.33,211.135,206.07,209.7,1791687.0,209.3495,2022-03-08
5,ADSK,over-10k,over-1b,Healthcare,W,US,202.71,207.755,200.98,204.88,1850028.0,205.2018,2022-03-08
6,AEP,over-10k,over-1b,Energy,MW,US,95.9,95.9,94.2,95.01,3006258.0,95.0679,2022-03-08
7,ALGN,over-10k,200m-1b,Technology,SW,US,433.22,444.84,426.31,442.05,694358.0,438.9188,2022-03-08
8,AMAT,over-10k,over-1b,Technology,W,US,127.7,129.84,126.2,128.62,7623175.0,128.1947,2022-03-08
9,AMD,5k-10k,over-1b,Technology,W,US,108.41,111.71,106.85,111.05,102310329.0,109.6319,2022-03-08


In [8]:
# # sort dataframe by date
# sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
# sort_date_stock_df

end_df.reset_index(drop=True,inplace=True)
end_df

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,date
0,AAPL,over-10k,over-1b,Technology,W,US,158.93,159.28,154.5,154.73,96917302.0,156.1598,2022-03-10
1,ABNB,5k-10k,200m-1b,Technology,W,US,154.13,155.0,145.0,145.14,4577255.0,147.8527,2022-03-10
2,ADBE,over-10k,1m-10m,Technology,W,US,439.66,440.61,415.43,416.38,4434498.0,422.5279,2022-03-10
3,ADI,over-10k,over-1b,Technology,SE,US,153.2,153.7834,147.29,147.6,3964265.0,149.4415,2022-03-10
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,210.63,212.35,206.7,207.27,1664796.0,208.5401,2022-03-10
5,ADSK,over-10k,over-1b,Healthcare,W,US,198.26,200.6258,192.11,192.31,1987062.0,195.0114,2022-03-10
6,AEP,over-10k,over-1b,Energy,MW,US,96.35,96.825,95.21,95.54,2254945.0,95.8241,2022-03-10
7,ALGN,over-10k,200m-1b,Technology,SW,US,438.16,438.16,400.0,400.67,957114.0,409.9722,2022-03-10
8,AMAT,over-10k,over-1b,Technology,W,US,130.0,130.0,123.37,123.64,6015969.0,125.9169,2022-03-10
9,AMD,5k-10k,over-1b,Technology,W,US,108.13,108.19,104.08,104.29,87584432.0,105.9691,2022-03-10


In [9]:
vwa_df = pd.merge(begin_df, end_df, left_index=True, right_index=True)

In [10]:
# drop fields that will not be used to represent a period of time
vwa_df.drop(columns = ['date_x', 'employee_count_y', 'region_y', 'revenue_y', 'sector_y', 'ticker_y', 'country_code_y', 'date_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df

Unnamed: 0,ticker_x,employee_count_x,revenue_x,sector_x,region_x,country_code_x,open_val_x,high_val_x,low_val_x,close_val_x,volume_x,volume_weight_x,open_val_y,high_val_y,low_val_y,close_val_y,volume_y,volume_weight_y
0,AAPL,over-10k,over-1b,Technology,W,US,161.475,163.41,159.41,162.95,91445405.0,161.9446,158.93,159.28,154.5,154.73,96917302.0,156.1598
1,ABNB,5k-10k,200m-1b,Technology,W,US,145.43,150.99,145.33,148.31,7023908.0,148.5454,154.13,155.0,145.0,145.14,4577255.0,147.8527
2,ADBE,over-10k,1m-10m,Technology,W,US,443.8,453.11,438.93,450.87,2905656.0,447.8637,439.66,440.61,415.43,416.38,4434498.0,422.5279
3,ADI,over-10k,over-1b,Technology,SE,US,153.09,155.06,151.3,153.62,3046254.0,153.5888,153.2,153.7834,147.29,147.6,3964265.0,149.4415
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,210.33,211.135,206.07,209.7,1791687.0,209.3495,210.63,212.35,206.7,207.27,1664796.0,208.5401
5,ADSK,over-10k,over-1b,Healthcare,W,US,202.71,207.755,200.98,204.88,1850028.0,205.2018,198.26,200.6258,192.11,192.31,1987062.0,195.0114
6,AEP,over-10k,over-1b,Energy,MW,US,95.9,95.9,94.2,95.01,3006258.0,95.0679,96.35,96.825,95.21,95.54,2254945.0,95.8241
7,ALGN,over-10k,200m-1b,Technology,SW,US,433.22,444.84,426.31,442.05,694358.0,438.9188,438.16,438.16,400.0,400.67,957114.0,409.9722
8,AMAT,over-10k,over-1b,Technology,W,US,127.7,129.84,126.2,128.62,7623175.0,128.1947,130.0,130.0,123.37,123.64,6015969.0,125.9169
9,AMD,5k-10k,over-1b,Technology,W,US,108.41,111.71,106.85,111.05,102310329.0,109.6319,108.13,108.19,104.08,104.29,87584432.0,105.9691


In [11]:
vwa_df['vwa'] = 100 - vwa_df['volume_weight_y']/vwa_df['volume_weight_x'] * 100
vwa_df['va'] = 100 - vwa_df['volume_y']/vwa_df['volume_x'] * 100

In [12]:
# unique values for each column (getting to know your data)
vwa_df.nunique()

ticker_x            102
employee_count_x      4
revenue_x             6
sector_x              9
region_x             11
country_code_x        8
open_val_x          102
high_val_x          102
low_val_x           102
close_val_x         102
volume_x            102
volume_weight_x     102
open_val_y          102
high_val_y          102
low_val_y           102
close_val_y         102
volume_y            102
volume_weight_y     102
vwa                 102
va                  102
dtype: int64

In [13]:
vwa_df.drop(columns = ['ticker_x', 'volume_x', 'volume_weight_x', 'volume_y', 'volume_weight_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df

Unnamed: 0,employee_count_x,revenue_x,sector_x,region_x,country_code_x,open_val_x,high_val_x,low_val_x,close_val_x,open_val_y,high_val_y,low_val_y,close_val_y,vwa,va
0,over-10k,over-1b,Technology,W,US,161.475,163.41,159.41,162.95,158.93,159.28,154.5,154.73,3.572086,-5.983786
1,5k-10k,200m-1b,Technology,W,US,145.43,150.99,145.33,148.31,154.13,155.0,145.0,145.14,0.466322,34.833215
2,over-10k,1m-10m,Technology,W,US,443.8,453.11,438.93,450.87,439.66,440.61,415.43,416.38,5.657034,-52.61607
3,over-10k,over-1b,Technology,SE,US,153.09,155.06,151.3,153.62,153.2,153.7834,147.29,147.6,2.700262,-30.135734
4,over-10k,over-1b,Consumer Discretionary,NE,US,210.33,211.135,206.07,209.7,210.63,212.35,206.7,207.27,0.386626,7.082208
5,over-10k,over-1b,Healthcare,W,US,202.71,207.755,200.98,204.88,198.26,200.6258,192.11,192.31,4.966038,-7.407131
6,over-10k,over-1b,Energy,MW,US,95.9,95.9,94.2,95.01,96.35,96.825,95.21,95.54,-0.795431,24.991634
7,over-10k,200m-1b,Technology,SW,US,433.22,444.84,426.31,442.05,438.16,438.16,400.0,400.67,6.594978,-37.841575
8,over-10k,over-1b,Technology,W,US,127.7,129.84,126.2,128.62,130.0,130.0,123.37,123.64,1.776829,21.083158
9,5k-10k,over-1b,Technology,W,US,108.41,111.71,106.85,111.05,108.13,108.19,104.08,104.29,3.340998,14.393363


In [14]:
stock_df = vwa_df

In [15]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = stock_df.dtypes[stock_df.dtypes == "object"].index.tolist()
stock_categories

['employee_count_x', 'revenue_x', 'sector_x', 'region_x', 'country_code_x']

In [16]:
# Checking the number of unique values in each column
stock_df[stock_categories].nunique()
# there needs to be only 10 at most in each categorie, how are we going to make this smaller...by sector ???b

employee_count_x     4
revenue_x            6
sector_x             9
region_x            11
country_code_x       8
dtype: int64

In [17]:
stock_df.columns.tolist()

['employee_count_x',
 'revenue_x',
 'sector_x',
 'region_x',
 'country_code_x',
 'open_val_x',
 'high_val_x',
 'low_val_x',
 'close_val_x',
 'open_val_y',
 'high_val_y',
 'low_val_y',
 'close_val_y',
 'vwa',
 'va']

In [18]:
stock_df.rename(columns={"employee_count_x": "employee_count", "revenue_x":"revenue", "sector_x":"sector",
            "region_x":"region", "country_code_x":"country_code"},inplace=True)

In [19]:
stock_df.columns.to_list()

['employee_count',
 'revenue',
 'sector',
 'region',
 'country_code',
 'open_val_x',
 'high_val_x',
 'low_val_x',
 'close_val_x',
 'open_val_y',
 'high_val_y',
 'low_val_y',
 'close_val_y',
 'vwa',
 'va']

In [20]:
# #----------------------------------- ONE HOT ENCODE METHOD
# stock_cat = stock_df.dtypes[stock_df.dtypes=="object"].index.tolist()
# stock_df[stock_cat].nunique()

# # Create OneHotEnocder Instance
# enc = OneHotEncoder(sparse=False)

# # Fit and transform categorical data
# encode_df = pd.DataFrame(enc.fit_transform(stock_df[stock_cat]))

# # Add the encoded variable names to encode_df
# encode_df.columns = enc.get_feature_names_out(stock_cat)
# encode_df.head()


# # -------------------------------------- OLD "BUCKETING" METHOD

stock_df["employee_count"].astype('category').cat.codes
stock_df['employee_count'] = stock_df['employee_count'].astype('category').cat.codes
stock_df['revenue'] = stock_df['revenue'].astype('category').cat.codes
stock_df['sector'] = stock_df['sector'].astype('category').cat.codes
stock_df['region'] = stock_df['region'].astype('category').cat.codes
# stock_df['city_name'] = stock_df['city_name'].astype('category').cat.codes
# stock_df['state_name'] = stock_df['state_name'].astype('category').cat.codes
stock_df['country_code'] = stock_df['country_code'].astype('category').cat.codes

In [21]:
#stock_df['sector'].value_counts()

In [22]:
#stock_df['country_code'].value_counts()

In [23]:
#stock_df["region"].value_counts()

In [24]:
vwa_df.drop(columns = ['open_val_x', 'high_val_x', 'low_val_x', 'close_val_x'], axis=1, inplace=True)

In [25]:
# # I am catagorizing my own shiza from the tiza
# #replace stock's employee count string with integer
# stock_df.loc[(stock_df['employee_count'] == '5k-10k'), 'employee_count'] = 0
# stock_df.loc[(stock_df['employee_count'] == 'over-10k'), 'employee_count'] = 1
# stock_df.loc[(stock_df['employee_count'] == '1k-5k'), 'employee_count'] = 2
# stock_df.loc[(stock_df['employee_count'] == '500-1k'), 'employee_count'] = 3

# # replace stock's revenue string with integer
# stock_df.loc[(stock_df['revenue'] == '1m-10m'), 'revenue'] = 0
# stock_df.loc[(stock_df['revenue'] == '10m-50m'), 'revenue'] = 1
# stock_df.loc[(stock_df['revenue'] == '50m-100m'), 'revenue'] = 2
# stock_df.loc[(stock_df['revenue'] == '100m-200m'), 'revenue'] = 3
# stock_df.loc[(stock_df['revenue'] == '200m-1b'), 'revenue'] = 4
# stock_df.loc[(stock_df['revenue'] == 'over-1b'), 'revenue'] = 5

# # replace stock's sector string with integer
# stock_df.loc[(stock_df['sector'] == 'Technology'), 'sector'] = 9
# stock_df.loc[(stock_df['sector'] == 'Energy'), 'sector'] = 2
# stock_df.loc[(stock_df['sector'] == 'Healthcare'), 'sector'] = 8
# stock_df.loc[(stock_df['sector'] == 'Consumer Discretionary'), 'sector'] = 7
# stock_df.loc[(stock_df['sector'] == 'Industrials'), 'sector'] = 6
# stock_df.loc[(stock_df['sector'] == 'Consumer Staples'), 'sector'] = 5
# stock_df.loc[(stock_df['sector'] == 'Communication Services'), 'sector'] = 4
# stock_df.loc[(stock_df['sector'] == 'Financials'), 'sector'] = 3
# stock_df.loc[(stock_df['sector'] == 'Utilities'), 'sector'] = 1

# # replace stock's country code string with integer (Note: China was CN and CH for some reason)
# stock_df.loc[(stock_df['country_code'] == 'US'), 'country_code'] = 7
# stock_df.loc[(stock_df['country_code'] == 'Netherlands'), 'country_code'] = 6
# stock_df.loc[(stock_df['country_code'] == 'Australia'), 'country_code'] = 1
# stock_df.loc[(stock_df['country_code'] == 'UK'), 'country_code'] = 4
# stock_df.loc[(stock_df['country_code'] == 'CH'), 'country_code'] = 5
# stock_df.loc[(stock_df['country_code'] == 'CN'), 'country_code'] = 5
# stock_df.loc[(stock_df['country_code'] == 'CA'), 'country_code'] = 3
# stock_df.loc[(stock_df['country_code'] == 'Argentina'), 'country_code'] = 2

# # replace stock's region string with integer 
# stock_df.loc[(stock_df['region'] == 'W'), 'region'] = 11
# stock_df.loc[(stock_df['region'] == 'MW'), 'region'] = 9
# stock_df.loc[(stock_df['region'] == 'SW'), 'region'] = 5
# stock_df.loc[(stock_df['region'] == 'NW'), 'region'] = 8
# stock_df.loc[(stock_df['region'] == 'SE'), 'region'] = 10
# stock_df.loc[(stock_df['region'] == 'NL'), 'region'] = 4
# stock_df.loc[(stock_df['region'] == 'AU'), 'region'] = 1
# stock_df.loc[(stock_df['region'] == 'NE'), 'region'] = 7
# stock_df.loc[(stock_df['region'] == 'GB'), 'region'] = 3
# stock_df.loc[(stock_df['region'] == 'CH'), 'region'] = 6
# stock_df.loc[(stock_df['region'] == 'CA'), 'region'] = 2


# #create buckets for vwa
# stock_df.loc[(stock_df['vwa'] < 0), 'vwa'] = 0
# stock_df.loc[(stock_df['vwa'] > 0) & (stock_df['vwa'] <= 1), 'vwa'] = 1
# stock_df.loc[(stock_df['vwa'] > 1) & (stock_df['vwa'] <= 2), 'vwa'] = 2
# stock_df.loc[(stock_df['vwa'] > 2) & (stock_df['vwa'] <= 3), 'vwa'] = 3
# stock_df.loc[(stock_df['vwa'] > 3) & (stock_df['vwa'] <= 4), 'vwa'] = 4
# stock_df.loc[(stock_df['vwa'] > 4) & (stock_df['vwa'] <= 5), 'vwa'] = 5
# stock_df.loc[(stock_df['vwa'] > 5) & (stock_df['vwa'] <= 6), 'vwa'] = 6
# # stock_df.loc[(stock_df['vwa'] > 6) & (stock_df['vwa'] <= 7), 'vwa'] = 7
# # stock_df.loc[(stock_df['vwa'] > 7) & (stock_df['vwa'] <= 8), 'vwa'] = 8
# # stock_df.loc[(stock_df['vwa'] > 8) & (stock_df['vwa'] <= 9), 'vwa'] = 9
# # stock_df.loc[(stock_df['vwa'] > 9) & (stock_df['vwa'] <= 10), 'vwa'] = 10
# stock_df.loc[(stock_df['vwa'] > 6), 'vwa'] = 7

# #create buckets for va
# stock_df.loc[(stock_df['va'] < 0), 'vwa'] = 0
# stock_df.loc[(stock_df['va'] > 0) & (stock_df['va'] <= 1), 'va'] = 1
# stock_df.loc[(stock_df['va'] > 1) & (stock_df['va'] <= 2), 'va'] = 2
# stock_df.loc[(stock_df['va'] > 2) & (stock_df['va'] <= 3), 'va'] = 3
# stock_df.loc[(stock_df['va'] > 3) & (stock_df['va'] <= 4), 'va'] = 4
# stock_df.loc[(stock_df['va'] > 4) & (stock_df['va'] <= 5), 'va'] = 5
# stock_df.loc[(stock_df['va'] > 5) & (stock_df['va'] <= 6), 'va'] = 6
# # stock_df.loc[(stock_df['va'] > 6) & (stock_df['va'] <= 7), 'va'] = 7
# # stock_df.loc[(stock_df['va'] > 7) & (stock_df['va'] <= 8), 'va'] = 8
# # stock_df.loc[(stock_df['va'] > 8) & (stock_df['va'] <= 9), 'va'] = 9
# # stock_df.loc[(stock_df['va'] > 9) & (stock_df['va'] <= 10), 'va'] = 10
# stock_df.loc[(stock_df['va'] > 6), 'va'] = 7


In [26]:
stock_df.head()

Unnamed: 0,employee_count,revenue,sector,region,country_code,open_val_y,high_val_y,low_val_y,close_val_y,vwa,va
0,3,5,7,10,7,158.93,159.28,154.5,154.73,3.572086,-5.983786
1,2,3,7,10,7,154.13,155.0,145.0,145.14,0.466322,34.833215
2,3,2,7,10,7,439.66,440.61,415.43,416.38,5.657034,-52.61607
3,3,5,7,8,7,153.2,153.7834,147.29,147.6,2.700262,-30.135734
4,3,5,1,5,7,210.63,212.35,206.7,207.27,0.386626,7.082208


In [27]:
# Check volumne weight average buckets
vwa_counts = stock_df['vwa'].value_counts()
#vwa_counts

In [28]:
stock_df.drop(columns="country_code", axis=1, inplace=True)
stock_df.iloc[:,3:9].head()

Unnamed: 0,region,open_val_y,high_val_y,low_val_y,close_val_y,vwa
0,10,158.93,159.28,154.5,154.73,3.572086
1,10,154.13,155.0,145.0,145.14,0.466322
2,10,439.66,440.61,415.43,416.38,5.657034
3,8,153.2,153.7834,147.29,147.6,2.700262
4,5,210.63,212.35,206.7,207.27,0.386626


In [29]:
stock_df.iloc[:,8].head()

0    3.572086
1    0.466322
2    5.657034
3    2.700262
4    0.386626
Name: vwa, dtype: float64

In [30]:
# create features 
# X = stock_df.drop(columns=["vwa"])
# X = pd.get_dummies(X)

# create target
# y = stock_df["vwa"]

# Features set (open_val_y, high_val_y, low_val_y, close_val_y)
X = stock_df.iloc[:,3:9].values

# Target (close_val_y)
y = stock_df.iloc[:,8].values

In [31]:
# not available for multiple features
# X.describe()

In [32]:
np.unique(y)

array([-1.27229733e+01, -4.42475483e+00, -3.62367636e+00, -2.80208361e+00,
       -1.74422995e+00, -1.35090780e+00, -1.11178837e+00, -1.04845911e+00,
       -9.59520527e-01, -9.29529437e-01, -8.89110064e-01, -7.95431476e-01,
       -7.41475745e-01, -7.37565346e-01, -7.36260090e-01, -4.00327301e-01,
       -2.49773936e-01, -6.77841256e-03, -1.84771817e-03,  4.93092189e-03,
        7.66404342e-02,  3.22502059e-01,  3.26701356e-01,  3.86626192e-01,
        3.95574089e-01,  4.16828441e-01,  4.66322081e-01,  4.89033838e-01,
        5.16957879e-01,  6.02088255e-01,  6.53519712e-01,  6.54237351e-01,
        6.66507872e-01,  7.35483194e-01,  8.65461147e-01,  8.99421117e-01,
        9.10821246e-01,  1.08586884e+00,  1.16401737e+00,  1.26183021e+00,
        1.26291741e+00,  1.36031713e+00,  1.37227277e+00,  1.51809374e+00,
        1.56568835e+00,  1.59007121e+00,  1.59189431e+00,  1.60174566e+00,
        1.65073872e+00,  1.76210609e+00,  1.77682853e+00,  1.81298969e+00,
        1.96389876e+00,  

In [33]:
# train the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.26, random_state=1)

In [34]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [35]:
# random forest regression
# n_estimator default at 100
#n_features=
random_forest = RandomForestRegressor(n_estimators=500, 
                                      max_depth=15, 
                                      min_weight_fraction_leaf=0, 
                                      criterion="squared_error",
                                      bootstrap=False,
                                      max_features='auto',
                                      random_state=1)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

In [36]:
# play with Root Mean Squared Error values

# errors = abs(y_pred - y_test)
# mape = 100 * (errors/y_test)
# accuracy = 100 - np.mean(mape)
rms = metrics.mean_squared_error(y_test, y_pred, squared=False)

print(f"r2 Score: {metrics.r2_score(y_test, y_pred)}")
print(f"mean absolute error: {metrics.mean_absolute_error(y_test, y_pred)}")
print(f"mean squared error: {metrics.mean_squared_error(y_test, y_pred)}")
print(f"root mean squared error: {rms}")
#print("Accuracy:" , round(accuracy,2), '%')

r2 Score: 0.9483493094304455
mean absolute error: 0.6200402683467785
mean squared error: 2.074249956340676
root mean squared error: 1.4402256616033045
