In [1]:
#dependencies
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import db_password
import psycopg2

In [2]:
# GET Tabled input
# creating database engine
db_name = 'Company_Stocks_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)
# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);
# sort the dataframe by ticker column
stock_df.sort_values(by=["ticker"])
# Print the DataFrame
stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,region,...,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [3]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-08'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'
# iteration controls
day_range_of_iter = 2

# Convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop throw-aways 
stock_df.drop(["longitude", "latitude", "company_name", "company_url","date_val"], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change,date
501,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.410,111.71,106.850,111.05,102310329.0,109.6319,602679.0,2.435200,2022-03-08
502,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.890,109.07,103.070,106.46,102557375.0,105.3382,639388.0,2.231610,2022-03-09
503,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.130,108.19,104.080,104.29,87584432.0,105.9691,542478.0,3.551281,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,443.800,453.11,438.930,450.87,2905656.0,447.8637,67082.0,1.593060,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,444.680,447.65,433.010,438.95,2686310.0,437.7568,66371.0,1.288567,2022-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50368,ZM,1k-5k,100m-200m,Technology,San Jose,CA,W,US,105.835,106.90,101.055,103.33,5030777.0,103.3206,88819.0,2.366892,2022-03-09
50369,ZM,1k-5k,100m-200m,Technology,San Jose,CA,W,US,103.480,103.49,97.900,98.12,6454629.0,99.6973,104681.0,5.179745,2022-03-10
50871,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,203.840,213.57,199.120,212.35,3050554.0,209.3268,45960.0,4.174843,2022-03-08
50872,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,212.130,213.51,204.870,208.41,2305091.0,208.7971,40754.0,1.753642,2022-03-09


In [4]:
# drop fields that will not be used to represent a period of time
stock_df.drop(columns = ['open_val', 'high_val', 'low_val', 'close_val', 'number_of_transactions', 'city_name', 'state_name', 'number_of_transactions', 'percent_change'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,volume,volume_weight,date
501,AMD,5k-10k,over-1b,Technology,W,US,102310329.0,109.6319,2022-03-08
502,AMD,5k-10k,over-1b,Technology,W,US,102557375.0,105.3382,2022-03-09
503,AMD,5k-10k,over-1b,Technology,W,US,87584432.0,105.9691,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,W,US,2905656.0,447.8637,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,W,US,2686310.0,437.7568,2022-03-09
1008,ADBE,over-10k,1m-10m,Technology,W,US,4434498.0,422.5279,2022-03-10
1321,ABNB,5k-10k,200m-1b,Technology,W,US,7023908.0,148.5454,2022-03-08
1322,ABNB,5k-10k,200m-1b,Technology,W,US,5302511.0,149.8916,2022-03-09
1323,ABNB,5k-10k,200m-1b,Technology,W,US,4577255.0,147.8527,2022-03-10
1825,ALGN,over-10k,200m-1b,Technology,SW,US,694358.0,438.9188,2022-03-08


In [5]:
# unique days in df
unique_days = len(pd.unique(stock_df['date']))
print("unique number of days(number of days in df):", unique_days)

# unique stocks in df
unique_stocks = len(pd.unique(stock_df['ticker']))
print(unique_stocks)

# interation sets
iteration_sets = (unique_days - day_range_of_iter + 1)
print("iteration_sets: ", iteration_sets)

# total records captured
length_of_df = len(stock_df)
print(length_of_df)


unique number of days(number of days in df): 3
102
iteration_sets:  2
306


In [6]:
# sort dataframe by date
sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
next_date_stock_df = sort_date_stock_df

# get beginning dataframe records
b = 0 
# ending record for beginning df
ending_records = iteration_sets * unique_stocks

# starting record for end
x = (unique_days - iteration_sets) * unique_stocks
max_records = unique_days * unique_stocks

begin_df = pd.DataFrame()
end_df = pd.DataFrame()
               
for rec in sort_date_stock_df.iterrows():
    
    if b < ending_records:
        new_begin_df = sort_date_stock_df.iloc[b]
        begin_df = begin_df.append(new_begin_df,ignore_index=False)
    
    if x < max_records: 
        new_end_df = next_date_stock_df.iloc[x]
        end_df = end_df.append(new_end_df,ignore_index=False)
    b=b+1
    x=x+1
   
begin_df.reset_index(drop=True,inplace=True)
begin_df 

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,volume,volume_weight,date
0,AAPL,over-10k,over-1b,Technology,W,US,91445405.0,161.9446,2022-03-08
1,ABNB,5k-10k,200m-1b,Technology,W,US,7023908.0,148.5454,2022-03-08
2,ADBE,over-10k,1m-10m,Technology,W,US,2905656.0,447.8637,2022-03-08
3,ADI,over-10k,over-1b,Technology,SE,US,3046254.0,153.5888,2022-03-08
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,1791687.0,209.3495,2022-03-08
5,ADSK,over-10k,over-1b,Healthcare,W,US,1850028.0,205.2018,2022-03-08
6,AEP,over-10k,over-1b,Energy,MW,US,3006258.0,95.0679,2022-03-08
7,ALGN,over-10k,200m-1b,Technology,SW,US,694358.0,438.9188,2022-03-08
8,AMAT,over-10k,over-1b,Technology,W,US,7623175.0,128.1947,2022-03-08
9,AMD,5k-10k,over-1b,Technology,W,US,102310329.0,109.6319,2022-03-08


In [7]:
# # sort dataframe by date
# sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
# sort_date_stock_df

end_df.reset_index(drop=True,inplace=True)
end_df

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,volume,volume_weight,date
0,AAPL,over-10k,over-1b,Technology,W,US,105342033.0,158.0284,2022-03-09
1,ABNB,5k-10k,200m-1b,Technology,W,US,5302511.0,149.8916,2022-03-09
2,ADBE,over-10k,1m-10m,Technology,W,US,2686310.0,437.7568,2022-03-09
3,ADI,over-10k,over-1b,Technology,SE,US,4035714.0,150.1401,2022-03-09
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,2399303.0,207.7212,2022-03-09
5,ADSK,over-10k,over-1b,Healthcare,W,US,1458925.0,199.9849,2022-03-09
6,AEP,over-10k,over-1b,Energy,MW,US,2897953.0,95.5087,2022-03-09
7,ALGN,over-10k,200m-1b,Technology,SW,US,532709.0,427.7159,2022-03-09
8,AMAT,over-10k,over-1b,Technology,W,US,5604387.0,124.5674,2022-03-09
9,AMD,5k-10k,over-1b,Technology,W,US,102557375.0,105.3382,2022-03-09


In [8]:
vwa_df = pd.merge(begin_df, end_df, left_index=True, right_index=True)

In [9]:
# drop fields that will not be used to represent a period of time
vwa_df.drop(columns = ['date_x', 'employee_count_y', 'region_y', 'revenue_y', 'sector_y', 'ticker_y', 'country_code_y', 'date_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df

Unnamed: 0,ticker_x,employee_count_x,revenue_x,sector_x,region_x,country_code_x,volume_x,volume_weight_x,volume_y,volume_weight_y
0,AAPL,over-10k,over-1b,Technology,W,US,91445405.0,161.9446,105342033.0,158.0284
1,ABNB,5k-10k,200m-1b,Technology,W,US,7023908.0,148.5454,5302511.0,149.8916
2,ADBE,over-10k,1m-10m,Technology,W,US,2905656.0,447.8637,2686310.0,437.7568
3,ADI,over-10k,over-1b,Technology,SE,US,3046254.0,153.5888,4035714.0,150.1401
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,1791687.0,209.3495,2399303.0,207.7212
5,ADSK,over-10k,over-1b,Healthcare,W,US,1850028.0,205.2018,1458925.0,199.9849
6,AEP,over-10k,over-1b,Energy,MW,US,3006258.0,95.0679,2897953.0,95.5087
7,ALGN,over-10k,200m-1b,Technology,SW,US,694358.0,438.9188,532709.0,427.7159
8,AMAT,over-10k,over-1b,Technology,W,US,7623175.0,128.1947,5604387.0,124.5674
9,AMD,5k-10k,over-1b,Technology,W,US,102310329.0,109.6319,102557375.0,105.3382


In [10]:
vwa_df['vwa'] = 100 - vwa_df['volume_weight_y']/vwa_df['volume_weight_x'] * 100
vwa_df['va'] = 100 - vwa_df['volume_y']/vwa_df['volume_x'] * 100

In [11]:
# unique values for each column (getting to know your data)
vwa_df.nunique()

ticker_x            102
employee_count_x      4
revenue_x             6
sector_x              9
region_x             11
country_code_x        8
volume_x            204
volume_weight_x     204
volume_y            204
volume_weight_y     204
vwa                 204
va                  204
dtype: int64

In [12]:
vwa_df.drop(columns = ['ticker_x', 'volume_x', 'volume_weight_x', 'volume_y', 'volume_weight_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df

Unnamed: 0,employee_count_x,revenue_x,sector_x,region_x,country_code_x,vwa,va
0,over-10k,over-1b,Technology,W,US,2.418234,-15.196639
1,5k-10k,200m-1b,Technology,W,US,-0.906255,24.507681
2,over-10k,1m-10m,Technology,W,US,2.256691,7.548932
3,over-10k,over-1b,Technology,SE,US,2.245411,-32.481205
4,over-10k,over-1b,Consumer Discretionary,NE,US,0.77779,-33.913066
5,over-10k,over-1b,Healthcare,W,US,2.542327,21.140383
6,over-10k,over-1b,Energy,MW,US,-0.463669,3.602652
7,over-10k,200m-1b,Technology,SW,US,2.552386,23.280354
8,over-10k,over-1b,Technology,W,US,2.829524,26.482247
9,5k-10k,over-1b,Technology,W,US,3.91647,-0.241467


In [13]:
stock_df = vwa_df

In [14]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = stock_df.dtypes[stock_df.dtypes == "object"].index.tolist()
stock_categories

['employee_count_x', 'revenue_x', 'sector_x', 'region_x', 'country_code_x']

In [15]:
# Checking the number of unique values in each column
stock_df[stock_categories].nunique()
# there needs to be only 10 at most in each categorie, how are we going to make this smaller...by sector ???b

employee_count_x     4
revenue_x            6
sector_x             9
region_x            11
country_code_x       8
dtype: int64

In [16]:
# I am catagorizing my own shiza from the tiza
# replace stock's employee count string with integer
stock_df.loc[(stock_df['employee_count_x'] == '5k-10k'), 'employee_count_x'] = 0
stock_df.loc[(stock_df['employee_count_x'] == 'over-10k'), 'employee_count_x'] = 1
stock_df.loc[(stock_df['employee_count_x'] == '1k-5k'), 'employee_count_x'] = 2
stock_df.loc[(stock_df['employee_count_x'] == '500-1k'), 'employee_count_x'] = 3

# replace stock's revenue string with integer
stock_df.loc[(stock_df['revenue_x'] == '1m-10m'), 'revenue_x'] = 0
stock_df.loc[(stock_df['revenue_x'] == '10m-50m'), 'revenue_x'] = 1
stock_df.loc[(stock_df['revenue_x'] == '50m-100m'), 'revenue_x'] = 2
stock_df.loc[(stock_df['revenue_x'] == '100m-200m'), 'revenue_x'] = 3
stock_df.loc[(stock_df['revenue_x'] == '200m-1b'), 'revenue_x'] = 4
stock_df.loc[(stock_df['revenue_x'] == 'over-1b'), 'revenue_x'] = 5

# replace stock's sector string with integer
stock_df.loc[(stock_df['sector_x'] == 'Technology'), 'sector_x'] = 0
stock_df.loc[(stock_df['sector_x'] == 'Energy'), 'sector_x'] = 1
stock_df.loc[(stock_df['sector_x'] == 'Healthcare'), 'sector_x'] = 2
stock_df.loc[(stock_df['sector_x'] == 'Consumer Discretionary'), 'sector_x'] = 3
stock_df.loc[(stock_df['sector_x'] == 'Industrials'), 'sector_x'] = 4
stock_df.loc[(stock_df['sector_x'] == 'Consumer Staples'), 'sector_x'] = 5
stock_df.loc[(stock_df['sector_x'] == 'Communication Services'), 'sector_x'] = 6
stock_df.loc[(stock_df['sector_x'] == 'Financials'), 'sector_x'] = 7
stock_df.loc[(stock_df['sector_x'] == 'Utilities'), 'sector_x'] = 8

# replace stock's country code string with integer (Note: China was CN and CH for some reason)
stock_df.loc[(stock_df['country_code_x'] == 'US'), 'country_code_x'] = 0
stock_df.loc[(stock_df['country_code_x'] == 'Netherlands'), 'country_code_x'] = 1
stock_df.loc[(stock_df['country_code_x'] == 'Australia'), 'country_code_x'] = 2
stock_df.loc[(stock_df['country_code_x'] == 'UK'), 'country_code_x'] = 3
stock_df.loc[(stock_df['country_code_x'] == 'CH'), 'country_code_x'] = 4
stock_df.loc[(stock_df['country_code_x'] == 'CN'), 'country_code_x'] = 4
stock_df.loc[(stock_df['country_code_x'] == 'CA'), 'country_code_x'] = 5
stock_df.loc[(stock_df['country_code_x'] == 'Argentina'), 'country_code_x'] = 6

# replace stock's region string with integer 
stock_df.loc[(stock_df['region_x'] == 'W'), 'region_x'] = 0
stock_df.loc[(stock_df['region_x'] == 'MW'), 'region_x'] = 1
stock_df.loc[(stock_df['region_x'] == 'SW'), 'region_x'] = 2
stock_df.loc[(stock_df['region_x'] == 'NW'), 'region_x'] = 3
stock_df.loc[(stock_df['region_x'] == 'SE'), 'region_x'] = 4
stock_df.loc[(stock_df['region_x'] == 'NL'), 'region_x'] = 5
stock_df.loc[(stock_df['region_x'] == 'AU'), 'region_x'] = 6
stock_df.loc[(stock_df['region_x'] == 'NE'), 'region_x'] = 7
stock_df.loc[(stock_df['region_x'] == 'GB'), 'region_x'] = 8
stock_df.loc[(stock_df['region_x'] == 'CH'), 'region_x'] = 9
stock_df.loc[(stock_df['region_x'] == 'CA'), 'region_x'] = 10


#create buckets for vwa
stock_df.loc[(stock_df['vwa'] < 0), 'vwa'] = 0
stock_df.loc[(stock_df['vwa'] > 0) & (stock_df['vwa'] <= 1), 'vwa'] = 1
stock_df.loc[(stock_df['vwa'] > 1) & (stock_df['vwa'] <= 2), 'vwa'] = 2
stock_df.loc[(stock_df['vwa'] > 2) & (stock_df['vwa'] <= 3), 'vwa'] = 3
stock_df.loc[(stock_df['vwa'] > 3) & (stock_df['vwa'] <= 4), 'vwa'] = 4
stock_df.loc[(stock_df['vwa'] > 4) & (stock_df['vwa'] <= 5), 'vwa'] = 5
stock_df.loc[(stock_df['vwa'] > 5) & (stock_df['vwa'] <= 6), 'vwa'] = 6
# stock_df.loc[(stock_df['vwa'] > 6) & (stock_df['vwa'] <= 7), 'vwa'] = 7
# stock_df.loc[(stock_df['vwa'] > 7) & (stock_df['vwa'] <= 8), 'vwa'] = 8
# stock_df.loc[(stock_df['vwa'] > 8) & (stock_df['vwa'] <= 9), 'vwa'] = 9
# stock_df.loc[(stock_df['vwa'] > 9) & (stock_df['vwa'] <= 10), 'vwa'] = 10
stock_df.loc[(stock_df['vwa'] > 6), 'vwa'] = 7

#create buckets for va
stock_df.loc[(stock_df['va'] < 0), 'vwa'] = 0
stock_df.loc[(stock_df['va'] > 0) & (stock_df['va'] <= 1), 'va'] = 1
stock_df.loc[(stock_df['va'] > 1) & (stock_df['va'] <= 2), 'va'] = 2
stock_df.loc[(stock_df['va'] > 2) & (stock_df['va'] <= 3), 'va'] = 3
stock_df.loc[(stock_df['va'] > 3) & (stock_df['va'] <= 4), 'va'] = 4
stock_df.loc[(stock_df['va'] > 4) & (stock_df['va'] <= 5), 'va'] = 5
stock_df.loc[(stock_df['va'] > 5) & (stock_df['va'] <= 6), 'va'] = 6
# stock_df.loc[(stock_df['va'] > 6) & (stock_df['va'] <= 7), 'va'] = 7
# stock_df.loc[(stock_df['va'] > 7) & (stock_df['va'] <= 8), 'va'] = 8
# stock_df.loc[(stock_df['va'] > 8) & (stock_df['va'] <= 9), 'va'] = 9
# stock_df.loc[(stock_df['va'] > 9) & (stock_df['va'] <= 10), 'va'] = 10
stock_df.loc[(stock_df['va'] > 6), 'va'] = 7
stock_df

Unnamed: 0,employee_count_x,revenue_x,sector_x,region_x,country_code_x,vwa,va
0,1,5,0,0,0,0.0,-15.196639
1,0,4,0,0,0,0.0,7.0
2,1,0,0,0,0,3.0,7.0
3,1,5,0,4,0,0.0,-32.481205
4,1,5,3,7,0,0.0,-33.913066
5,1,5,2,0,0,3.0,7.0
6,1,5,1,1,0,0.0,4.0
7,1,4,0,2,0,3.0,7.0
8,1,5,0,0,0,3.0,7.0
9,0,5,0,0,0,0.0,-0.241467


In [17]:
# Check volumne weight average buckets
vwa_counts = stock_df['vwa'].value_counts()
vwa_counts

0.0    112
2.0     34
1.0     30
3.0     20
4.0      3
5.0      2
6.0      2
7.0      1
Name: vwa, dtype: int64

In [18]:
# create features array
X = stock_df.drop(columns=["vwa"])
X = pd.get_dummies(X)

# create target
y = stock_df["vwa"]

In [19]:
X.describe()

Unnamed: 0,va,employee_count_x_0,employee_count_x_1,employee_count_x_2,employee_count_x_3,revenue_x_0,revenue_x_1,revenue_x_2,revenue_x_3,revenue_x_4,...,region_x_8,region_x_9,region_x_10,country_code_x_0,country_code_x_1,country_code_x_2,country_code_x_3,country_code_x_4,country_code_x_5,country_code_x_6
count,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,...,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0
mean,-9.644249,0.215686,0.607843,0.137255,0.039216,0.04902,0.009804,0.029412,0.039216,0.088235,...,0.009804,0.029412,0.009804,0.911765,0.019608,0.009804,0.009804,0.029412,0.009804,0.009804
std,46.483257,0.412309,0.489432,0.344963,0.194585,0.21644,0.098771,0.169373,0.194585,0.284335,...,0.098771,0.169373,0.098771,0.284335,0.138989,0.098771,0.098771,0.169373,0.098771,0.098771
min,-483.39343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-9.599165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
np.unique(y)

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [21]:
# train the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [22]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [23]:
# random forest regression
# n_estimator default at 100
random_forest = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=1)
random_forest.fit(X_train, y_train)

RandomForestRegressor(max_depth=15, random_state=1)

In [24]:
y_pred = random_forest.predict(X_test)
y_true = y_test

In [25]:
print(f"r2 Score: {metrics.r2_score(y_test, y_pred)}")
print(f"mean absolute error: {metrics.mean_absolute_error(y_test, y_pred)}")

r2 Score: 0.16728586872683948
mean absolute error: 0.809824444417475
