In [1]:
#dependencies
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import db_password
import matplotlib.pyplot as plt
import psycopg2

In [2]:
# GET Tabled input
# creating database engine
db_name = 'Company_Stocks_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)
# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);
# sort the dataframe by ticker column
stock_df.sort_values(by=["ticker"])
# Print the DataFrame
stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,region,...,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [3]:
# x = stock_df['date_val']
# y = stock_df['percent_change']
# plt.plot(x, y)
# plt.show()

In [4]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-01'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-12'
# iteration controls
day_range_of_iter = 2

# Convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop throw-aways 
stock_df.drop(["longitude", "latitude", "company_name", "company_url","date_val"], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change,date
496,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,115.47,119.48,113.31,118.28,118054509.0,116.7418,706887.0,2.433533,2022-03-01
497,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,118.13,118.26,111.05,111.98,123853721.0,113.6286,763902.0,5.206129,2022-03-02
498,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,112.00,113.00,106.81,108.41,100671339.0,109.0619,674554.0,3.205357,2022-03-03
499,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.53,109.53,102.82,102.95,92599972.0,105.5087,659639.0,5.141436,2022-03-06
500,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,102.81,109.90,100.08,105.53,135348316.0,105.1526,845843.0,2.645657,2022-03-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50869,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,228.37,229.97,204.36,204.37,4379337.0,210.5799,72096.0,10.509261,2022-03-06
50870,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,203.50,203.92,190.13,198.63,4389634.0,196.9284,71180.0,2.393120,2022-03-07
50871,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,203.84,213.57,199.12,212.35,3050554.0,209.3268,45960.0,4.174843,2022-03-08
50872,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,212.13,213.51,204.87,208.41,2305091.0,208.7971,40754.0,1.753642,2022-03-09


In [5]:
# drop fields that will not be used to represent a period of time
stock_df.drop(columns = ['open_val', 'high_val', 'low_val', 'close_val', 'number_of_transactions', 'city_name', 'state_name', 'number_of_transactions', 'percent_change'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,volume,volume_weight,date
496,AMD,5k-10k,over-1b,Technology,W,US,118054509.0,116.7418,2022-03-01
497,AMD,5k-10k,over-1b,Technology,W,US,123853721.0,113.6286,2022-03-02
498,AMD,5k-10k,over-1b,Technology,W,US,100671339.0,109.0619,2022-03-03
499,AMD,5k-10k,over-1b,Technology,W,US,92599972.0,105.5087,2022-03-06
500,AMD,5k-10k,over-1b,Technology,W,US,135348316.0,105.1526,2022-03-07
501,AMD,5k-10k,over-1b,Technology,W,US,102310329.0,109.6319,2022-03-08
502,AMD,5k-10k,over-1b,Technology,W,US,102557375.0,105.3382,2022-03-09
503,AMD,5k-10k,over-1b,Technology,W,US,87584432.0,105.9691,2022-03-10
1001,ADBE,over-10k,1m-10m,Technology,W,US,2948450.0,471.9849,2022-03-01
1002,ADBE,over-10k,1m-10m,Technology,W,US,2906318.0,462.3042,2022-03-02


In [6]:
# unique days in df
unique_days = len(pd.unique(stock_df['date']))
print("unique number of days(number of days in df):", unique_days)

# unique stocks in df
unique_stocks = len(pd.unique(stock_df['ticker']))
print(unique_stocks)

# interation sets
iteration_sets = (unique_days - day_range_of_iter + 1)
print("iteration_sets: ", iteration_sets)

# total records captured
length_of_df = len(stock_df)
print(length_of_df)


unique number of days(number of days in df): 8
102
iteration_sets:  7
816


In [7]:
# sort dataframe by date
sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
next_date_stock_df = sort_date_stock_df

# get beginning dataframe records
b = 0 
# ending record for beginning df
ending_records = iteration_sets * unique_stocks

# starting record for end
x = (unique_days - iteration_sets) * unique_stocks
max_records = unique_days * unique_stocks

begin_df = pd.DataFrame()
end_df = pd.DataFrame()
               
for rec in sort_date_stock_df.iterrows():
    
    if b < ending_records:
        new_begin_df = sort_date_stock_df.iloc[b]
        begin_df = begin_df.append(new_begin_df,ignore_index=False)
    
    if x < max_records: 
        new_end_df = next_date_stock_df.iloc[x]
        end_df = end_df.append(new_end_df,ignore_index=False)
    b=b+1
    x=x+1
   
begin_df.reset_index(drop=True,inplace=True)
begin_df 

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,volume,volume_weight,date
0,AAPL,over-10k,over-1b,Technology,W,US,79724750.0,165.8095,2022-03-01
1,ABNB,5k-10k,200m-1b,Technology,W,US,5028250.0,155.9842,2022-03-01
2,ADBE,over-10k,1m-10m,Technology,W,US,2948450.0,471.9849,2022-03-01
3,ADI,over-10k,over-1b,Technology,SE,US,2578681.0,159.8088,2022-03-01
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,1674593.0,206.1608,2022-03-01
5,ADSK,over-10k,over-1b,Healthcare,W,US,1677230.0,215.3437,2022-03-01
6,AEP,over-10k,over-1b,Energy,MW,US,3146513.0,90.9575,2022-03-01
7,ALGN,over-10k,200m-1b,Technology,SW,US,581358.0,494.8399,2022-03-01
8,AMAT,over-10k,over-1b,Technology,W,US,7008727.0,132.4731,2022-03-01
9,AMD,5k-10k,over-1b,Technology,W,US,118054509.0,116.7418,2022-03-01


In [8]:
# # sort dataframe by date
# sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
# sort_date_stock_df

end_df.reset_index(drop=True,inplace=True)
end_df

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,volume,volume_weight,date
0,AAPL,over-10k,over-1b,Technology,W,US,76678441.0,166.918,2022-03-02
1,ABNB,5k-10k,200m-1b,Technology,W,US,4045757.0,153.1107,2022-03-02
2,ADBE,over-10k,1m-10m,Technology,W,US,2906318.0,462.3042,2022-03-02
3,ADI,over-10k,over-1b,Technology,SE,US,2656102.0,159.5458,2022-03-02
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,2192173.0,206.0616,2022-03-02
5,ADSK,over-10k,over-1b,Healthcare,W,US,1730045.0,211.8108,2022-03-02
6,AEP,over-10k,over-1b,Energy,MW,US,4173651.0,93.6477,2022-03-02
7,ALGN,over-10k,200m-1b,Technology,SW,US,655912.0,481.2937,2022-03-02
8,AMAT,over-10k,over-1b,Technology,W,US,5449674.0,131.0939,2022-03-02
9,AMD,5k-10k,over-1b,Technology,W,US,123853721.0,113.6286,2022-03-02


In [9]:
vwa_df = pd.merge(begin_df, end_df, left_index=True, right_index=True)

In [10]:
# drop fields that will not be used to represent a period of time
vwa_df.drop(columns = ['date_x', 'employee_count_y', 'region_y', 'revenue_y', 'sector_y', 'ticker_y', 'country_code_y', 'date_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df

Unnamed: 0,ticker_x,employee_count_x,revenue_x,sector_x,region_x,country_code_x,volume_x,volume_weight_x,volume_y,volume_weight_y
0,AAPL,over-10k,over-1b,Technology,W,US,79724750.0,165.8095,76678441.0,166.918
1,ABNB,5k-10k,200m-1b,Technology,W,US,5028250.0,155.9842,4045757.0,153.1107
2,ADBE,over-10k,1m-10m,Technology,W,US,2948450.0,471.9849,2906318.0,462.3042
3,ADI,over-10k,over-1b,Technology,SE,US,2578681.0,159.8088,2656102.0,159.5458
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,1674593.0,206.1608,2192173.0,206.0616
5,ADSK,over-10k,over-1b,Healthcare,W,US,1677230.0,215.3437,1730045.0,211.8108
6,AEP,over-10k,over-1b,Energy,MW,US,3146513.0,90.9575,4173651.0,93.6477
7,ALGN,over-10k,200m-1b,Technology,SW,US,581358.0,494.8399,655912.0,481.2937
8,AMAT,over-10k,over-1b,Technology,W,US,7008727.0,132.4731,5449674.0,131.0939
9,AMD,5k-10k,over-1b,Technology,W,US,118054509.0,116.7418,123853721.0,113.6286


In [11]:
vwa_df['vwa'] = 100 - vwa_df['volume_weight_y']/vwa_df['volume_weight_x'] * 100
vwa_df['va'] = 100 - vwa_df['volume_y']/vwa_df['volume_x'] * 100

In [12]:
# unique values for each column (getting to know your data)
vwa_df.nunique()

ticker_x            102
employee_count_x      4
revenue_x             6
sector_x              9
region_x             11
country_code_x        8
volume_x            714
volume_weight_x     714
volume_y            714
volume_weight_y     714
vwa                 714
va                  714
dtype: int64

In [13]:
vwa_df.drop(columns = ['ticker_x', 'volume_x', 'volume_weight_x', 'volume_y', 'volume_weight_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df

Unnamed: 0,employee_count_x,revenue_x,sector_x,region_x,country_code_x,vwa,va
0,over-10k,over-1b,Technology,W,US,-0.668538,3.821033
1,5k-10k,200m-1b,Technology,W,US,1.842174,19.539462
2,over-10k,1m-10m,Technology,W,US,2.051061,1.428954
3,over-10k,over-1b,Technology,SE,US,0.164572,-3.002349
4,over-10k,over-1b,Consumer Discretionary,NE,US,0.048118,-30.907809
5,over-10k,over-1b,Healthcare,W,US,1.640587,-3.148942
6,over-10k,over-1b,Energy,MW,US,-2.957645,-32.643692
7,over-10k,200m-1b,Technology,SW,US,2.737491,-12.824112
8,over-10k,over-1b,Technology,W,US,1.041117,22.244453
9,5k-10k,over-1b,Technology,W,US,2.66674,-4.912317


In [14]:
stock_df = vwa_df

In [15]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = stock_df.dtypes[stock_df.dtypes == "object"].index.tolist()
stock_categories

['employee_count_x', 'revenue_x', 'sector_x', 'region_x', 'country_code_x']

In [16]:
# Checking the number of unique values in each column
stock_df[stock_categories].nunique()
# there needs to be only 10 at most in each categorie, how are we going to make this smaller...by sector ???b

employee_count_x     4
revenue_x            6
sector_x             9
region_x            11
country_code_x       8
dtype: int64

In [17]:
stock_df.columns.tolist()

['employee_count_x',
 'revenue_x',
 'sector_x',
 'region_x',
 'country_code_x',
 'vwa',
 'va']

In [18]:
stock_df.rename(columns={"employee_count_x": "employee_count", "revenue_x":"revenue", "sector_x":"sector",
            "region_x":"region", "country_code_x":"country_code"},inplace=True)

In [19]:
stock_df.columns

Index(['employee_count', 'revenue', 'sector', 'region', 'country_code', 'vwa',
       'va'],
      dtype='object')

In [20]:
# #----------------------------------- ONE HOT ENCODE METHOD
# stock_cat = stock_df.dtypes[stock_df.dtypes=="object"].index.tolist()
# stock_df[stock_cat].nunique()

# # Create OneHotEnocder Instance
# enc = OneHotEncoder(sparse=False)

# # Fit and transform categorical data
# encode_df = pd.DataFrame(enc.fit_transform(stock_df[stock_cat]))

# # Add the encoded variable names to encode_df
# encode_df.columns = enc.get_feature_names_out(stock_cat)
# encode_df.head()


#-------------------------------------- OLD "BUCKETING" METHOD

# stock_df["employee_count"].astype('category').cat.codes
# stock_df['employee_count'] = stock_df['employee_count'].astype('category').cat.codes
# stock_df['revenue'] = stock_df['revenue'].astype('category').cat.codes
# stock_df['sector'] = stock_df['sector'].astype('category').cat.codes
# stock_df['region'] = stock_df['region'].astype('category').cat.codes
# #stock_df['city_name'] = stock_df['city_name'].astype('category').cat.codes
# #stock_df['state_name'] = stock_df['state_name'].astype('category').cat.codes
# stock_df['country_code'] = stock_df['country_code'].astype('category').cat.codes

In [21]:
stock_df['sector'].value_counts()

Technology                371
Healthcare                 98
Consumer Discretionary     70
Industrials                63
Consumer Staples           49
Communication Services     35
Financials                 14
Energy                      7
Utilities                   7
Name: sector, dtype: int64

In [22]:
stock_df['country_code'].value_counts()

US             651
Netherlands     14
CH              14
UK               7
CA               7
Argentina        7
CN               7
Australia        7
Name: country_code, dtype: int64

In [23]:
stock_df["region"].value_counts()

W     294
SE    182
MW     63
NW     63
NE     42
CH     21
SW     14
NL     14
GB      7
CA      7
AU      7
Name: region, dtype: int64

In [24]:
# I am catagorizing my own shiza from the tiza
#replace stock's employee count string with integer
stock_df.loc[(stock_df['employee_count'] == '5k-10k'), 'employee_count'] = 0
stock_df.loc[(stock_df['employee_count'] == 'over-10k'), 'employee_count'] = 1
stock_df.loc[(stock_df['employee_count'] == '1k-5k'), 'employee_count'] = 2
stock_df.loc[(stock_df['employee_count'] == '500-1k'), 'employee_count'] = 3

# replace stock's revenue string with integer
stock_df.loc[(stock_df['revenue'] == '1m-10m'), 'revenue'] = 0
stock_df.loc[(stock_df['revenue'] == '10m-50m'), 'revenue'] = 1
stock_df.loc[(stock_df['revenue'] == '50m-100m'), 'revenue'] = 2
stock_df.loc[(stock_df['revenue'] == '100m-200m'), 'revenue'] = 3
stock_df.loc[(stock_df['revenue'] == '200m-1b'), 'revenue'] = 4
stock_df.loc[(stock_df['revenue'] == 'over-1b'), 'revenue'] = 5

# replace stock's sector string with integer
stock_df.loc[(stock_df['sector'] == 'Technology'), 'sector'] = 9
stock_df.loc[(stock_df['sector'] == 'Energy'), 'sector'] = 2
stock_df.loc[(stock_df['sector'] == 'Healthcare'), 'sector'] = 8
stock_df.loc[(stock_df['sector'] == 'Consumer Discretionary'), 'sector'] = 7
stock_df.loc[(stock_df['sector'] == 'Industrials'), 'sector'] = 6
stock_df.loc[(stock_df['sector'] == 'Consumer Staples'), 'sector'] = 5
stock_df.loc[(stock_df['sector'] == 'Communication Services'), 'sector'] = 4
stock_df.loc[(stock_df['sector'] == 'Financials'), 'sector'] = 3
stock_df.loc[(stock_df['sector'] == 'Utilities'), 'sector'] = 1

# replace stock's country code string with integer (Note: China was CN and CH for some reason)
stock_df.loc[(stock_df['country_code'] == 'US'), 'country_code'] = 7
stock_df.loc[(stock_df['country_code'] == 'Netherlands'), 'country_code'] = 6
stock_df.loc[(stock_df['country_code'] == 'Australia'), 'country_code'] = 1
stock_df.loc[(stock_df['country_code'] == 'UK'), 'country_code'] = 4
stock_df.loc[(stock_df['country_code'] == 'CH'), 'country_code'] = 5
stock_df.loc[(stock_df['country_code'] == 'CN'), 'country_code'] = 5
stock_df.loc[(stock_df['country_code'] == 'CA'), 'country_code'] = 3
stock_df.loc[(stock_df['country_code'] == 'Argentina'), 'country_code'] = 2

# replace stock's region string with integer 
stock_df.loc[(stock_df['region'] == 'W'), 'region'] = 11
stock_df.loc[(stock_df['region'] == 'MW'), 'region'] = 9
stock_df.loc[(stock_df['region'] == 'SW'), 'region'] = 5
stock_df.loc[(stock_df['region'] == 'NW'), 'region'] = 8
stock_df.loc[(stock_df['region'] == 'SE'), 'region'] = 10
stock_df.loc[(stock_df['region'] == 'NL'), 'region'] = 4
stock_df.loc[(stock_df['region'] == 'AU'), 'region'] = 1
stock_df.loc[(stock_df['region'] == 'NE'), 'region'] = 7
stock_df.loc[(stock_df['region'] == 'GB'), 'region'] = 3
stock_df.loc[(stock_df['region'] == 'CH'), 'region'] = 6
stock_df.loc[(stock_df['region'] == 'CA'), 'region'] = 2


#create buckets for vwa
stock_df.loc[(stock_df['vwa'] < 0), 'vwa'] = 0
stock_df.loc[(stock_df['vwa'] > 0) & (stock_df['vwa'] <= 1), 'vwa'] = 1
stock_df.loc[(stock_df['vwa'] > 1) & (stock_df['vwa'] <= 2), 'vwa'] = 2
stock_df.loc[(stock_df['vwa'] > 2) & (stock_df['vwa'] <= 3), 'vwa'] = 3
stock_df.loc[(stock_df['vwa'] > 3) & (stock_df['vwa'] <= 4), 'vwa'] = 4
stock_df.loc[(stock_df['vwa'] > 4) & (stock_df['vwa'] <= 5), 'vwa'] = 5
stock_df.loc[(stock_df['vwa'] > 5) & (stock_df['vwa'] <= 6), 'vwa'] = 6
# stock_df.loc[(stock_df['vwa'] > 6) & (stock_df['vwa'] <= 7), 'vwa'] = 7
# stock_df.loc[(stock_df['vwa'] > 7) & (stock_df['vwa'] <= 8), 'vwa'] = 8
# stock_df.loc[(stock_df['vwa'] > 8) & (stock_df['vwa'] <= 9), 'vwa'] = 9
# stock_df.loc[(stock_df['vwa'] > 9) & (stock_df['vwa'] <= 10), 'vwa'] = 10
stock_df.loc[(stock_df['vwa'] > 6), 'vwa'] = 7

#create buckets for va
stock_df.loc[(stock_df['va'] < 0), 'vwa'] = 0
stock_df.loc[(stock_df['va'] > 0) & (stock_df['va'] <= 1), 'va'] = 1
stock_df.loc[(stock_df['va'] > 1) & (stock_df['va'] <= 2), 'va'] = 2
stock_df.loc[(stock_df['va'] > 2) & (stock_df['va'] <= 3), 'va'] = 3
stock_df.loc[(stock_df['va'] > 3) & (stock_df['va'] <= 4), 'va'] = 4
stock_df.loc[(stock_df['va'] > 4) & (stock_df['va'] <= 5), 'va'] = 5
stock_df.loc[(stock_df['va'] > 5) & (stock_df['va'] <= 6), 'va'] = 6
# stock_df.loc[(stock_df['va'] > 6) & (stock_df['va'] <= 7), 'va'] = 7
# stock_df.loc[(stock_df['va'] > 7) & (stock_df['va'] <= 8), 'va'] = 8
# stock_df.loc[(stock_df['va'] > 8) & (stock_df['va'] <= 9), 'va'] = 9
# stock_df.loc[(stock_df['va'] > 9) & (stock_df['va'] <= 10), 'va'] = 10
stock_df.loc[(stock_df['va'] > 6), 'va'] = 7


In [25]:
stock_df

Unnamed: 0,employee_count,revenue,sector,region,country_code,vwa,va
0,1,5,9,11,7,0.0,4.0
1,0,4,9,11,7,2.0,7.0
2,1,0,9,11,7,3.0,2.0
3,1,5,9,10,7,0.0,-3.002349
4,1,5,7,7,7,0.0,-30.907809
5,1,5,8,11,7,0.0,-3.148942
6,1,5,2,9,7,0.0,-32.643692
7,1,4,9,5,7,0.0,-12.824112
8,1,5,9,11,7,2.0,7.0
9,0,5,9,11,7,0.0,-4.912317


In [26]:
# Check volumne weight average buckets
vwa_counts = stock_df['vwa'].value_counts()
vwa_counts

0.0    486
1.0     79
2.0     69
3.0     48
4.0     10
5.0     10
7.0      7
6.0      5
Name: vwa, dtype: int64

In [27]:
# create features array
X = stock_df.drop(columns=["vwa"])
X = pd.get_dummies(X)

# create target
y = stock_df["vwa"]

In [28]:
#X.describe()

In [29]:
np.unique(y)

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [30]:
# train the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [31]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [32]:
# random forest regression
# n_estimator default at 100
random_forest = RandomForestRegressor(n_estimators=100, max_depth=15,random_state=1)
random_forest.fit(X_train, y_train)

RandomForestRegressor(max_depth=15, random_state=1)

In [33]:
y_pred = random_forest.predict(X_test)

In [34]:
# errors = abs(y_pred - y_test)
# mape = 100 * (errors/y_test)
# accuracy = 100 - np.mean(mape)

print(f"r2 Score: {metrics.r2_score(y_test, y_pred)}")
print(f"mean absolute error: {metrics.mean_absolute_error(y_test, y_pred)}")
#print("Accuracy:" , round(accuracy,2), '%')

r2 Score: 0.237157343643691
mean absolute error: 0.7185286008328732
