# METHOD FROM ARTICLE
### https://tinyurl.com/yc4de7rr

In [1]:
#dependencies
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import db_password
import psycopg2

In [2]:
# GET Tabled input
# creating database engine
db_name = 'Company_Stocks_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)
# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);
# sort the dataframe by ticker column
stock_df.sort_values(by=["ticker"])
# Print the DataFrame
stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,region,...,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [3]:
stock_df.columns.tolist()

['ticker',
 'date_val',
 'company_name',
 'company_url',
 'employee_count',
 'revenue',
 'sector',
 'city_name',
 'state_name',
 'region',
 'country_code',
 'latitude',
 'longitude',
 'open_val',
 'high_val',
 'low_val',
 'close_val',
 'volume',
 'volume_weight',
 'number_of_transactions',
 'percent_change']

In [4]:
#def get_stock(begin_date, end_date)
 # stock_df['date'] = stock_date['date_val']
 # begin_date = begin_date
 # end_date = end_date 
    #rest of code in cell below
 # for loop

In [5]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']
# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-02-10'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'

#  day interval iteration control
# day_range_of_iter = x (no. of days)

# Convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')
stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]
# drop throw-aways
stock_df.drop(["longitude", "latitude", "company_name", "company_url","date_val"], axis=1, inplace=True)
stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change,date
484,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,126.14,127.1699,111.81,113.18,164708241.0,117.6520,1209307.0,10.274298,2022-02-10
485,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,115.51,118.3700,113.46,114.27,135125910.0,116.0441,880606.0,1.073500,2022-02-13
486,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,117.19,121.8800,114.36,121.47,144139671.0,118.5217,812307.0,3.652189,2022-02-14
487,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,119.05,119.5400,114.22,117.69,119107831.0,116.8985,701937.0,1.142377,2022-02-15
488,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,116.26,116.9800,112.26,112.37,98179641.0,114.5982,595580.0,3.345949,2022-02-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50869,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,228.37,229.9700,204.36,204.37,4379337.0,210.5799,72096.0,10.509261,2022-03-06
50870,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,203.50,203.9200,190.13,198.63,4389634.0,196.9284,71180.0,2.393120,2022-03-07
50871,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,203.84,213.5700,199.12,212.35,3050554.0,209.3268,45960.0,4.174843,2022-03-08
50872,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,212.13,213.5100,204.87,208.41,2305091.0,208.7971,40754.0,1.753642,2022-03-09


In [6]:
stock_df.drop(['number_of_transactions'], axis=1, inplace=True)
stock_df
    

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,percent_change,date
484,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,126.14,127.1699,111.81,113.18,164708241.0,117.6520,10.274298,2022-02-10
485,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,115.51,118.3700,113.46,114.27,135125910.0,116.0441,1.073500,2022-02-13
486,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,117.19,121.8800,114.36,121.47,144139671.0,118.5217,3.652189,2022-02-14
487,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,119.05,119.5400,114.22,117.69,119107831.0,116.8985,1.142377,2022-02-15
488,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,116.26,116.9800,112.26,112.37,98179641.0,114.5982,3.345949,2022-02-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50869,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,228.37,229.9700,204.36,204.37,4379337.0,210.5799,10.509261,2022-03-06
50870,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,203.50,203.9200,190.13,198.63,4389634.0,196.9284,2.393120,2022-03-07
50871,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,203.84,213.5700,199.12,212.35,3050554.0,209.3268,4.174843,2022-03-08
50872,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,212.13,213.5100,204.87,208.41,2305091.0,208.7971,1.753642,2022-03-09


In [7]:
# get first stock ticker in first row
# process_list = []
# prev_ticker = stock_df.iat[0,0]
# print (prev_ticker)
# i = 0
# new_ticker_flag = 'yes'
# #for rec in stock_df.iterrows():
# for rec in stock_df.iterrows():
#     new_ticker = stock_df['ticker']
#     new_ticker = new_ticker.iloc[i]
#     print("previous ticker and new ticker: ", prev_ticker, "and", new_ticker)
#     if (prev_ticker == new_ticker):
#         if (new_ticker_flag == 'yes'):
#             new_ticker_flag = 'no'
#             print(new_ticker_flag)
#             begin_vw = stock_df['volume_weight']
#             begin_vw = begin_vw.iloc[i]
#             print("begin_vw: ", begin_vw)
#         else:
#             last_vw = stock_df['volume_weight']
#             last_vw = last_vw.iloc[i]
#             print("last_vw: ", last_vw)
#     else:
#             print ("new")
#             vw_average = 100 - (last_vw/begin_vw) * 100
#             print("vwa: ", vw_average)
            
#             process_list.append(vw_average)
            
#             prev_ticker = new_ticker
#             new_ticker_flag == 'yes'
#             begin_vw = stock_df['volume_weight']
#             begin_vw = begin_vw.iloc[i]
#     i=i+1
# vw_average = 100 - (last_vw/begin_vw) * 100
# print("vwa: ", vw_average)
# process_list.append(vw_average)
            

In [8]:
# process_list

In [9]:
# ticker, to volume columns.  
#vw_average is our target

# combine object columns from stock_df with proces_list into new DataFrame process_df.
# ticker, employee_count, revenue, sector, city_name, state_name, country_code

#stock_df.columns.tolist()

In [10]:
#len(process_list)

In [11]:
#len(stock_df)

In [12]:
# make copy for testing adding process_list as column to new dataframe for model
# stock_df = stock_df.copy(deep=True)

In [13]:
stock_df = stock_df.drop(["volume_weight", "percent_change"], axis=1)
stock_df.columns.tolist()

['ticker',
 'employee_count',
 'revenue',
 'sector',
 'city_name',
 'state_name',
 'region',
 'country_code',
 'open_val',
 'high_val',
 'low_val',
 'close_val',
 'volume',
 'date']

In [14]:
stock_df = stock_df.drop_duplicates(subset="ticker")
stock_df.sort_values(by=["ticker"])
stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,date
484,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,126.1400,127.1699,111.8100,113.18,164708241.0,2022-02-10
989,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,497.9200,499.9300,472.8952,473.97,5026970.0,2022-02-10
1304,ABNB,5k-10k,200m-1b,Technology,San Francisco,CA,W,US,175.3087,176.7400,164.8300,166.53,6464397.0,2022-02-10
1808,ALGN,over-10k,200m-1b,Technology,Tempe,AZ,SW,US,528.3800,534.4700,505.5000,509.10,1162707.0,2022-02-10
2312,AMZN,over-10k,over-1b,Technology,Seattle,WA,NW,US,3162.6902,3180.0000,3054.6750,3065.87,3855022.0,2022-02-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48838,WBA,over-10k,over-1b,Healthcare,Chicago,IL,MW,US,49.3400,49.9800,48.4100,48.70,7927409.0,2022-02-10
49343,WDAY,over-10k,over-1b,Technology,Pleasanton,CA,W,US,240.6400,243.8700,231.8600,232.77,1407668.0,2022-02-10
49846,XEL,over-10k,over-1b,Utilities,Minneapolis,MN,MW,US,67.2400,67.7200,66.7700,66.96,3546760.0,2022-02-10
50350,ZM,1k-5k,100m-200m,Technology,San Jose,CA,W,US,146.9650,150.4800,140.1600,141.24,2690370.0,2022-02-10


In [15]:
stock_df.reset_index(drop=True, inplace=True)
stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,date
0,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,126.1400,127.1699,111.8100,113.18,164708241.0,2022-02-10
1,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,497.9200,499.9300,472.8952,473.97,5026970.0,2022-02-10
2,ABNB,5k-10k,200m-1b,Technology,San Francisco,CA,W,US,175.3087,176.7400,164.8300,166.53,6464397.0,2022-02-10
3,ALGN,over-10k,200m-1b,Technology,Tempe,AZ,SW,US,528.3800,534.4700,505.5000,509.10,1162707.0,2022-02-10
4,AMZN,over-10k,over-1b,Technology,Seattle,WA,NW,US,3162.6902,3180.0000,3054.6750,3065.87,3855022.0,2022-02-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,WBA,over-10k,over-1b,Healthcare,Chicago,IL,MW,US,49.3400,49.9800,48.4100,48.70,7927409.0,2022-02-10
98,WDAY,over-10k,over-1b,Technology,Pleasanton,CA,W,US,240.6400,243.8700,231.8600,232.77,1407668.0,2022-02-10
99,XEL,over-10k,over-1b,Utilities,Minneapolis,MN,MW,US,67.2400,67.7200,66.7700,66.96,3546760.0,2022-02-10
100,ZM,1k-5k,100m-200m,Technology,San Jose,CA,W,US,146.9650,150.4800,140.1600,141.24,2690370.0,2022-02-10


In [16]:
stock_df.iloc[:,8:11]

Unnamed: 0,open_val,high_val,low_val,close_val
0,126.1400,127.1699,111.8100,113.18
1,497.9200,499.9300,472.8952,473.97
2,175.3087,176.7400,164.8300,166.53
3,528.3800,534.4700,505.5000,509.10
4,3162.6902,3180.0000,3054.6750,3065.87
...,...,...,...,...
97,49.3400,49.9800,48.4100,48.70
98,240.6400,243.8700,231.8600,232.77
99,67.2400,67.7200,66.7700,66.96
100,146.9650,150.4800,140.1600,141.24


In [17]:
stock_df.iloc[:, 11]

0       113.18
1       473.97
2       166.53
3       509.10
4      3065.87
        ...   
97       48.70
98      232.77
99       66.96
100     141.24
101     273.00
Name: close_val, Length: 102, dtype: float64

In [18]:
### HERE IS WHERE ISSUE LIES, length of dataframe is 102, 
### and the volume weight average from for loop list lenght is 101

# stock_df["volume_weight_avg"] = process_list


In [19]:
# stock_df.drop(columns="ticker", inplace=True)

In [20]:
# stock_df.drop(columns="city_name", inplace=True)

In [21]:
stock_cat = stock_df.dtypes[stock_df.dtypes=="object"].index.tolist()
stock_df[stock_cat].nunique()

ticker            102
employee_count      4
revenue             6
sector              9
city_name          70
state_name         28
region             11
country_code        8
date                1
dtype: int64

In [22]:
stock_df.reset_index(drop=True,inplace=True)

In [23]:
# #----------------------------------- ONE HOT ENCODE METHOD

# # Create OneHotEnocder Instance
# enc = OneHotEncoder(sparse=False)

# # Fit and transform categorical data
# encode_df = pd.DataFrame(enc.fit_transform(stock_df[stock_cat]))

# # Add the encoded variable names to encode_df
# encode_df.columns = enc.get_feature_names_out(stock_cat)
# encode_df.head()


# # -------------------------------------- OLD "BUCKETING" METHOD

#stock_df["employee_count"].astype('category').cat.codes
stock_df['employee_count'] = stock_df['employee_count'].astype('category').cat.codes
stock_df['revenue'] = stock_df['revenue'].astype('category').cat.codes
stock_df['sector'] = stock_df['sector'].astype('category').cat.codes
stock_df['region'] = stock_df['region'].astype('category').cat.codes
# stock_df['city_name'] = stock_df['city_name'].astype('category').cat.codes
# stock_df['state_name'] = stock_df['state_name'].astype('category').cat.codes
stock_df['country_code'] = stock_df['country_code'].astype('category').cat.codes

# # processing_df = stock_df.copy()

In [24]:
# # replace stock's employee count string with integer
# stock_df.loc[(stock_df['employee_count'] == '5k-10k'), 'employee_count'] = 0
# stock_df.loc[(stock_df['employee_count'] == 'over-10k'), 'employee_count'] = 1
# stock_df.loc[(stock_df['employee_count'] == '1k-5k'), 'employee_count'] = 2
# stock_df.loc[(stock_df['employee_count'] == '500-1k'), 'employee_count'] = 3

# # replace stock's revenue string with integer
# stock_df.loc[(stock_df['revenue'] == '1m-10m'), 'revenue'] = 0
# stock_df.loc[(stock_df['revenue'] == '10m-50m'), 'revenue'] = 1
# stock_df.loc[(stock_df['revenue'] == '50m-100m'), 'revenue'] = 2
# stock_df.loc[(stock_df['revenue'] == '100m-200m'), 'revenue'] = 3
# stock_df.loc[(stock_df['revenue'] == '200m-1b'), 'revenue'] = 4
# stock_df.loc[(stock_df['revenue'] == 'over-1b'), 'revenue'] = 5

# # replace stock's sector string with integer
# stock_df.loc[(stock_df['sector'] == 'Technology'), 'sector'] = 0
# stock_df.loc[(stock_df['sector'] == 'Energy'), 'sector'] = 1
# stock_df.loc[(stock_df['sector'] == 'Healthcare'), 'sector'] = 2
# stock_df.loc[(stock_df['sector'] == 'Consumer Discretionary'), 'sector'] = 3
# stock_df.loc[(stock_df['sector'] == 'Industrials'), 'sector'] = 4
# stock_df.loc[(stock_df['sector'] == 'Consumer Staples'), 'sector'] = 5
# stock_df.loc[(stock_df['sector'] == 'Communication Services'), 'sector'] = 6
# stock_df.loc[(stock_df['sector'] == 'Financials'), 'sector'] = 7
# stock_df.loc[(stock_df['sector'] == 'Utilities'), 'sector'] = 8

# # replace stock's country code string with integer (Note: China was CN and CH for some reason)
# stock_df.loc[(stock_df['country_code'] == 'US'), 'country_code'] = 0
# stock_df.loc[(stock_df['country_code'] == 'Netherlands'), 'country_code'] = 1
# stock_df.loc[(stock_df['country_code'] == 'Australia'), 'country_code'] = 2
# stock_df.loc[(stock_df['country_code'] == 'UK'), 'country_code'] = 3
# stock_df.loc[(stock_df['country_code'] == 'CH'), 'country_code'] = 4
# stock_df.loc[(stock_df['country_code'] == 'CN'), 'country_code'] = 4
# stock_df.loc[(stock_df['country_code'] == 'CA'), 'country_code'] = 5
# stock_df.loc[(stock_df['country_code'] == 'Argentina'), 'country_code'] = 6

# # replace stock's region string with integer 
# stock_df.loc[(stock_df['region'] == 'W'), 'region'] = 0
# stock_df.loc[(stock_df['region'] == 'MW'), 'region'] = 1
# stock_df.loc[(stock_df['region'] == 'SW'), 'region'] = 2
# stock_df.loc[(stock_df['region'] == 'NW'), 'region'] = 3
# stock_df.loc[(stock_df['region'] == 'SE'), 'region'] = 4
# stock_df.loc[(stock_df['region'] == 'NL'), 'region'] = 5
# stock_df.loc[(stock_df['region'] == 'AU'), 'region'] = 6
# stock_df.loc[(stock_df['region'] == 'NE'), 'region'] = 7
# stock_df.loc[(stock_df['region'] == 'GB'), 'region'] = 8
# stock_df.loc[(stock_df['region'] == 'CH'), 'region'] = 9
# stock_df.loc[(stock_df['region'] == 'CA'), 'region'] = 10


# # #create buckets for vwa
# # stock_df.loc[(stock_df['volume_weight_avg'] < 0), 'volume_weight_avg'] = 0
# # stock_df.loc[(stock_df['volume_weight_avg'] > 0) & (stock_df['volume_weight_avg'] <= 1), 'volume_weight_avg'] = 1
# # stock_df.loc[(stock_df['volume_weight_avg'] > 1) & (stock_df['volume_weight_avg'] <= 2), 'volume_weight_avg'] = 2
# # stock_df.loc[(stock_df['volume_weight_avg'] > 2) & (stock_df['volume_weight_avg'] <= 3), 'volume_weight_avg'] = 3
# # stock_df.loc[(stock_df['volume_weight_avg'] > 3) & (stock_df['volume_weight_avg'] <= 4), 'volume_weight_avg'] = 4
# # stock_df.loc[(stock_df['volume_weight_avg'] > 4) & (stock_df['volume_weight_avg'] <= 5), 'volume_weight_avg'] = 5
# # stock_df.loc[(stock_df['volume_weight_avg'] > 5) & (stock_df['volume_weight_avg'] <= 6), 'volume_weight_avg'] = 6
# # # stock_df.loc[(stock_df['volume_weight_avg'] > 6) & (stock_df['volume_weight_avg'] <= 7), 'volume_weight_avg'] = 7
# # # stock_df.loc[(stock_df['volume_weight_avg'] > 7) & (stock_df['volume_weight_avg'] <= 8), 'volume_weight_avg'] = 8
# # # stock_df.loc[(stock_df['volume_weight_avg'] > 8) & (stock_df['volume_weight_avg'] <= 9), 'volume_weight_avg'] = 9
# # # stock_df.loc[(stock_df['volume_weight_avg'] > 9) & (stock_df['volume_weight_avg'] <= 10), 'volume_weight_avg'] = 10
# # stock_df.loc[(stock_df['volume_weight_avg'] > 6), 'volume_weight_avg'] = 7

# stock_df

In [25]:
# ticker, to volume columns.  
#vw_average is our target

# combine object columns from stock_df with proces_list into new DataFrame process_df.

In [26]:
#float_stock_df = stock_df.select_dtypes(exclude=['object'])
#float_stock_df.head()

In [27]:
# Merge encode_df features and drop originals, n means new
#processing_df = encode_df.merge(stock_df, left_index=True, right_index=True) 
#processing_df = stock_df.drop(stock_cat, axis=1)
#processing_df.head()

In [28]:
# Function to run model (run each ticker and date through model in for loop)
# def random_forest(processing_df): does not need to be a functino for 'for loop' anymore
X = stock_df.iloc[:,8:12].values
#X = pd.get_dummies(X)
    
y = stock_df.iloc[:, 11].values

#['percent_change'] as target

In [29]:
#X.describe()

In [30]:
np.unique(y)

array([   6.22,   25.84,   33.37,   34.73,   38.03,   42.13,   47.62,
         47.63,   48.7 ,   51.95,   53.9 ,   58.04,   58.38,   60.  ,
         62.07,   66.7 ,   66.96,   67.99,   71.33,   73.98,   81.5 ,
         82.8 ,   87.34,   87.65,   89.76,   90.91,   92.81,   93.73,
         94.27,   97.4 ,   99.04,  112.76,  113.18,  114.51,  115.29,
        118.91,  121.3 ,  122.48,  124.48,  125.37,  132.49,  133.13,
        134.1 ,  137.43,  141.24,  142.5 ,  153.9 ,  160.64,  161.32,
        164.4 ,  164.64,  166.53,  167.4 ,  168.58,  168.64,  169.88,
        181.86,  186.74,  186.99,  191.77,  192.91,  203.79,  214.23,
        214.59,  219.55,  227.13,  228.2 ,  232.77,  233.89,  239.49,
        273.  ,  283.16,  295.04,  295.29,  295.65,  310.27,  316.72,
        319.02,  321.76,  330.9 ,  369.23,  375.53,  391.31,  420.56,
        473.97,  509.1 ,  509.67,  510.15,  515.05,  535.81,  559.99,
        573.42,  604.73,  628.24,  638.41,  668.28,  860.  , 1091.09,
       2530.01, 2682

In [31]:
# train the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [32]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [33]:
# random forest classfier
# n_estimator default at 100
random_forest = RandomForestRegressor(n_estimators=500, max_depth=15,random_state=1)
random_forest.fit(X_train, y_train)

RandomForestRegressor(max_depth=15, n_estimators=500, random_state=1)

In [34]:
y_pred = random_forest.predict(X_test)
y_true = y_test

In [35]:
# errors = abs(y_pred - y_test)
# mape = 100 * (errors/y_test)
# accuracy = 100 - np.mean(mape)

In [36]:
print(f"r2 Score: {metrics.r2_score(y_true, y_pred)}")
print(f"mean absolute error: {metrics.mean_absolute_error(y_true, y_pred)}")
print(f"mean squared error: {metrics.mean_squared_error(y_true, y_pred)}")
#print("Accuracy:" , round(accuracy,2), '%')

r2 Score: 0.9994450878599488
mean absolute error: 5.099109230769186
mean squared error: 138.2185120149941
Accuracy: 98.61 %


In [37]:
# print(classification_report_imbalanced(y_test, y_pred))

# with open("random_forest_result.txt", "w") as external_file:
#     add_text = classification_report_imbalanced(y_test, y_pred)
#     print(add_text, file=external_file)
#     external_file.close()

In [38]:
#sorted(zip(X.columns, random_forest.feature_importances_), reverse=True)