In [1]:
#dependencies
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import db_password
import matplotlib.pyplot as plt
import psycopg2

In [2]:
# GET Tabled input
# creating database engine
db_name = 'Company_Stocks_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)
# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);
# sort the dataframe by ticker column
stock_df.sort_values(by=["ticker"])
# Print the DataFrame
stock_df.columns.to_list()
# ticker, date_val, region, open_val, high_val, low_val, close_val, volume_weight

['ticker',
 'date_val',
 'company_name',
 'company_url',
 'employee_count',
 'revenue',
 'sector',
 'city_name',
 'state_name',
 'region',
 'country_code',
 'latitude',
 'longitude',
 'open_val',
 'high_val',
 'low_val',
 'close_val',
 'volume',
 'volume_weight',
 'number_of_transactions',
 'percent_change']

In [3]:
# save original dataframe as .csv
# stock_df.to_csv("../resources/company_all_star.csv")
# completed 

In [4]:
# save original datafrom as .json
#stock_df.to_json("../resources/company_all_star.json")

In [5]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-01-10'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'
# iteration controls
day_range_of_iter = 22

# Convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop throw-aways 
stock_df.drop(["longitude", "latitude", "company_name", "company_url","date_val"], axis=1, inplace=True)

stock_df.head()

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change,date
462,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,133.45,138.99,131.59,137.31,77153906.0,136.1279,499284.0,2.892469,2022-01-10
463,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,138.585,140.57,135.77,137.47,69669598.0,138.0222,458692.0,0.80456,2022-01-11
464,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,139.44,141.25,131.81,132.74,76618059.0,135.7041,535482.0,4.804934,2022-01-12
465,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,131.68,137.0,131.43,136.88,71078438.0,135.0553,466893.0,3.948967,2022-01-13
466,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,134.5,136.3893,131.59,131.93,56589456.0,133.7456,426778.0,1.910781,2022-01-17


In [6]:
# drop fields that will not be used to represent a period of time
stock_df.drop(columns = ['number_of_transactions', 'city_name', 'state_name', 
                         'number_of_transactions', 'percent_change'], 
                          axis=1, inplace=True)
pd.set_option('display.max_rows', None)
stock_df.head()

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,date
462,AMD,5k-10k,over-1b,Technology,W,US,133.45,138.99,131.59,137.31,77153906.0,136.1279,2022-01-10
463,AMD,5k-10k,over-1b,Technology,W,US,138.585,140.57,135.77,137.47,69669598.0,138.0222,2022-01-11
464,AMD,5k-10k,over-1b,Technology,W,US,139.44,141.25,131.81,132.74,76618059.0,135.7041,2022-01-12
465,AMD,5k-10k,over-1b,Technology,W,US,131.68,137.0,131.43,136.88,71078438.0,135.0553,2022-01-13
466,AMD,5k-10k,over-1b,Technology,W,US,134.5,136.3893,131.59,131.93,56589456.0,133.7456,2022-01-17


In [7]:
# unique days in df
unique_days = len(pd.unique(stock_df['date']))
print("unique number of days(number of days in df):", unique_days)

# unique stocks in df
unique_stocks = len(pd.unique(stock_df['ticker']))
print("no. of stocks: ", unique_stocks)

# interation sets
iteration_sets = (unique_days - day_range_of_iter + 1)
print("iteration_sets: ", iteration_sets)

# total records captured
length_of_df = len(stock_df)
print("DataFrame Length: ", length_of_df)


unique number of days(number of days in df): 42
no. of stocks:  102
iteration_sets:  21
DataFrame Length:  4284


In [8]:
# sort dataframe by date
sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
next_date_stock_df = sort_date_stock_df

# get beginning dataframe records
b = 0 
# ending record for beginning df
ending_records = iteration_sets * unique_stocks

# starting record for end
x = (unique_days - iteration_sets) * unique_stocks
max_records = unique_days * unique_stocks

begin_df = pd.DataFrame()
end_df = pd.DataFrame()
               
for rec in sort_date_stock_df.iterrows():
    
    if b < ending_records:
        new_begin_df = sort_date_stock_df.iloc[b]
        begin_df = begin_df.append(new_begin_df,ignore_index=False)
    
    if x < max_records: 
        new_end_df = next_date_stock_df.iloc[x]
        end_df = end_df.append(new_end_df,ignore_index=False)
    b=b+1
    x=x+1
   
begin_df.reset_index(drop=True,inplace=True)
begin_df.head(15)

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,date
0,AAPL,over-10k,over-1b,Technology,W,US,172.32,175.18,170.82,175.08,76138312.0,173.6337,2022-01-10
1,ABNB,5k-10k,200m-1b,Technology,W,US,160.71,169.39,159.0776,168.61,5331629.0,165.8285,2022-01-10
2,ADBE,over-10k,1m-10m,Technology,W,US,525.5533,535.18,520.9821,529.89,3422161.0,529.9001,2022-01-10
3,ADI,over-10k,over-1b,Technology,SE,US,168.77,174.03,168.46,173.63,3789327.0,172.4173,2022-01-10
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,233.66,235.42,230.1,235.28,1292976.0,233.7077,2022-01-10
5,ADSK,over-10k,over-1b,Healthcare,W,US,260.47,271.99,260.0,270.63,1373111.0,269.1642,2022-01-10
6,AEP,over-10k,over-1b,Energy,MW,US,90.27,90.63,89.18,89.77,2231603.0,89.6846,2022-01-10
7,ALGN,over-10k,200m-1b,Technology,SW,US,547.45,561.56,535.32,559.64,1048046.0,554.6396,2022-01-10
8,AMAT,over-10k,over-1b,Technology,W,US,148.38,152.66,146.2727,152.45,6824370.0,150.3402,2022-01-10
9,AMD,5k-10k,over-1b,Technology,W,US,133.45,138.99,131.59,137.31,77153906.0,136.1279,2022-01-10


In [9]:
# # sort dataframe by date
# sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
# sort_date_stock_df

end_df.reset_index(drop=True,inplace=True)
end_df.head()

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,date
0,AAPL,over-10k,over-1b,Technology,W,US,174.14,175.48,171.55,172.12,90715899.0,173.5408,2022-02-09
1,ABNB,5k-10k,200m-1b,Technology,W,US,165.245,177.06,165.245,171.95,8261379.0,173.4198,2022-02-09
2,ADBE,over-10k,1m-10m,Technology,W,US,508.08,514.4799,492.28,495.02,4735802.0,501.5645,2022-02-09
3,ADI,over-10k,over-1b,Technology,SE,US,163.82,168.33,161.03,161.67,3462561.0,163.3178,2022-02-09
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,207.2469,208.31,204.18,205.54,2270554.0,205.981,2022-02-09


In [10]:
vwa_df = pd.merge(begin_df, end_df, left_index=True, right_index=True)

In [11]:
# drop fields that will not be used to represent a period of time
vwa_df.drop(columns = ['date_x', 'employee_count_y', 'region_y', 'revenue_y', 'sector_y', 'ticker_y', 'country_code_y', 'date_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df.head()

Unnamed: 0,ticker_x,employee_count_x,revenue_x,sector_x,region_x,country_code_x,open_val_x,high_val_x,low_val_x,close_val_x,volume_x,volume_weight_x,open_val_y,high_val_y,low_val_y,close_val_y,volume_y,volume_weight_y
0,AAPL,over-10k,over-1b,Technology,W,US,172.32,175.18,170.82,175.08,76138312.0,173.6337,174.14,175.48,171.55,172.12,90715899.0,173.5408
1,ABNB,5k-10k,200m-1b,Technology,W,US,160.71,169.39,159.0776,168.61,5331629.0,165.8285,165.245,177.06,165.245,171.95,8261379.0,173.4198
2,ADBE,over-10k,1m-10m,Technology,W,US,525.5533,535.18,520.9821,529.89,3422161.0,529.9001,508.08,514.4799,492.28,495.02,4735802.0,501.5645
3,ADI,over-10k,over-1b,Technology,SE,US,168.77,174.03,168.46,173.63,3789327.0,172.4173,163.82,168.33,161.03,161.67,3462561.0,163.3178
4,ADP,over-10k,over-1b,Consumer Discretionary,NE,US,233.66,235.42,230.1,235.28,1292976.0,233.7077,207.2469,208.31,204.18,205.54,2270554.0,205.981


In [12]:
vwa_df['vwa'] = 100 - vwa_df['volume_weight_y']/vwa_df['volume_weight_x'] * 100
vwa_df['va'] = 100 - vwa_df['volume_y']/vwa_df['volume_x'] * 100

In [13]:
# unique values for each column (getting to know your data)
vwa_df.nunique()

ticker_x             102
employee_count_x       4
revenue_x              6
sector_x               9
region_x              11
country_code_x         8
open_val_x          2076
high_val_x          2092
low_val_x           2100
close_val_x         2080
volume_x            2142
volume_weight_x     2141
open_val_y          2074
high_val_y          2082
low_val_y           2087
close_val_y         2076
volume_y            2142
volume_weight_y     2141
vwa                 2142
va                  2142
dtype: int64

In [14]:
vwa_df.drop(columns = ['ticker_x', 'volume_x', 'volume_weight_x', 'volume_y', 'volume_weight_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df.head()

Unnamed: 0,employee_count_x,revenue_x,sector_x,region_x,country_code_x,open_val_x,high_val_x,low_val_x,close_val_x,open_val_y,high_val_y,low_val_y,close_val_y,vwa,va
0,over-10k,over-1b,Technology,W,US,172.32,175.18,170.82,175.08,174.14,175.48,171.55,172.12,0.053503,-19.146191
1,5k-10k,200m-1b,Technology,W,US,160.71,169.39,159.0776,168.61,165.245,177.06,165.245,171.95,-4.577802,-54.950373
2,over-10k,1m-10m,Technology,W,US,525.5533,535.18,520.9821,529.89,508.08,514.4799,492.28,495.02,5.347348,-38.3863
3,over-10k,over-1b,Technology,SE,US,168.77,174.03,168.46,173.63,163.82,168.33,161.03,161.67,5.277603,8.623325
4,over-10k,over-1b,Consumer Discretionary,NE,US,233.66,235.42,230.1,235.28,207.2469,208.31,204.18,205.54,11.863837,-75.606817


In [15]:
stock_df = vwa_df

In [16]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = stock_df.dtypes[stock_df.dtypes == "object"].index.tolist()
stock_categories

['employee_count_x', 'revenue_x', 'sector_x', 'region_x', 'country_code_x']

In [17]:
# Checking the number of unique values in each column
stock_df[stock_categories].nunique()
# there needs to be only 10 at most in each categorie, how are we going to make this smaller...by sector ???b

employee_count_x     4
revenue_x            6
sector_x             9
region_x            11
country_code_x       8
dtype: int64

In [18]:
#stock_df.columns.tolist()

In [19]:
stock_df.rename(columns={"employee_count_x": "employee_count", "revenue_x":"revenue", "sector_x":"sector",
            "region_x":"region", "country_code_x":"country_code"},inplace=True)

In [20]:
#stock_df.columns.to_list()

In [21]:
# -------------------------------------- .cat.codes

stock_df["employee_count"].astype('category').cat.codes
stock_df['employee_count'] = stock_df['employee_count'].astype('category').cat.codes
stock_df['revenue'] = stock_df['revenue'].astype('category').cat.codes
stock_df['sector'] = stock_df['sector'].astype('category').cat.codes
stock_df['region'] = stock_df['region'].astype('category').cat.codes
# stock_df['city_name'] = stock_df['city_name'].astype('category').cat.codes
# stock_df['state_name'] = stock_df['state_name'].astype('category').cat.codes
stock_df['country_code'] = stock_df['country_code'].astype('category').cat.codes

In [22]:
#stock_df['sector'].value_counts()

In [23]:
#stock_df['country_code'].value_counts()

In [24]:
#stock_df["region"].value_counts()

In [25]:
vwa_df.drop(columns = ['open_val_x', 'high_val_x', 'low_val_x', 'close_val_x'], axis=1, inplace=True)

In [26]:
stock_df.head()
# vwa =  percent change volume weight,  va = volume weight

Unnamed: 0,employee_count,revenue,sector,region,country_code,open_val_y,high_val_y,low_val_y,close_val_y,vwa,va
0,3,5,7,10,7,174.14,175.48,171.55,172.12,0.053503,-19.146191
1,2,3,7,10,7,165.245,177.06,165.245,171.95,-4.577802,-54.950373
2,3,2,7,10,7,508.08,514.4799,492.28,495.02,5.347348,-38.3863
3,3,5,7,8,7,163.82,168.33,161.03,161.67,5.277603,8.623325
4,3,5,1,5,7,207.2469,208.31,204.18,205.54,11.863837,-75.606817


In [27]:
# Check volumne weight average buckets
vwa_counts = stock_df['vwa'].value_counts()
vwa_counts

 0.053503     1
 7.633963     1
 2.168708     1
 4.410728     1
 4.608011     1
-5.859193     1
 14.830037    1
-0.574421     1
 2.853225     1
 13.417651    1
 1.602273     1
 3.907807     1
 1.197366     1
 21.325670    1
 2.835479     1
 1.731777     1
 3.353108     1
 37.706725    1
-0.874632     1
 6.834244     1
 4.556796     1
 5.201171     1
-16.258750    1
 5.713568     1
 5.888017     1
 5.861016     1
-2.207489     1
 2.802099     1
 4.637765     1
 5.275388     1
 1.370242     1
-0.930239     1
-1.938799     1
 0.535185     1
 5.494266     1
 5.866351     1
 9.679844     1
-12.795001    1
 8.496366     1
 35.356510    1
 5.662050     1
 25.235206    1
 10.083992    1
 0.985734     1
 1.891986     1
 13.219924    1
-10.024634    1
-3.088155     1
-2.524819     1
 1.722566     1
-2.546875     1
-1.753468     1
-7.175365     1
 5.616211     1
-5.581465     1
 5.299523     1
 3.884534     1
-2.318193     1
 14.599444    1
 13.697197    1
 7.729163     1
 2.932340     1
-4.18545

In [28]:
stock_df.drop(columns=["country_code", "employee_count", "va"], axis=1, inplace=True)


In [29]:
stock_df.head()

Unnamed: 0,revenue,sector,region,open_val_y,high_val_y,low_val_y,close_val_y,vwa
0,5,7,10,174.14,175.48,171.55,172.12,0.053503
1,3,7,10,165.245,177.06,165.245,171.95,-4.577802
2,2,7,10,508.08,514.4799,492.28,495.02,5.347348
3,5,7,8,163.82,168.33,161.03,161.67,5.277603
4,5,1,5,207.2469,208.31,204.18,205.54,11.863837


In [30]:
# create features 
X = stock_df.drop(columns=["vwa"])
# X = pd.get_dummies(X)

# create target
y = stock_df["vwa"]


In [31]:
# not available for multiple features
# X.describe()

In [32]:
np.unique(y)

array([-28.2914014 , -26.73731426, -26.60389507, ...,  47.71142977,
        47.81819094,  48.14522789])

In [33]:
# train the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [34]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [40]:
# random forest regression
# n_estimator default at 100

#random_forest = RandomForestRegressor(n_estimators=2500, 
#                                      max_depth=15, 
#                                      min_weight_fraction_leaf=0, 
#                                      criterion="mse",
#                                      bootstrap=False,
#                                      max_features=(0,5),
#                                      random_state=1)

random_forest = RandomForestRegressor(n_estimators=3500, max_depth=20, random_state=1, criterion="mse")

random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

In [41]:
rms = metrics.mean_squared_error(y_test, y_pred, squared=False)

print("begin_date: ", begin_date)
print("end_date: ", end_date)
print("day interval: ", day_range_of_iter,"\n")

print(f"r2 Score: {round(metrics.r2_score(y_test, y_pred)*100,2)} %")
print(f"mean absolute error: {metrics.mean_absolute_error(y_test, y_pred)}")
print(f"mean squared error: {metrics.mean_squared_error(y_test, y_pred)}")
print(f"root mean squared error: {rms}")
#print(f"mean absolute percentage error: {metrics.mean_absolute_percentage_error(y_test,y_pred)}")


begin_date:  2022-01-10
end_date:  2022-03-10
day interval:  22 

r2 Score: 68.76 %
mean absolute error: 3.63623531246166
mean squared error: 29.585896555551106
root mean squared error: 5.439291916743493


In [42]:
errors = abs(y_pred - y_test)
mape = 100 * (errors/y_test)
accuracy = 100 - np.mean(mape)
print("Accuracy:" , round(accuracy,2), '%')

Accuracy: 69.13 %


In [38]:
#10/04/2021 - 12/27/2021