In [1]:
#dependencies
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import db_password
import matplotlib.pyplot as plt
import psycopg2

In [2]:
# Load the joined company / price table from the local PostgreSQL instance.
# The password comes from the untracked `config` module so it never appears
# in the notebook itself.
db_name = 'Company_Stocks_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)

# Read the whole table into a DataFrame.
stock_df = pd.read_sql('select * from "company_all_star"', engine)

# Sort by ticker. sort_values returns a new frame, so the result has to be
# bound back to the name (the bare call previously discarded it).
stock_df = stock_df.sort_values(by=["ticker"])

# Show the available columns.
stock_df.columns.to_list()

['ticker',
 'date_val',
 'company_name',
 'company_url',
 'employee_count',
 'revenue',
 'sector',
 'city_name',
 'state_name',
 'region',
 'country_code',
 'latitude',
 'longitude',
 'open_val',
 'high_val',
 'low_val',
 'close_val',
 'volume',
 'volume_weight',
 'number_of_transactions',
 'percent_change']

In [None]:
# save original dataframe as .csv
# stock_df.to_csv("../resources/company_all_star.csv")
# completed 

In [3]:
# Save the original (unfiltered) dataframe as .json next to the other resources.
stock_df.to_json("../resources/company_all_star.json")

In [None]:
# --- analysis window -------------------------------------------------------
# first day to include, as yyyy-mm-dd
begin_date = '2022-01-10'
# last day to include, as yyyy-mm-dd
end_date = '2022-03-10'
# iteration control: number of distinct days spanned by one begin/end pairing
day_range_of_iter = 22

# Parse the dates into a separate series used only for filtering; the frame
# keeps the raw values in a new `date` column.
parsed_dates = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')
in_window = (parsed_dates >= begin_date) & (parsed_dates <= end_date)

# Build the filtered frame in one non-mutating chain. The previous version
# called drop(inplace=True) on a .loc slice, a chained-assignment hazard that
# the global warnings filter was hiding.
stock_df = (
    stock_df.loc[in_window]
    # preserve the date column with its original (pre-parsing) values
    .assign(date=lambda frame: frame['date_val'])
    # drop throw-aways
    .drop(columns=["longitude", "latitude", "company_name", "company_url", "date_val"])
)

stock_df.head()

In [None]:
# Drop fields that will not be used to represent a period of time.
# Each label is listed once ('number_of_transactions' used to appear twice)
# and only `columns=` is passed, since it already names the axis.
stock_df = stock_df.drop(
    columns=['number_of_transactions', 'city_name', 'state_name', 'percent_change']
)
# Lift the row-display limit for the rest of the notebook.
pd.set_option('display.max_rows', None)
stock_df.head()

In [None]:
# Size of the filtered frame along each dimension.
# dropna=False counts a missing value as its own entry, matching pd.unique.
unique_days = stock_df['date'].nunique(dropna=False)
unique_stocks = stock_df['ticker'].nunique(dropna=False)

# number of begin/end pairings that fit inside the available days
iteration_sets = unique_days - day_range_of_iter + 1

# total records captured
length_of_df = len(stock_df)

print("unique number of days(number of days in df):", unique_days)
print("no. of stocks: ", unique_stocks)
print("iteration_sets: ", iteration_sets)
print("DataFrame Length: ", length_of_df)



In [None]:
# Order rows by day, then ticker. The begin/end pairing below is purely
# positional, so it relies on every day contributing the same tickers in the
# same order.
sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
next_date_stock_df = sort_date_stock_df

# number of rows that start a window (one per ticker per iteration set)
ending_records = iteration_sets * unique_stocks
# position of the first row that ends a window
end_offset = (unique_days - iteration_sets) * unique_stocks
# row count of a complete days x tickers panel
max_records = unique_days * unique_stocks

if iteration_sets < 1:
    raise ValueError(
        f"day_range_of_iter={day_range_of_iter} exceeds the {unique_days} "
        "distinct days available in the selected date range"
    )
if len(sort_date_stock_df) != max_records:
    # Missing or duplicated (date, ticker) rows shift the positional offset,
    # which would pair a begin row with the wrong end row.
    raise ValueError(
        f"expected {max_records} rows ({unique_days} days x {unique_stocks} "
        f"tickers) but found {len(sort_date_stock_df)}; positional begin/end "
        "pairing needs exactly one row per ticker per day"
    )

# Positional slices replace the row-by-row DataFrame.append loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame one row at
# a time is quadratic.
begin_df = sort_date_stock_df.iloc[:ending_records].reset_index(drop=True)
# .copy() gives end_df its own data so later in-place edits are safe.
end_df = next_date_stock_df.iloc[end_offset:max_records].copy()

begin_df.head()

In [None]:
# # sort dataframe by date
# sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
# sort_date_stock_df

# Renumber end_df from 0 so its index lines up with begin_df's.
end_df = end_df.reset_index(drop=True)
end_df.head()

In [None]:
# Pair each begin row with the end row at the same index position; columns
# present in both frames get pandas' default _x (begin) / _y (end) suffixes.
vwa_df = begin_df.merge(end_df, left_index=True, right_index=True)

In [None]:
# Remove the begin-side date plus the end-side (_y) copies of the company
# descriptor columns, and the end-side date.
redundant_columns = [
    'date_x', 'employee_count_y', 'region_y', 'revenue_y',
    'sector_y', 'ticker_y', 'country_code_y', 'date_y',
]
vwa_df = vwa_df.drop(columns=redundant_columns)
pd.set_option('display.max_rows', None)
vwa_df.head()

In [None]:
# Percentage by which the end-of-window value (_y) sits below the
# begin-of-window value (_x): positive means it fell, negative means it rose.
#   vwa -> based on volume_weight
#   va  -> based on volume
vwa_df = vwa_df.assign(
    vwa=lambda frame: 100 - frame['volume_weight_y'] / frame['volume_weight_x'] * 100,
    va=lambda frame: 100 - frame['volume_y'] / frame['volume_x'] * 100,
)

In [None]:
# Count the distinct values in every column of the merged frame
# (a quick way of getting to know the data).
vwa_df.nunique()

In [None]:
# The ticker and the raw volume inputs are no longer needed once vwa / va
# have been derived from them.
spent_columns = ['ticker_x', 'volume_x', 'volume_weight_x', 'volume_y', 'volume_weight_y']
vwa_df = vwa_df.drop(columns=spent_columns)
pd.set_option('display.max_rows', None)
vwa_df.head()

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is
# made), so any later in-place change made through either stock_df or vwa_df
# is visible through both names.
stock_df = vwa_df

In [None]:
# Collect the names of the columns whose dtype is "object"; these are the
# categorical variables that still need encoding.
stock_categories = [
    column_name
    for column_name, column_dtype in stock_df.dtypes.items()
    if column_dtype == "object"
]
stock_categories

In [None]:
# Check the number of unique values in each categorical column.
stock_df[stock_categories].nunique()
# TODO: each category should have at most about 10 distinct values; decide how
# to reduce the larger ones (grouping by sector is one idea).

In [None]:
#stock_df.columns.tolist()

In [None]:
# Strip the merge suffix from the surviving begin-side descriptor columns.
# The rename is done in place so every name bound to this frame sees it.
column_renames = {
    "employee_count_x": "employee_count",
    "revenue_x": "revenue",
    "sector_x": "sector",
    "region_x": "region",
    "country_code_x": "country_code",
}
stock_df.rename(columns=column_renames, inplace=True)

In [None]:
#stock_df.columns.to_list()

In [None]:
# -------------------------------------- .cat.codes
# Replace each descriptor column with its integer category code.
# pandas assigns -1 to missing values when producing the codes.
# Columns are assigned one by one so the existing DataFrame object is
# modified (rather than replaced), keeping every name bound to it in sync.
for category_column in ['employee_count', 'revenue', 'sector', 'region', 'country_code']:
    stock_df[category_column] = stock_df[category_column].astype('category').cat.codes

In [None]:
#stock_df['sector'].value_counts()

In [None]:
#stock_df['country_code'].value_counts()

In [None]:
#stock_df["region"].value_counts()

In [None]:
# Drop the begin-of-window price columns from the modelling frame; the
# end-of-window (_y) price columns are left in place.
# The drop targets stock_df, the name every later cell reads, instead of
# reaching it indirectly through vwa_df. It stays in place so that any other
# name bound to the same DataFrame object sees the same columns.
stock_df.drop(columns=['open_val_x', 'high_val_x', 'low_val_x', 'close_val_x'], inplace=True)

In [None]:
stock_df.head()
# vwa = 100 - volume_weight_y / volume_weight_x * 100  (percent change in volume weight)
# va  = 100 - volume_y / volume_x * 100                (percent change in raw volume)

In [None]:
# Check the volume-weight-average buckets: how often each distinct vwa value occurs.
vwa_counts = stock_df['vwa'].value_counts()
vwa_counts

In [None]:
# Remove the columns that are not used as model inputs. Done in place so the
# existing DataFrame object is the one that changes.
stock_df.drop(columns=["country_code", "employee_count", "va"], inplace=True)


In [None]:
# Preview the frame that feeds the feature / target split.
stock_df.head()

In [None]:
# target: the vwa column
y = stock_df["vwa"]

# features: every remaining column
X = stock_df.drop(columns=["vwa"])
# X = pd.get_dummies(X)


In [None]:
# not available for multiple features
# X.describe()

In [None]:
# List the distinct target values (np.unique returns them sorted).
np.unique(y)

In [None]:
# Split features and target into train and test partitions. The split is
# seeded so it is reproducible; no test_size is passed, so scikit-learn's
# default split ratio applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Learn the scaling parameters from the training features only, then apply
# that same transformation to both partitions.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Random forest regression of vwa on the scaled features.
# n_estimators defaults to 100; 3000 trees are used here.
# `criterion` is left at its default, the squared-error criterion that "mse"
# used to select. The "mse" spelling was deprecated in scikit-learn 1.0 and
# removed in 1.2, so passing it fails on current versions.
random_forest = RandomForestRegressor(n_estimators=3000, max_depth=15, random_state=1)

random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

In [None]:
# Compute MSE once and derive RMSE from it. The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# while the square root works on every version.
mse = metrics.mean_squared_error(y_test, y_pred)
rms = np.sqrt(mse)

# Echo the run parameters so the scores below can be tied to a date window.
print("begin_date: ", begin_date)
print("end_date: ", end_date)
print("day interval: ", day_range_of_iter,"\n")

print(f"r2 Score: {round(metrics.r2_score(y_test, y_pred)*100,2)} %")
print(f"mean absolute error: {metrics.mean_absolute_error(y_test, y_pred)}")
print(f"mean squared error: {mse}")
print(f"root mean squared error: {rms}")
#print(f"mean absolute percentage error: {metrics.mean_absolute_percentage_error(y_test,y_pred)}")


In [None]:
# errors = abs(y_pred - y_test)
# mape = 100 * (errors/y_test)
# accuracy = 100 - np.mean(mape)
# print("Accuracy:" , round(accuracy,2), '%')
# print(np.mean(mape))

In [None]:
#10/04/2021 - 12/27/2021