In [1]:
# KERNEL: mlenv

# Import dependencies
import warnings
warnings.filterwarnings("ignore")

In [2]:
#dependencies
import numpy as np
import pandas as pd
#from pathlib import Path
#from collections import Counter
from sklearn.metrics import balanced_accuracy_score
#from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import db_password
import psycopg2

In [3]:
# Load the input table from PostgreSQL
# Connection settings for the local database; db_password is imported from config
db_name = 'Company_Stocks_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)

# Read every row of the "view_company_all_star" view into a DataFrame
view_query = "select * from \"view_company_all_star\""
stock_df = pd.read_sql(view_query, engine)

# Preview the first rows
stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,country_code,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [4]:
# # Test Data for Initial Model
# stock_df = pd.read_csv("../Queries/Testing_Files/company_all_star.csv")
# stock_df.head()

In [5]:
# Check the dtype of every column before modelling.
# In the recorded output the descriptive columns are object and the
# price/volume columns are float64.
stock_df.dtypes

ticker                     object
date_val                   object
company_name               object
company_url                object
employee_count             object
revenue                    object
sector                     object
city_name                  object
state_name                 object
country_code               object
latitude                  float64
longitude                 float64
open_val                  float64
high_val                  float64
low_val                   float64
close_val                 float64
volume                    float64
volume_weight             float64
number_of_transactions    float64
percent_change            float64
dtype: object

In [6]:
# NOTE: the cast below is disabled. It was written on the assumption that the model needs integer inputs, and would have converted every float64 column to int.
# stock_df[[
#     "latitude", 
#     "longitude", 
#     "open_val", 
#     "high_val", 
#     "low_val", 
#     "close_val", 
#     "volume",
#     "volume_weight",
#     "number_of_transactions",
#     "percent_change"]] = stock_df[[
#                                 "latitude", 
#                                 "longitude",
#                                 "open_val", 
#                                 "high_val", 
#                                 "low_val", 
#                                 "close_val", 
#                                 "volume",
#                                 "volume_weight",
#                                 "number_of_transactions",
#                                 "percent_change"
#                                 ]].astype("int")

In [7]:
# Re-check the dtypes. The .astype("int") cast in the previous cell is
# commented out, so the numeric columns are expected to remain float64.
stock_df.dtypes

ticker                     object
date_val                   object
company_name               object
company_url                object
employee_count             object
revenue                    object
sector                     object
city_name                  object
state_name                 object
country_code               object
latitude                  float64
longitude                 float64
open_val                  float64
high_val                  float64
low_val                   float64
close_val                 float64
volume                    float64
volume_weight             float64
number_of_transactions    float64
percent_change            float64
dtype: object

In [8]:
# Convert "date_val" to plain strings so it can be concatenated with the
# ticker symbol in the next cell.
stock_df['date_val'] = stock_df['date_val'].astype(str)

In [9]:
# Combine ticker and date into one "<ticker> <date>" label column,
# e.g. "AMD 2020-03-12".
stock_df["ticker_date"] = stock_df["ticker"] + " " + stock_df["date_val"]

In [10]:
#stock_df["ticker_date"].astype("float")

In [11]:
# Remove the identifier and geolocation columns that are not used as features.
# NOTE: the drop is in place, so re-running this cell without reloading
# stock_df fails because the columns are already gone.
columns_to_drop = [
    "ticker",
    "date_val",
    "longitude",
    "latitude",
    "company_name",
    "company_url",
]
stock_df.drop(columns=columns_to_drop, inplace=True)

# Move the combined label to the front of the frame
ticker_date = stock_df.pop("ticker_date")
stock_df.insert(loc=0, column="ticker_date", value=ticker_date)

stock_df.head()

Unnamed: 0,ticker_date,employee_count,revenue,sector,city_name,state_name,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD 2020-03-12,5k-10k,over-1b,Technology,Santa Clara,CA,US,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD 2020-03-15,5k-10k,over-1b,Technology,Santa Clara,CA,US,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD 2020-03-16,5k-10k,over-1b,Technology,Santa Clara,CA,US,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD 2020-03-17,5k-10k,over-1b,Technology,Santa Clara,CA,US,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD 2020-03-18,5k-10k,over-1b,Technology,Santa Clara,CA,US,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [12]:
# SKIP THIS BUT COULD BE USED FOR REFERENCE
# stock_AMD = stock_df.loc[stock_df["ticker"] == "AMD"]
# stock_AMD

In [14]:
# Re-inspect the dtypes after dropping and reordering columns.
stock_df.dtypes
# TODO: decide whether the remaining object columns need to be bucketed.

ticker_date                object
employee_count             object
revenue                    object
sector                     object
city_name                  object
state_name                 object
country_code               object
open_val                  float64
high_val                  float64
low_val                   float64
close_val                 float64
volume                    float64
volume_weight             float64
number_of_transactions    float64
percent_change            float64
dtype: object

In [15]:
# Function to prepare one batch of records for the model
# (intended to be called once per ticker from a loop)
def random_forest(processing_df):
    """Split a stock DataFrame into model features and target.

    Parameters
    ----------
    processing_df : pandas.DataFrame
        Records to prepare. Must contain a "ticker_date" column; every other
        column is treated as a feature.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        ``X``: all columns except "ticker_date", with non-numeric columns
        one-hot encoded by ``pd.get_dummies``.
        ``y``: the "ticker_date" column.

    Notes
    -----
    The previous version ignored ``processing_df``, read the global
    ``stock_df`` instead, and returned nothing.

    NOTE(review): a later cell rebinds the name ``random_forest`` to a
    classifier instance, which shadows this function once that cell runs.
    """
    # Use the frame that was passed in, not the notebook-level stock_df
    X = pd.get_dummies(processing_df.drop(columns=["ticker_date"]))
    y = processing_df["ticker_date"]
    return X, y

In [16]:
# Target: the combined "<ticker> <date>" label
# NOTE(review): the value_counts() output further down shows each label
# occurring exactly once, so this target has one class per row.
y = stock_df["ticker_date"]

# Features: every other column, with the object columns one-hot encoded
X = pd.get_dummies(stock_df.drop(columns="ticker_date"))

# need to combine ticker and date values into one column in format below
# "AMD 03.17.2022" 

# THEN

# DROP long, lat, company_name, url, ticker & date (once columns are combined)
# bucketing for employee_count and revenue

In [17]:
# One dataset for geolocation and another for stock behaviour? run each through the model?

# for each record in the dataframe:
#     move records ticker into new_ticker
#     if new_ticker = prev_ticker:
#         move the ticker value to previous value
#         and...
#         move the record to the processing_dataframe
#     else: # when the new_ticker and the previous_ticker aren't the same,
#           # we have all records in the processing_df for a given ticker
#         call the function that will do the Gradient Boosting Decision Tree Algorithm (GBDT)
# white_check_mark
# eyes
# raised_hands::skin-tone-4





# 9:01
# you have to use the CSV file with only the tickers in it
# New
# 9:03
# # you have to make all the columns a number
# # what can you use to do this? I forget
# #after you have all columns as numbers, then do the following:
# #sort the dataframe by ticker column
# #prime prev_ticker with first record's ticker value in the dataframe
# for each record in the dataframe:
#     move records ticker into new_ticker
#     if new_ticker = prev_ticker:
#         move the ticker value to previous value
#         and...
#         move the record to the processing_dataframe
#     else: # when the new_ticker and the previous_ticker aren't the same,
#           # we have all records in the processing_df for a given ticker
#         call the function that will do the Gradient Boosting Decision Tree Algorithm (GBDT)

In [30]:
# FOR LOOP
# prime previous ticker as variable: 

# prev_ticker = stock_df[ticker]
#     for record in stock_df:
#         new_ticker = stock_df[ticker]
        
#         if (new_ticker == prev_ticker):
#             prev_ticker = stock_df[ticker]
#             processing_df.append(record)
#         else
#             def model_random_forest(processing_df):

In [19]:
# Summary statistics for the encoded feature matrix (the numeric columns
# plus the one-hot indicator columns produced by get_dummies).
X.describe()

Unnamed: 0,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change,employee_count_1k-5k,employee_count_500-1k,...,state_name_WA,state_name_WI,country_code_Argentina,country_code_Australia,country_code_CA,country_code_CH,country_code_CN,country_code_Netherlands,country_code_UK,country_code_US
count,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,...,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0
mean,297.455243,301.673835,293.013166,297.388236,7683480.0,297.402374,74908.34,1.522338,0.131934,0.039627,...,0.069348,0.009907,0.009907,0.009907,0.009907,0.019814,0.009907,0.019814,0.009907,0.910839
std,480.512346,486.195147,474.296604,480.188942,17281080.0,480.302373,133412.1,1.581511,0.338422,0.195084,...,0.254047,0.09904,0.09904,0.09904,0.09904,0.139361,0.09904,0.139361,0.09904,0.284979
min,4.38,4.6,4.11,4.44,69543.0,4.4255,2794.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,82.93,84.12,81.543475,82.917,1261210.0,82.909075,22086.0,0.486388,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,155.375,158.0,152.77,155.615,2626642.0,155.428,36480.0,1.07412,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,302.82,308.075,298.055,302.90125,6403628.0,302.90735,68993.25,2.017073,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,3744.0,3773.0782,3696.7929,3731.41,401693400.0,3722.7632,2966979.0,40.454186,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# Count how often each target label occurs.
# NOTE(review): in the recorded output every "<ticker> <date>" label appears
# exactly once, i.e. there is one class per row.
y.value_counts()
# the second part of each label is the date


AMD 2020-03-12     1
MU 2020-11-10      1
MU 2020-11-12      1
MU 2020-11-15      1
MU 2020-11-16      1
                  ..
DXCM 2020-04-02    1
DXCM 2020-04-05    1
DXCM 2020-04-06    1
DXCM 2020-04-07    1
ZS 2022-03-10      1
Name: ticker_date, Length: 50874, dtype: int64

In [21]:
# Distinct target labels, returned by np.unique as a sorted array.
np.unique(y)

array(['AAPL 2020-03-12', 'AAPL 2020-03-15', 'AAPL 2020-03-16', ...,
       'ZS 2022-03-08', 'ZS 2022-03-09', 'ZS 2022-03-10'], dtype=object)

In [22]:
# Split the features and target into training and test sets.
# random_state is fixed so the split is reproducible; no test_size is given,
# so scikit-learn's default split proportions apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [23]:
# Balanced random forest classifier
# n_estimators is set explicitly to 100; random_state is fixed for reproducibility.
# NOTE(review): this assignment rebinds `random_forest`, shadowing the function
# of the same name defined in an earlier cell.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# fit did not complete in the saved run.
forest_params = {"n_estimators": 100, "random_state": 1}
random_forest = BalancedRandomForestClassifier(**forest_params)
random_forest.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Confusion matrix for the test-set predictions.
# confusion_matrix is imported here because its import in the dependencies
# cell is commented out; without this line the call raises NameError.
# NOTE: the matrix has one row and column per distinct label, so it is gigantic.
from sklearn.metrics import confusion_matrix

y_pred = random_forest.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Balanced accuracy of the test-set predictions.
balanced_accuracy_score(y_test, y_pred)
# TODO: store the score (metrics), THEN
# empty the processing_df data frame to start the for loop again

In [None]:
# Print the imbalanced-learn classification report for the test-set predictions.
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# Rank the features by importance, most important first.
# The sort key is the importance (second item of each pair); without it the
# pairs are ordered by column name, not by importance.
sorted(
    zip(X.columns, random_forest.feature_importances_),
    key=lambda name_and_importance: name_and_importance[1],
    reverse=True,
)