In [1]:
# KERNEL: mlenv

# Import dependencies
import warnings
warnings.filterwarnings("ignore")

In [2]:
# dependencies
from collections import Counter
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

from config import db_password

In [3]:
# Load the input table from PostgreSQL.
# Build the connection URL for the local PostgreSQL server.
db_name = 'Company_Stocks_DB'
# Percent-encode the password so reserved URL characters (such as "@", "/",
# ":" or "%") cannot break or be misread in the connection string.
encoded_password = quote(db_password, safe="")
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)
# Read the whole "view_company_all_star" view into a DataFrame.
stock_df = pd.read_sql('select * from "view_company_all_star"', engine)
# Preview the first rows.
stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,country_code,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [4]:
# # Alternative data load from the local test CSV (disabled); uncomment to use the test file for stock_df
# stock_df = pd.read_csv("../Queries/Testing_Files/company_all_star.csv")
# stock_df.head(30)

In [5]:
# Show every column name as a plain Python list.
list(stock_df.columns)

['ticker',
 'date_val',
 'company_name',
 'company_url',
 'employee_count',
 'revenue',
 'sector',
 'city_name',
 'state_name',
 'country_code',
 'latitude',
 'longitude',
 'open_val',
 'high_val',
 'low_val',
 'close_val',
 'volume',
 'volume_weight',
 'number_of_transactions',
 'percent_change']

In [6]:
# Inspect the dtype pandas assigned to each column.
stock_df.dtypes

ticker                     object
date_val                   object
company_name               object
company_url                object
employee_count             object
revenue                    object
sector                     object
city_name                  object
state_name                 object
country_code               object
latitude                  float64
longitude                 float64
open_val                  float64
high_val                  float64
low_val                   float64
close_val                 float64
volume                    float64
volume_weight             float64
number_of_transactions    float64
percent_change            float64
dtype: object

In [10]:
# Cast the numeric columns to 64-bit integers.
# This is needed for the classification target (volume_weight), which has to
# be discrete rather than continuous; the feature columns are cast along with it.
# NOTE(review): the cast truncates toward zero (e.g. 0.946776 -> 0), so the
# fractional part of every listed column is discarded. Consider casting only
# the target and leaving the features as floats.
numeric_columns = [
    "latitude",
    "longitude",
    "open_val",
    "high_val",
    "low_val",
    "close_val",
    "volume",
    "volume_weight",
    "number_of_transactions",
    "percent_change",
]
# "int64" is spelled out because plain "int" follows the platform integer
# width, which is 32-bit on Windows with NumPy releases before 2.0.
stock_df[numeric_columns] = stock_df[numeric_columns].astype("int64")

In [12]:
# Confirm the .astype() cast took effect: the numeric columns should now report an integer dtype.
stock_df.dtypes

ticker                    object
date_val                  object
company_name              object
company_url               object
employee_count            object
revenue                   object
sector                    object
city_name                 object
state_name                object
country_code              object
latitude                   int64
longitude                  int64
open_val                   int64
high_val                   int64
low_val                    int64
close_val                  int64
volume                     int64
volume_weight              int64
number_of_transactions     int64
percent_change             int64
dtype: object

In [13]:
# Preview up to the first 30 rows of the frame.
stock_df.head(30)

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,country_code,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37,-121,42,43,39,43,86689681,41,381223,4
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37,-121,39,43,38,38,84545868,41,374962,0
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37,-121,40,42,38,41,92741881,41,434519,4
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37,-121,39,41,36,39,106949287,39,591862,1
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37,-121,39,41,37,39,88939024,40,396388,0
5,AMD,2020-03-19,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37,-121,41,42,39,39,106859502,41,533411,4
6,AMD,2020-03-22,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37,-121,40,42,38,41,101704663,40,493186,2
7,AMD,2020-03-23,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37,-121,44,46,43,46,106794151,45,535460,4
8,AMD,2020-03-24,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37,-121,46,47,44,44,93760389,46,488900,4
9,AMD,2020-03-25,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37,-121,45,47,45,47,73915608,46,379715,3


In [14]:
# TODO: decide which features to keep and which to drop.

# TODO: drop the longitude and latitude columns.

In [15]:
# Target: the volume_weight column.
y = stock_df["volume_weight"]

# Features: every remaining column, with the text columns one-hot encoded.
# NOTE(review): get_dummies expands all object-typed columns, which here
# includes date_val and the company identifier columns.
feature_frame = stock_df.drop(columns=["volume_weight"])
X = pd.get_dummies(feature_frame)

In [16]:
# Open question: build one dataset for geolocation and another for stock behaviour, then run each through the model?

In [17]:
# Summary statistics for the encoded feature matrix.
X.describe()

Unnamed: 0,latitude,longitude,open_val,high_val,low_val,close_val,volume,number_of_transactions,percent_change,ticker_AAPL,...,state_name_WA,state_name_WI,country_code_Argentina,country_code_Australia,country_code_CA,country_code_CH,country_code_CN,country_code_Netherlands,country_code_UK,country_code_US
count,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,...,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0
mean,37.089063,-90.437846,296.988167,301.174765,292.553426,296.896765,7683480.0,74908.34,1.066222,0.009907,...,0.069348,0.009907,0.009907,0.009907,0.009907,0.019814,0.009907,0.019814,0.009907,0.910839
std,11.113165,51.767554,480.529339,486.203403,474.318435,480.192698,17281080.0,133412.1,1.564538,0.09904,...,0.254047,0.09904,0.09904,0.09904,0.09904,0.139361,0.09904,0.139361,0.09904,0.284979
min,-34.0,-122.0,4.0,4.0,4.0,4.0,69543.0,2794.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37.0,-121.0,82.0,84.0,81.0,82.0,1261210.0,22086.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,37.0,-111.0,155.0,158.0,152.0,155.0,2626642.0,36480.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,41.0,-77.0,302.0,308.0,298.0,302.0,6403628.0,68993.25,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,52.0,150.0,3744.0,3773.0,3696.0,3731.0,401693400.0,2966979.0,40.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
# Count how many rows carry each distinct target value.
y.value_counts()

53      345
6       318
55      302
54      300
52      292
       ... 
2442      1
2379      1
2448      1
2446      1
850       1
Name: volume_weight, Length: 2423, dtype: int64

In [19]:
# Sorted array of the distinct target values.
np.unique(y)

array([   4,    5,    6, ..., 3720, 3721, 3722])

In [20]:
# Split features and target into training and test sets.
# test_size=0.25 is the library default, written out here; the fixed seed
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [21]:
# Balanced random forest classifier.
# 100 trees (the library default, stated explicitly) with a fixed seed.
forest_params = {"n_estimators": 100, "random_state": 1}
random_forest = BalancedRandomForestClassifier(**forest_params)
# Fit on the training split; the fitted estimator is the cell's output.
random_forest.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [None]:
# Predict labels for the held-out test split.
y_pred = random_forest.predict(X_test)
# Confusion matrix of true vs. predicted labels.
# The matrix is gigantic.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Balanced accuracy of the test-set predictions.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Build the imbalanced classification report, then print it.
imbalanced_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(imbalanced_report)

In [None]:
# Rank the features from most to least important.
# The sort key is the importance score (second tuple element); sorting the
# bare (name, score) tuples would order them by column name instead.
sorted(
    zip(X.columns, random_forest.feature_importances_),
    key=lambda name_score: name_score[1],
    reverse=True,
)