In [6]:
# KERNEL: mlenv

# Import dependencies
import warnings
warnings.filterwarnings("ignore")

In [7]:
# pip install psycopg2

In [8]:
#dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import db_password

In [9]:
# # GET Tabled input
# # creating database engine
# db_name = 'Company_Stocks_DB'
# db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
# engine = create_engine(db_string)
# # read data from PostgreSQL database table and load into Dataframe instance
# stock_df = pd.read_sql("select * from \"view_company_all_star\"", engine);
# # Print the DataFrame
# stock_df.head()

In [10]:
# Test Data for Initial Model
stock_df = pd.read_csv("../Queries/Testing_Files/company_all_star.csv")
stock_df.head()

Unnamed: 0,ticker,date,company_name,url,employee_count,revenue,sector,city_name,state_name,country,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388,0.65723


In [11]:
# check dtypes
stock_df.dtypes

ticker                     object
date                       object
company_name               object
url                        object
employee_count             object
revenue                    object
sector                     object
city_name                  object
state_name                 object
country                    object
latitude                  float64
longitude                 float64
open_val                  float64
high_val                  float64
low_val                   float64
close_val                 float64
volume                    float64
volume_weight             float64
number_of_transactions      int64
percent_change            float64
dtype: object

In [12]:
# training model needs to be "int64" for it to be fit, converted all float64s
stock_df[[
    "latitude", 
    "longitude", 
    "open_val", 
    "high_val", 
    "low_val", 
    "close_val", 
    "volume",
    "volume_weight",
    "percent_change"]] = stock_df[[
                                "latitude", 
                                "longitude",
                                "open_val", 
                                "high_val", 
                                "low_val", 
                                "close_val", 
                                "volume",
                                "volume_weight",
                                "percent_change"
                                ]].astype("int")

In [13]:
# verify .astype() changes
stock_df.dtypes

ticker                    object
date                      object
company_name              object
url                       object
employee_count            object
revenue                   object
sector                    object
city_name                 object
state_name                object
country                   object
latitude                   int64
longitude                  int64
open_val                   int64
high_val                   int64
low_val                    int64
close_val                  int64
volume                     int64
volume_weight              int64
number_of_transactions     int64
percent_change             int64
dtype: object

In [14]:
# need help with which features are to be used and which to drop

In [15]:
stock_AMD = stock_df.loc[stock_df["ticker"] == "AMD"]
stock_AMD

Unnamed: 0,ticker,date,company_name,url,employee_count,revenue,sector,city_name,state_name,country,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37,-121,42,43,39,43,86689681,41,381223,4
1,AMD,2020-03-15,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37,-121,39,43,38,38,84545868,41,374962,0
2,AMD,2020-03-16,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37,-121,40,42,38,41,92741881,41,434519,4
3,AMD,2020-03-17,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37,-121,39,41,36,39,106949287,39,591862,1
4,AMD,2020-03-18,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37,-121,39,41,37,39,88939024,40,396388,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,AMD,2022-03-06,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37,-121,108,109,102,102,92599972,105,659639,5
500,AMD,2022-03-07,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37,-121,102,109,100,105,135348316,105,845843,2
501,AMD,2022-03-08,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37,-121,108,111,106,111,102310329,109,602679,2
502,AMD,2022-03-09,Advanced Micro Devices Inc,{url:amd.com},5k-10k,over-1b,TBD,Santa Clara,CA,US,37,-121,108,109,103,106,102557375,105,639388,2


In [24]:
# Create our features 
X = stock_AMD.drop("ticker", axis=1)
X = pd.get_dummies(X)

# Target 
y = stock_AMD["ticker"]

# need to combine ticker and date values into one column in format below
# "AMD 03.17.2022" 

# THEN

# DROP long, lat, company_name, url, ticker & date (once columns are combined)
# bucketing for employee_count and revenue

In [17]:
# One dataset for geolocation and another for stock behaviour? run each through the model?

# for each record in the the dataframe:
#     move records ticker into new_ticker
#     if new_ticker = prev_ticker:
#         move the ticker value to previous value
#         and...
#         move the record to the processing_dataframe
#     else: # when the new_ticker and the previous_ticker aren't the same,
#           # we have all records in the processing_df for a given ticker
#         call the function that will do the Gradient Boosting Decision Tree Algorithm (GBDT)
# white_check_mark
# eyes
# raised_hands::skin-tone-4





# 9:01
# you have to us the CSV file with only the tickers in it
# New
# 9:03
# # you have to make all the columns a number
# # what can you use to do this? I forget
# #after you have all columns as numbers, then do the following:
# #sort the dataframe by ticker column
# #prime prev_ticker with first record's ticker value in the datafrome
# for each record in the the dataframe:
#     move records ticker into new_ticker
#     if new_ticker = prev_ticker:
#         move the ticker value to previous value
#         and...
#         move the record to the processing_dataframe
#     else: # when the new_ticker and the previous_ticker aren't the same,
#           # we have all records in the processing_df for a given ticker
#         call the function that will do the Gradient Boosting Decision Tree Algorithm (GBDT)

In [None]:
# FOR LOOP
# prime previous ticker as variable: 

prev_ticker = stock_df[ticker]
    for record in stock_df:
        new_ticker = stock_df[ticker]
        
        if (new_ticker == prev_ticker):
            prev_ticker = stock_df[ticker]
            processing_df.append(record)
        else
            def model_random_forest(processing_df):

In [18]:
X.describe()

Unnamed: 0,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change,...,date_2022-03-09,date_2022-03-10,company_name_Advanced Micro Devices Inc,url_{url:amd.com},employee_count_5k-10k,revenue_over-1b,sector_TBD,city_name_Santa Clara,state_name_CA,country_US
count,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,...,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0
mean,37.0,-121.0,89.797619,91.535714,87.827381,89.68254,59441170.0,89.720238,379324.8,1.640873,...,0.001984,0.001984,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
std,0.0,0.0,26.974967,27.592294,26.210419,26.900938,29707680.0,26.865601,180596.5,1.83407,...,0.044544,0.044544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,37.0,-121.0,39.0,41.0,36.0,38.0,16705910.0,39.0,121839.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,37.0,-121.0,77.0,78.0,76.0,77.0,39327770.0,77.0,258988.5,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,37.0,-121.0,85.0,87.0,83.5,85.0,51266910.0,85.0,330293.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,37.0,-121.0,106.0,108.0,104.0,106.0,69654020.0,106.0,438351.2,3.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,37.0,-121.0,163.0,164.0,156.0,161.0,225368700.0,159.0,1381840.0,11.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
y.value_counts()
# second class is date


AMD    504
Name: ticker, dtype: int64

In [20]:
np.unique(y)

array(['AMD'], dtype=object)

In [21]:
# train the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [22]:
# random forest classfier
# n_estimator default at 100
random_forest = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
random_forest.fit(X_train, y_train)

ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

In [23]:
# confusion matrix
# this matrix is gigantic
y_pred = random_forest.predict(X_test)
confusion_matrix(y_test, y_pred)

AttributeError: 'list' object has no attribute 'take'

In [None]:
balanced_accuracy_score(y_test, y_pred)
# store score (metrics) THEN
# empty processing_df data frame to star the for loop again

In [None]:
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
sorted(zip(X.columns, random_forest.feature_importances_), reverse=True)