In [1]:
#import libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb



from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score


import matplotlib.pyplot as plt
from xgboost import XGBClassifier



In [2]:
df = pd.read_csv('/Users/devpatel/Desktop/FALL 23/FinTech 540/grouped.csv')
df.head()

Unnamed: 0,Date,Time-Interval,Sector,Avg_Rate_of_Return,Top_Sector
0,6/1/20,9:30:00,ENERGY & TRANSPORTATION,-0.002969,0
1,6/1/20,9:30:00,FINANCE,0.005628,0
2,6/1/20,9:30:00,LIFE SCIENCES,-0.001304,0
3,6/1/20,9:30:00,MANUFACTURING,0.00673,1
4,6/1/20,9:30:00,TECHNOLOGY,-0.000366,0


In [3]:
data = df.copy()
#reformate Date column
data['Date'] = pd.to_datetime(data['Date'])
data.head()

Unnamed: 0,Date,Time-Interval,Sector,Avg_Rate_of_Return,Top_Sector
0,2020-06-01,9:30:00,ENERGY & TRANSPORTATION,-0.002969,0
1,2020-06-01,9:30:00,FINANCE,0.005628,0
2,2020-06-01,9:30:00,LIFE SCIENCES,-0.001304,0
3,2020-06-01,9:30:00,MANUFACTURING,0.00673,1
4,2020-06-01,9:30:00,TECHNOLOGY,-0.000366,0


In [4]:
#convert Time Interval to datetime
data['Time-Interval'] = pd.to_datetime(data['Time-Interval'])
data.head()

Unnamed: 0,Date,Time-Interval,Sector,Avg_Rate_of_Return,Top_Sector
0,2020-06-01,2023-11-08 09:30:00,ENERGY & TRANSPORTATION,-0.002969,0
1,2020-06-01,2023-11-08 09:30:00,FINANCE,0.005628,0
2,2020-06-01,2023-11-08 09:30:00,LIFE SCIENCES,-0.001304,0
3,2020-06-01,2023-11-08 09:30:00,MANUFACTURING,0.00673,1
4,2020-06-01,2023-11-08 09:30:00,TECHNOLOGY,-0.000366,0


In [5]:
data['year'] = data['Date'].dt.year
data['month'] = data['Date'].dt.month
data['day'] = data['Date'].dt.day
data['hour'] = data['Time-Interval'].dt.hour
data['minute'] = data['Time-Interval'].dt.minute
data['second'] = data['Time-Interval'].dt.second
data.head()

Unnamed: 0,Date,Time-Interval,Sector,Avg_Rate_of_Return,Top_Sector,year,month,day,hour,minute,second
0,2020-06-01,2023-11-08 09:30:00,ENERGY & TRANSPORTATION,-0.002969,0,2020,6,1,9,30,0
1,2020-06-01,2023-11-08 09:30:00,FINANCE,0.005628,0,2020,6,1,9,30,0
2,2020-06-01,2023-11-08 09:30:00,LIFE SCIENCES,-0.001304,0,2020,6,1,9,30,0
3,2020-06-01,2023-11-08 09:30:00,MANUFACTURING,0.00673,1,2020,6,1,9,30,0
4,2020-06-01,2023-11-08 09:30:00,TECHNOLOGY,-0.000366,0,2020,6,1,9,30,0


In [6]:
#drop Date and Time-Interval columns
data.drop(['Date','Time-Interval'],axis=1,inplace=True)
data.head()

Unnamed: 0,Sector,Avg_Rate_of_Return,Top_Sector,year,month,day,hour,minute,second
0,ENERGY & TRANSPORTATION,-0.002969,0,2020,6,1,9,30,0
1,FINANCE,0.005628,0,2020,6,1,9,30,0
2,LIFE SCIENCES,-0.001304,0,2020,6,1,9,30,0
3,MANUFACTURING,0.00673,1,2020,6,1,9,30,0
4,TECHNOLOGY,-0.000366,0,2020,6,1,9,30,0


In [7]:
#find unique sectors from Sector column
data['Sector'].unique()

#one hot encoding
data = pd.get_dummies(data,columns=['Sector'])
data.head()

Unnamed: 0,Avg_Rate_of_Return,Top_Sector,year,month,day,hour,minute,second,Sector_ENERGY & TRANSPORTATION,Sector_FINANCE,Sector_LIFE SCIENCES,Sector_MANUFACTURING,Sector_TECHNOLOGY,Sector_TRADE & SERVICES
0,-0.002969,0,2020,6,1,9,30,0,1,0,0,0,0,0
1,0.005628,0,2020,6,1,9,30,0,0,1,0,0,0,0
2,-0.001304,0,2020,6,1,9,30,0,0,0,1,0,0,0
3,0.00673,1,2020,6,1,9,30,0,0,0,0,1,0,0
4,-0.000366,0,2020,6,1,9,30,0,0,0,0,0,1,0


In [8]:
#split data into train and test
X = data.drop(['Top_Sector'],axis=1)
y = data['Top_Sector']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42, stratify=y)

In [9]:
#naive untuned model
xgb_naive = XGBClassifier()
xgb_naive.fit(X_train,y_train)
y_pred = xgb_naive.predict(X_test)
#classification report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.98      0.92     14696
           1       0.67      0.25      0.36      2948

    accuracy                           0.85     17644
   macro avg       0.77      0.61      0.64     17644
weighted avg       0.83      0.85      0.82     17644



In [10]:
print('Accuracy Score: ',accuracy_score(y_test,y_pred))
print('AUC Score: ',roc_auc_score(y_test,y_pred))

Accuracy Score:  0.8535479483110405
AUC Score:  0.6105468845213237


In [11]:
# XGBoost Classifier
xgb = XGBClassifier()
pipe_xgb = Pipeline([('xgb', xgb)])

param_grid = {'xgb__n_estimators': [200, 300, 400, 500, 600],'xgb__max_depth': [2,4,6,10,15],'xgb__learning_rate': [0.01, 0.05, 0.1, 0.15],
              'xgb__min_child_weight': [1, 2, 3],'xgb__reg_alpha' : [0.0, 0.5, 1.0],'xgb__reg_lambda' : [0.0, 0.5, 1.0, 2.0],
              'xgb__gamma': [0.0, 1, 3, 6],
              'xgb__colsample_bytree': [0.4, 0.5, 0.7, .9]}

#randomized search
rs_xgb = RandomizedSearchCV(pipe_xgb, param_grid, cv=5, n_jobs=-1, random_state=42, n_iter=500, scoring='roc_auc')

#fit model
rs_xgb.fit(X_train, y_train)

#best params
rs_xgb.best_params_

{'xgb__reg_lambda': 1.0,
 'xgb__reg_alpha': 0.0,
 'xgb__n_estimators': 500,
 'xgb__min_child_weight': 2,
 'xgb__max_depth': 10,
 'xgb__learning_rate': 0.01,
 'xgb__gamma': 6,
 'xgb__colsample_bytree': 0.9}

In [12]:
#extract best model
best_model = rs_xgb.best_estimator_

#train best model with full train set
best_model.fit(X_train, y_train)

#predict on test set
y_test_pred = best_model.predict(X_test)

In [15]:
import pickle

# Assuming best_model is your XGBClassifier
model_params = best_model.get_params()

# Save model parameters with pickle
with open('model_params.pkl', 'wb') as file:  # Notice the 'wb' for writing in binary mode
    pickle.dump(model_params, file)


In [16]:

#classification report
print(classification_report(y_test,y_test_pred))

print('Accuracy Score: ',accuracy_score(y_test,y_test_pred))

print('AUC Score: ',roc_auc_score(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.86      0.98      0.92     14696
           1       0.72      0.21      0.33      2948

    accuracy                           0.85     17644
   macro avg       0.79      0.60      0.62     17644
weighted avg       0.84      0.85      0.82     17644

Accuracy Score:  0.8547948311040581
AUC Score:  0.598143773511322
