In [1]:
import pandas as pd
import numpy as np
import sklearn as sk

# Read the prepared train and test splits from their zipped CSV files
# (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_prepared.zip', 'data/test_prepared.zip')
)

In [3]:
# Prepare the training data for modelling: the column at position 8 of the
# frame becomes the label vector, every other column stays in the features.
# NOTE(review): position 8 is presumably the "ratchg" column that is used as
# the target in the statsmodels cell further down — confirm.
train_np = train.to_numpy()
X_train = np.delete(train_np, 8, axis=1)
y_train = train_np[:, 8]

In [4]:
# Prepare the test data for modelling with the same layout as the training
# split: position 8 holds the labels, the remaining columns are the features.
test_np = test.to_numpy()
X_test = np.delete(test_np, 8, axis=1)
y_test = test_np[:, 8]

In [None]:
import sklearn.linear_model
import sklearn.metrics
import sklearn.preprocessing
import sklearn.compose

# Logistic regression baseline: fit with default settings on the training
# split, predict the test split and report plain accuracy.
model = sklearn.linear_model.LogisticRegression()
model.fit(X_train, y_train)
log_reg_predictions = model.predict(X_test)

log_reg_accuracy = sklearn.metrics.accuracy_score(y_true=y_test, y_pred=log_reg_predictions)
print(log_reg_accuracy)

In [7]:
# Load the full prepared dataset from its zipped CSV file.
full_set = pd.read_csv(filepath_or_buffer='data/full_prepared.zip')

In [9]:
# Prepare the full dataset for modelling, mirroring the train/test cells:
# position 8 holds the labels, the remaining columns are the features.
full_np = full_set.to_numpy()
X_full = np.delete(full_np, 8, axis=1)
y_full = full_np[:, 8]

In [None]:
import sklearn.model_selection

# 4-fold cross-validation of a fresh logistic regression on the full dataset.
# No `scoring` is passed, so the estimator's own default score is reported.
accuracy_scores = sklearn.model_selection.cross_val_score(
    estimator=sklearn.linear_model.LogisticRegression(),
    X=X_full,
    y=y_full,
    cv=4,
)
print('Accuracy by cval: ', accuracy_scores)

In [None]:
# 4-fold cross-validation of logistic regression on the full dataset, scored
# with ROC AUC under the one-vs-one (OvO) multiclass scheme.
roc_auc_ovo_scores = sklearn.model_selection.cross_val_score(
    estimator=sklearn.linear_model.LogisticRegression(),
    X=X_full,
    y=y_full,
    cv=4,
    scoring='roc_auc_ovo',
)
print('ROC AUC by cval: ', roc_auc_ovo_scores)

In [None]:
# 4-fold cross-validation of logistic regression on the full dataset, scored
# with ROC AUC under the one-vs-rest (OvR) multiclass scheme.
roc_auc_ovr_scores = sklearn.model_selection.cross_val_score(
    estimator=sklearn.linear_model.LogisticRegression(),
    X=X_full,
    y=y_full,
    cv=4,
    scoring='roc_auc_ovr',
)
print('ROC AUC by cval: ', roc_auc_ovr_scores)

In [None]:
# Confusion matrix of the logistic regression predictions on the test split.
# The bare `confusion_matrix` name imported here is reused by later cells.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_test, y_pred=log_reg_predictions)

In [None]:
# Macro- and micro-averaged F1 for the logistic regression test predictions,
# printed one per line (macro first, then micro).
f1_macro_score, f1_micro_score = (
    sklearn.metrics.f1_score(y_test, log_reg_predictions, average=averaging)
    for averaging in ('macro', 'micro')
)
print(f1_macro_score, f1_micro_score, sep='\n')

In [None]:
#run Logistic Regression model in another library to get the model summary
import statsmodels.api as sm

# Multinomial logit of "ratchg" on every other column of the training frame.
# NOTE(review): no constant column is added here (e.g. via sm.add_constant),
# so unless `train` already contains one the model is presumably fitted
# without an intercept — confirm this is intended.
target = train.loc[:, "ratchg"]
train_sm = train.drop(columns=['ratchg'])
# Rebinds `model` to the fitted statsmodels results object.
model = sm.MNLogit(target, train_sm).fit()

# Only the last expression of a cell is displayed automatically, so the first
# summary must be printed explicitly; otherwise its output is discarded.
print(model.summary())
model.summary2()

In [None]:
#try to improve Logistic Regression performance with feature engineering (failed)
# One-hot encode the columns at positions 1, 2, 3 and 6.
# The encoder's `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so it is not passed at all; instead
# `sparse_threshold=0` makes the ColumnTransformer always return a dense
# array, which works on both old and new releases.
# NOTE(review): only the four encoded columns are listed and no `remainder`
# is given, so the other feature columns are presumably dropped before the
# polynomial expansion — confirm that this is intended.
transform = sklearn.compose.ColumnTransformer(
    [("categorical", sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"), [1, 2, 3, 6])],
    sparse_threshold=0,
)

X_train_poly = transform.fit_transform(X_train)
X_test_poly = transform.transform(X_test)

# Degree-2 polynomial / interaction terms. The expansion is fitted on the
# training split only and merely applied to the test split, so no transformer
# is ever re-fitted on test data.
polynomial = sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)
X_train_poly = polynomial.fit_transform(X_train_poly)
X_test_poly = polynomial.transform(X_test_poly)

# NOTE: this rebinds `model`, `log_reg_predictions` and `log_reg_accuracy`,
# replacing the values produced by the baseline logistic regression cell.
model = sklearn.linear_model.LogisticRegression().fit(X_train_poly, y_train)
log_reg_predictions = model.predict(X_test_poly)

log_reg_accuracy = sklearn.metrics.accuracy_score(y_test, log_reg_predictions)
print(log_reg_accuracy)

In [None]:
import sklearn.tree

#Decision tree model
# A fixed random_state makes the fitted tree, and therefore the reported
# accuracy, reproducible across kernel restarts; without it the result can
# change from run to run.
DT_model = sklearn.tree.DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
DT_predictions = DT_model.predict(X_test)

# Plain accuracy on the held-out test split.
DT_accuracy = sklearn.metrics.accuracy_score(y_test, DT_predictions)
print(DT_accuracy)

In [None]:
# Macro- and micro-averaged F1 for the decision tree test predictions,
# printed one per line (macro first, then micro).
f1_macro_score, f1_micro_score = (
    sklearn.metrics.f1_score(y_test, DT_predictions, average=averaging)
    for averaging in ('macro', 'micro')
)
print(f1_macro_score, f1_micro_score, sep='\n')

In [None]:
# Confusion matrix of the decision tree predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=DT_predictions)

In [None]:
#perform cross-validation with accuracy scores for Decision Tree model
# A fixed random_state on the estimator keeps the four fold scores
# reproducible between runs. No `scoring` is passed, so the estimator's own
# default score is reported.
accuracy_scores = sklearn.model_selection.cross_val_score(
    sklearn.tree.DecisionTreeClassifier(random_state=42), X_full, y_full, cv=4
)
print('Accuracy by cval: ', accuracy_scores)

In [None]:
import sklearn.ensemble

#Random Forest model
# oob_score=True additionally computes an out-of-bag score during fitting.
# A fixed random_state makes the forest, and with it the test accuracy and
# OOB score, reproducible across kernel restarts.
RF_model = sklearn.ensemble.RandomForestClassifier(oob_score=True, random_state=42).fit(X_train, y_train)

RF_predictions = RF_model.predict(X_test)

# Plain accuracy on the held-out test split.
RF_accuracy = sklearn.metrics.accuracy_score(y_test, RF_predictions)
print(RF_accuracy)

In [None]:
# Print the out-of-bag score stored on the fitted random forest; the
# attribute is read from RF_model, which was constructed with oob_score=True.
print(RF_model.oob_score_)

In [None]:
# Macro- and micro-averaged F1 for the random forest test predictions,
# printed one per line (macro first, then micro).
f1_macro_score, f1_micro_score = (
    sklearn.metrics.f1_score(y_test, RF_predictions, average=averaging)
    for averaging in ('macro', 'micro')
)

print(f1_macro_score, f1_micro_score, sep='\n')

In [None]:
# Confusion matrix of the random forest predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=RF_predictions)