# Train and Evaluate Simple Models

In [None]:
import altair as alt
import joblib
import os
import pandas as pd
# from sklearn.model_selection import train_test_split
# from tqdm import tqdm
# from typing import List, Set
import numpy as np

# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, accuracy_score

In [None]:
# Seed passed as `random_state` to the estimators that accept one.
RANDOM_SEED = 17
# When True, the train/test feature files with the `_w_topics` suffix are loaded.
use_topics = True
# Parent directory of the current working directory; every data path below is built from it.
# NOTE(review): presumably the notebook is run from a sub-folder of the project root - confirm.
project_base = os.path.dirname(os.path.realpath('.'))
print(f'Project base path: {project_base}')

In [None]:
# helper function
def make_kaggle_preds(fitted_model, data, output_file_name):
    """Predict labels for `data` and write them as a Kaggle submission CSV.

    The file has two columns: `id` (0-based row position) and `label`
    (the prediction cast to int). It is written to
    `<project_base>/data/kaggle_preds/<output_file_name>`; the directory is
    created if it does not exist yet.

    Args:
        fitted_model: Any object exposing `predict(data)`.
        data: Feature matrix passed straight to `fitted_model.predict`.
        output_file_name: File name (not a path) of the CSV to write.
    """
    preds = fitted_model.predict(data)
    kaggle_preds_df = pd.DataFrame({'id': list(range(len(preds))), 'label': preds})
    kaggle_preds_df['label'] = kaggle_preds_df['label'].astype(int)

    # `to_csv` does not create missing parent directories, so make sure the
    # output folder is there before writing.
    output_dir = os.path.join(project_base, 'data', 'kaggle_preds')
    os.makedirs(output_dir, exist_ok=True)
    kaggle_preds_df.to_csv(os.path.join(output_dir, output_file_name), index=False)

### Load Data

In [None]:
# Train/test features: the `_w_topics` files are read when `use_topics` is set,
# the plain processed files otherwise.
X_train_processed = pd.read_csv(os.path.join(
    project_base, 'data', 'cleaned_data',
    'processed_train_data_w_topics.csv' if use_topics else 'processed_train_data.csv'))
X_test_processed = pd.read_csv(os.path.join(
    project_base, 'data', 'cleaned_data',
    'processed_test_data_w_topics.csv' if use_topics else 'processed_test_data.csv'))

# The Kaggle features always come from the same file, whatever `use_topics` is.
X_kaggle_processed = pd.read_csv(
    os.path.join(project_base, 'data', 'cleaned_data', 'processed_kaggle_data.csv'))

# Targets for the train and test splits.
y_train = pd.read_csv(
    os.path.join(project_base, 'data', 'cleaned_data', 'processed_train_y.csv'))
y_test = pd.read_csv(
    os.path.join(project_base, 'data', 'cleaned_data', 'processed_test_y.csv'))

In [None]:
# Sanity check: (rows, columns) of the loaded train and test feature frames.
print(f' Train set shape: {X_train_processed.shape}')
print(f' Test set shape: {X_test_processed.shape}')

### Basic Models

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that ignores the features: `strategy='uniform'` picks a class at random.
dummy_classifier = DummyClassifier(strategy='uniform', random_state=RANDOM_SEED)

dummy_classifier.fit(X_train_processed, y_train)
# Last expression of the cell: the notebook displays the test-set score.
dummy_classifier.score(X_test_processed, y_test)

In [None]:
# Write the baseline's Kaggle predictions to dummy_preds.csv via the helper above.
make_kaggle_preds(fitted_model=dummy_classifier, data=X_kaggle_processed, output_file_name='dummy_preds.csv')

### SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Seeded like the other estimators in this notebook so reruns give the same model.
sgd_classifier = SGDClassifier(random_state=RANDOM_SEED)

# `y_train` is a DataFrame read from CSV; flatten it to 1-D (as the random
# forest cell does) so scikit-learn does not receive a column vector.
sgd_classifier.fit(X_train_processed, y_train.values.ravel())

In [None]:
# Hard class predictions of the fitted SGD model on the test features.
sgd_test_preds = sgd_classifier.predict(X_test_processed)

In [None]:
# Share of test rows where the SGD prediction equals the true label.
print(f'Accuracy Score:\n\n{accuracy_score(y_test, sgd_test_preds)}')

In [None]:
# Per-class precision/recall/F1 and the confusion matrix use the hard predictions.
print(f'Classification Report:\n\n{classification_report(y_test, sgd_test_preds)}')
print(f'Confusion Matrix:\n\n{confusion_matrix(y_test, sgd_test_preds)}')

# The precision-recall curve needs continuous scores to sweep thresholds over;
# feeding it the 0/1 predictions collapses the curve to a single operating
# point, so use the signed distance to the decision boundary instead.
p, r, threshold = precision_recall_curve(y_test, sgd_classifier.decision_function(X_test_processed))

In [None]:
# Write the SGD model's Kaggle predictions to sgd_preds.csv.
make_kaggle_preds(fitted_model=sgd_classifier, data=X_kaggle_processed, output_file_name='sgd_preds.csv')

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters, seeded for repeatability.
rf_classifier = RandomForestClassifier(random_state=RANDOM_SEED)

# `.values.ravel()` flattens the label DataFrame to a 1-D array before fitting.
rf_classifier.fit(X_train_processed, y_train.values.ravel())

In [None]:
# Persist the fitted random forest to data/trained_models/rf_clf.pkl.
# NOTE(review): assumes the data/trained_models directory already exists - confirm.
joblib.dump(rf_classifier, os.path.join(project_base, 'data', 'trained_models','rf_clf.pkl'))

In [None]:
# Hard class predictions of the default random forest on the test features.
rf_test_preds = rf_classifier.predict(X_test_processed)

In [None]:
# Test accuracy of whatever `rf_test_preds` currently holds.
print(f'Accuracy Score:\n\n{accuracy_score(y_test, rf_test_preds)}')

In [None]:
# Write the default random forest's Kaggle predictions.
# NOTE(review): the file name says "topics" even when `use_topics` is False - confirm intent.
make_kaggle_preds(fitted_model=rf_classifier, data=X_kaggle_processed, output_file_name='rf_clf_topics_preds.csv')

### Random Forest With Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive grid: 1 * 4 * 2 * 3 * 3 * 4 = 288 parameter combinations,
# each evaluated with 3-fold cross-validation.
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Seed the base estimator (as the default forest above is) so the comparison
# between parameter combinations is not blurred by run-to-run randomness.
gs_rf_clf = RandomForestClassifier(random_state=RANDOM_SEED)
# n_jobs=-1 uses every available core; verbose=2 logs progress per fit.
grid_search = GridSearchCV(estimator=gs_rf_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

In [None]:
# run grid search to find best parameters
# Labels are flattened to 1-D, matching the plain random forest fit above;
# `y_train.values` alone is a column vector.
grid_search.fit(X_train_processed, y_train.values.ravel())

In [None]:
# best params (copied from a previous grid-search run so the search does not
# have to be repeated every time the notebook is executed)
# {'bootstrap': True,
#  'max_depth': 90,
#  'max_features': 3,
#  'min_samples_leaf': 3,
#  'min_samples_split': 12,
#  'n_estimators': 1000}

# best model, seeded like the other estimators so reruns are repeatable
cv_rf_classifier = RandomForestClassifier(max_depth=90, max_features=3, min_samples_leaf=3,
                       min_samples_split=12, n_estimators=1000,
                       random_state=RANDOM_SEED)

# Flatten the label DataFrame to 1-D; `y_train.values` alone is a column vector.
cv_rf_classifier.fit(X_train_processed, y_train.values.ravel())

In [None]:
# Test predictions of the tuned forest.
# NOTE: this rebinds `rf_test_preds`, replacing the default forest's predictions.
rf_test_preds = cv_rf_classifier.predict(X_test_processed)

In [None]:
# Test accuracy of whatever `rf_test_preds` currently holds.
print(f'Accuracy Score:\n\n{accuracy_score(y_test, rf_test_preds)}')

In [None]:
# Write the tuned forest's Kaggle predictions to cv_rf_clf_preds.csv.
make_kaggle_preds(fitted_model=cv_rf_classifier, data=X_kaggle_processed, output_file_name='cv_rf_clf_preds.csv')

In [None]:
# print(f'Accuracy Score:\n\n{accuracy_score(y_test, rf_test_preds)}')
# print(f'Classification Report:\n\n{classification_report(y_test, rf_test_preds)}')
# print(f'Confusion Matrix:\n\n{confusion_matrix(y_test, rf_test_preds)}')
# p, r, threshold = precision_recall_curve(y_test, rf_test_preds)

In [None]:
# wc_analysis_df = pd.DataFrame({'word_count': list(X_test['word_count']), 
#                                 'true_label':y_test, 
#                                 'predicted_label': rf_test_preds})
# print(f'Accuracy Score check: {len(wc_analysis_df[wc_analysis_df.true_label == wc_analysis_df.predicted_label]) / len(wc_analysis_df)}')


In [None]:
# wc_set = set(wc_analysis_df.word_count)
# accuracy_list = []
# word_freq_list = []

# for wc in wc_set:
#     current_df = wc_analysis_df[wc_analysis_df.word_count == wc].copy()
#     current_df_len = len(current_df)
#     accuracy = len(current_df[current_df.true_label == current_df.predicted_label]) / current_df_len
#     print(f'For samples with length {wc} the model accuracy was {accuracy*100:.3f}% with {current_df_len} total words\n')
#     accuracy_list.append(accuracy)
#     word_freq_list.append(current_df_len)

In [None]:
# full_wc_analysis = pd.DataFrame({'word_count': list(range(1, len(accuracy_list)+1)), 
#                                  'acurracy':accuracy_list,
#                                  'word_freq':word_freq_list})
# # full_wc_analysis.head()
# l = alt.Chart(full_wc_analysis).mark_line(color='red').encode(
#     x = alt.X('word_count'),
#     y = alt.Y('acurracy'))

# b = alt.Chart(full_wc_analysis).mark_bar().encode(
#     x = alt.X('word_count'),
#     y = alt.Y('word_freq'))

# print(f'Correlation between word frequency and accuracy: {full_wc_analysis.word_freq.corr(full_wc_analysis.acurracy)*100:.2f}%')
# (b+l).resolve_scale(y='independent')


In [None]:
# full_wc_analysis.word_freq.corr(full_wc_analysis.acurracy)

In [None]:
# # where do our error come from
# rf_test_pred_probas = rf_classifier.predict_proba(X_test_processed)
# rf_test_pred_probas_df = pd.DataFrame({'id': list(range(len(rf_test_pred_probas))), 
#                                        'proba':rf_test_pred_probas[:,1],
#                                        'rounded_proba': np.round(rf_test_pred_probas[:,1]),
#                                        'label': y_test})

In [None]:
# rf_test_pred_probas_df_wrong = rf_test_pred_probas_df[rf_test_pred_probas_df.label !=rf_test_pred_probas_df.rounded_proba]
# rf_test_pred_probas_df_wrong.shape

In [None]:
# rf_test_pred_probas_df_wrong.groupby(['label', 'rounded_proba','proba'],
#                                      as_index=False).count().sort_values('id', ascending=False)

### Create Data for Kaggle Scoreboard 

In [None]:
# rf_kaggle_preds = rf_classifier.predict(X_kaggle_processed)

In [None]:
# kaggle_preds_df = pd.DataFrame({'id': list(range(len(rf_kaggle_preds))), 'label':rf_kaggle_preds})
# kaggle_preds_df['label'] = kaggle_preds_df['label'].astype(int)
# kaggle_preds_df.head()

In [None]:
# kaggle_preds_df.to_csv(os.path.join(project_base, 'data', 'cleaned_data', 'rf_kaggle_preds.csv'), index=False)

In [None]:
# # in case we want to try a different decision threshold
# rf_kaggle_pred_probas = rf_classifier.predict_proba(X_kaggle_processed)
# kaggle_pred_probas_df = pd.DataFrame({'id': list(range(len(rf_kaggle_pred_probas))), 'label':rf_kaggle_pred_probas[:,0]})

### Pytorch NN

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset, DataLoader

In [None]:
# `torch.from_numpy` only accepts numpy arrays, so the pandas frames are
# converted first. Labels are flattened to 1-D so that they have the same shape
# as the squeezed model output they are compared with in the training loop.
train_dataset = TensorDataset(
    torch.from_numpy(X_train_processed.to_numpy(dtype='float32')).float(),
    torch.from_numpy(y_train.to_numpy(dtype='float32').ravel()).float(),
)
test_dataset = TensorDataset(
    torch.from_numpy(X_test_processed.to_numpy(dtype='float32')).float(),
    torch.from_numpy(y_test.to_numpy(dtype='float32').ravel()).float(),
)
# kaggle_dataset = TensorDataset(torch.from_numpy(X_kaggle_processed).float(), torch.from_numpy(y_test.values).float())

In [None]:
# Mini-batches of 512 rows. Only the training loader shuffles; the test loader
# keeps the dataset order. `drop_last=False` keeps a final, smaller batch.
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True, drop_last=False)
test_dataloader = DataLoader(test_dataset, batch_size=512, shuffle=False, drop_last=False)
# kaggle_dataloader = DataLoader(kaggle_dataset, batch_size=512, shuffle=False, drop_last=False)

In [None]:
class NN(nn.Module):
    """Small fully connected network with two hidden layers.

    Layout: Linear -> BatchNorm -> ReLU, twice, followed by a Linear output
    layer with a sigmoid, so every output lies in [0, 1].

    Args:
        input_shape: Number of input features per sample.
        output_shape: Number of output units.
        hidden_1: Width of the first hidden layer (default 8).
        hidden_2: Width of the second hidden layer (default 16).
    """

    def __init__(self, input_shape: int, output_shape: int,
                 hidden_1: int = 8, hidden_2: int = 16):
        super().__init__()

        # Layers are created in forward order so that seeded initialisation
        # stays the same as before the widths became configurable.
        self.fc_1 = nn.Linear(input_shape, hidden_1)
        self.bn1 = nn.BatchNorm1d(hidden_1)

        self.fc_2 = nn.Linear(hidden_1, hidden_2)
        self.bn2 = nn.BatchNorm1d(hidden_2)

        self.fc_3 = nn.Linear(hidden_2, output_shape)

    def forward(self, x):
        """Map a (batch, input_shape) tensor to (batch, output_shape) sigmoid outputs."""
        x = F.relu(self.bn1(self.fc_1(x)))
        x = F.relu(self.bn2(self.fc_2(x)))
        # Sigmoid is applied here, so the loss must expect probabilities, not logits.
        x = torch.sigmoid(self.fc_3(x))
        return x
    
# check the model dimensions: a throwaway network with 31 inputs and 2 outputs
# is fed 100 random rows; the printed shape shows the resulting output size.
# NOTE: `model` and `x` are rebound by later cells.
model = NN(31, 2)
x = torch.randn(100, 31)
print(model(x).shape)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Device: {device}')

In [None]:
# hyperparameters
# The input width is taken from the training frame rather than hard-coded:
# the number of feature columns depends on which processed file was loaded
# (e.g. with or without topic features), and a mismatch makes the first
# linear layer fail on the first batch.
input_size = X_train_processed.shape[1]
# One output unit: the sigmoid output is compared against a single 0/1 label.
output_size = 1
learning_rate = 0.003
epochs = 10

In [None]:
# Build the network that is actually trained (replaces the dimension-check model)
# and move its parameters to the selected device.
model = NN(input_size, output_size)
model.to(device)

In [None]:
# Binary cross-entropy on probabilities; NN.forward already applies the sigmoid.
criterion = nn.BCELoss()
# if multiclass
# criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with the learning rate set above.
adam = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# not great performance, code still needs to be cleaned up
# Each epoch: one pass over the training batches, then accuracy on the test set.
for i in range(1, epochs+1):

    # Training mode (batch-norm uses batch statistics); set once per epoch
    # instead of once per batch.
    model.train()
    for x, y in train_dataloader:
        x, y = x.to(device), y.to(device)

        # Clear the gradients left over from the previous step.
        adam.zero_grad()

        # predictions: (batch, 1) probabilities, squeezed to (batch,) to match y
        y_hat = model(x)
        loss = criterion(y_hat.squeeze(dim=1), y)
        loss.backward()
        adam.step()

    # Evaluation mode (batch-norm uses running statistics), no gradient tracking.
    model.eval()
    num_correct = 0
    total = 0
    with torch.no_grad():
        for x, y in test_dataloader:
            x, y = x.to(device), y.to(device)

            # Round the probabilities to hard 0/1 predictions.
            y_hat_preds = model(x).round().squeeze(dim=1)

            # `.item()` keeps the counter a plain int, so the accuracy printed
            # below is a float rather than a tensor repr.
            num_correct += (y_hat_preds == y).sum().item()
            total += y.size(0)

    print(f'Epoch {i} test accuracy {num_correct / total}')

In [None]:
# Inference with the trained network. The frames are converted to numpy first
# (`torch.from_numpy` rejects DataFrames), inputs are moved to the model's
# device, and results are brought back to the CPU before the numpy conversion.
# NOTE(review): the Kaggle features are loaded from a file without the topic
# suffix; confirm it has the same columns as the training data.
model.eval()
with torch.no_grad():
    final_preds = model(
        torch.from_numpy(X_test_processed.to_numpy(dtype='float32')).float().to(device)
    ).cpu().numpy()
    kaggle_preds = model(
        torch.from_numpy(X_kaggle_processed.to_numpy(dtype='float32')).float().to(device)
    ).cpu().numpy()

In [None]:
# Round the network's outputs to the nearest integer to get hard class predictions.
final_class_preds = final_preds.round()
final_kaggle_preds = kaggle_preds.round()

In [None]:
# Test-set metrics for the neural network's rounded predictions.
print(f'Accuracy Score:\n\n{accuracy_score(y_test, final_class_preds)}')
print(f'Classification Report:\n\n{classification_report(y_test, final_class_preds)}')
print(f'Confusion Matrix:\n\n{confusion_matrix(y_test, final_class_preds)}')

In [None]:
# Submission frame for the network: `id` is the 0-based row position and
# `label` the first (only) output column. Labels are cast to int so the file
# has the same format as the ones written by `make_kaggle_preds`
# (rounded floats would otherwise be written as 0.0 / 1.0).
kaggle_nn_pred_df = pd.DataFrame({'id': list(range(len(final_kaggle_preds))), 'label': final_kaggle_preds[:, 0].astype(int)})

In [None]:
# Write the network's Kaggle predictions without the index column.
# NOTE(review): this goes to data/cleaned_data, whereas make_kaggle_preds writes
# to data/kaggle_preds - confirm which location is intended.
kaggle_nn_pred_df.to_csv(os.path.join(project_base, 'data', 'cleaned_data', 'nn_kaggle_preds.csv'), index=False)