### Imports

In [18]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import os
import warnings

In [19]:
warnings.filterwarnings('ignore')

### Read Data

In [20]:
# Load the train and test CSVs, using the 'id' column as the row index of each frame.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../data/Annotation/joined/joined_train.csv',
        '../data/Annotation/joined/joined_test.csv',
    )
)
# Show (rows, columns) for each split, one per line.
print(train_df.shape, test_df.shape, sep='\n')

(12271, 772)
(3068, 772)


### Baseline Models

In [21]:
# Initialize the baseline models with default hyper-parameters.
# The only deviation from the defaults is a fixed seed on the estimators that are
# stochastic under their default settings (random forest bootstrap / feature
# sampling, MLP weight initialisation and shuffling), so that re-running the
# notebook reproduces the same reports.
RANDOM_STATE = 42

lr = LogisticRegression()
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=RANDOM_STATE)
gnb = GaussianNB()
mlp = MLPClassifier(random_state=RANDOM_STATE)

In [22]:
def print_classification_report(y_true, y_pred, filename, metrics_dir='../metrics'):
    """Print a classification report and save the same text to a file.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, passed to ``classification_report`` together with ``y_true``.
    filename : str
        Name of the report file created inside ``metrics_dir``.
    metrics_dir : str, optional
        Directory that receives the report file. It is created when missing.
        Defaults to ``'../metrics'``.

    Returns
    -------
    None
        The report is printed to stdout and written to disk.
    """
    cls_report = classification_report(y_true, y_pred)
    print('\nClassification Report')
    print('======================================================')
    print('\n', cls_report)
    # Create the output directory on demand so a fresh checkout does not fail
    # with FileNotFoundError on the first report.
    os.makedirs(metrics_dir, exist_ok=True)
    # Plain write mode is sufficient: the file is never read back through this handle.
    with open(os.path.join(metrics_dir, filename), 'w', encoding='utf-8') as f:
        f.write(cls_report)

In [23]:
# Feature frames: every column of each split except the target column 'label'.
train = train_df.drop('label', axis=1)
test = test_df.drop('label', axis=1)

In [24]:
# Logistic regression baseline: fit on the training features, predict the test
# split, then print the report and save it as 'sklearn_lr.txt'.
pred = lr.fit(train, train_df['label']).predict(test)
print_classification_report(y_true=test_df['label'], y_pred=pred, filename='sklearn_lr.txt')


Classification Report

               precision    recall  f1-score   support

           0       0.62      0.58      0.60       329
           1       0.43      0.04      0.07        74
           2       0.44      0.03      0.05       160
           3       0.75      0.86      0.80      1185
           4       0.51      0.46      0.48       478
           5       0.57      0.51      0.54       162
           6       0.58      0.70      0.64       680

    accuracy                           0.65      3068
   macro avg       0.56      0.45      0.45      3068
weighted avg       0.63      0.65      0.62      3068



In [25]:
# k-nearest-neighbours baseline: fit on the training features, predict the test
# split, then print the report and save it as 'sklearn_knn.txt'.
pred = knn.fit(train, train_df['label']).predict(test)
print_classification_report(y_true=test_df['label'], y_pred=pred, filename='sklearn_knn.txt')


Classification Report

               precision    recall  f1-score   support

           0       0.41      0.43      0.42       329
           1       0.22      0.09      0.13        74
           2       0.14      0.09      0.11       160
           3       0.62      0.74      0.67      1185
           4       0.39      0.33      0.36       478
           5       0.53      0.22      0.31       162
           6       0.48      0.47      0.48       680

    accuracy                           0.51      3068
   macro avg       0.40      0.34      0.35      3068
weighted avg       0.49      0.51      0.49      3068



In [26]:
# Random forest baseline: fit on the training features, predict the test
# split, then print the report and save it as 'sklearn_rf.txt'.
pred = rf.fit(train, train_df['label']).predict(test)
print_classification_report(y_true=test_df['label'], y_pred=pred, filename='sklearn_rf.txt')


Classification Report

               precision    recall  f1-score   support

           0       0.68      0.57      0.62       329
           1       0.56      0.07      0.12        74
           2       0.82      0.06      0.11       160
           3       0.70      0.86      0.77      1185
           4       0.44      0.23      0.30       478
           5       0.64      0.31      0.42       162
           6       0.53      0.78      0.63       680

    accuracy                           0.62      3068
   macro avg       0.62      0.41      0.42      3068
weighted avg       0.62      0.62      0.58      3068



In [27]:
# Gaussian naive Bayes baseline: fit on the training features, predict the test
# split, then print the report and save it as 'sklearn_gnb.txt'.
pred = gnb.fit(train, train_df['label']).predict(test)
print_classification_report(y_true=test_df['label'], y_pred=pred, filename='sklearn_gnb.txt')


Classification Report

               precision    recall  f1-score   support

           0       0.35      0.34      0.34       329
           1       0.12      0.28      0.17        74
           2       0.10      0.68      0.17       160
           3       0.75      0.62      0.68      1185
           4       0.28      0.21      0.24       478
           5       0.45      0.21      0.29       162
           6       0.29      0.00      0.01       680

    accuracy                           0.36      3068
   macro avg       0.33      0.34      0.27      3068
weighted avg       0.46      0.36      0.37      3068



In [None]:
# Multi-layer perceptron baseline: fit on the training features, predict the test
# split, then print the report and save it as 'sklearn_mlp.txt'.
pred = mlp.fit(train, train_df['label']).predict(test)
print_classification_report(y_true=test_df['label'], y_pred=pred, filename='sklearn_mlp.txt')