### Import libraries

In [1]:
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

### Load data

In [2]:
df = pd.read_csv('../data/cleaned_train_data.csv')

In [3]:
df['label'].value_counts()

1    1544
0    1503
Name: label, dtype: int64

In [4]:
X = df['text']
y = df['label']

### Split data

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.3)

### Modeling with CountVectorizer

#### KNN

In [14]:
knn_pipe_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('knn', KNeighborsClassifier())
])

knn_pipe_cvec_params = {
    'cvec__min_df': [1, 2, 3],
    'cvec__ngram_range': [(1,1), (1,2)],
    'knn__n_neighbors': [i for i in range(1, 101, 5)],
    'knn__weights': ['uniform', 'distance'],
}

In [15]:
knn_cvec_gs = GridSearchCV(knn_pipe_cvec, param_grid=knn_pipe_cvec_params, cv=5)

In [16]:
knn_cvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [17]:
knn_cvec_gs.best_score_

0.8348968105065666

In [42]:
knn_cvec_gs.best_estimator_.score(X_train, y_train)

0.9995309568480301

In [18]:
knn_cvec_gs.best_estimator_.score(X_test, y_test)

0.9060109289617486

#### SVC

In [22]:
svc_pipe_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('svc', SVC())
])

svc_pipe_cvec_params = {
    'cvec__min_df': [1, 2, 3],
    'cvec__ngram_range': [(1,1), (1,2)],
    'svc__gamma': ['scale'],
    'svc__C': [c**0.01 for c in range(1, 1001, 25)]
}

In [23]:
svc_cvec_gs = GridSearchCV(svc_pipe_cvec, param_grid=svc_pipe_cvec_params, cv=5)

In [24]:
svc_cvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [25]:
svc_cvec_gs.best_score_

0.8954033771106942

In [43]:
svc_cvec_gs.best_estimator_.score(X_train, y_train)

0.9821763602251408

In [26]:
svc_cvec_gs.best_estimator_.score(X_test, y_test)

0.9158469945355191

#### Adaboost

In [8]:
ada_pipe_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('ada', AdaBoostClassifier(DecisionTreeClassifier(random_state=42), random_state=42))
])

ada_pipe_cvec_params = {
    'cvec__min_df': [1, 2, 3],
    'cvec__ngram_range': [(1,1), (1,2)],
    'ada__n_estimators': [i for i in range(50, 150, 10)],
    'ada__base_estimator__max_depth': [i for i in range(1, 5)],
}

In [9]:
ada_cvec_gs = GridSearchCV(ada_pipe_cvec, param_grid=ada_pipe_cvec_params, cv=5)

In [10]:
ada_cvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [11]:
ada_cvec_gs.best_score_

0.9226078799249531

In [13]:
ada_cvec_gs.best_estimator_.score(X_train, y_train)

0.9995309568480301

In [12]:
ada_cvec_gs.best_estimator_.score(X_test, y_test)

0.921311475409836

### Modeling with TfidfVectorizer

#### KNN

In [27]:
knn_pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('knn', KNeighborsClassifier())
])

knn_pipe_tvec_params = {
    'tvec__min_df': [1, 2, 3],
    'tvec__ngram_range': [(1,1), (1,2)],
    'knn__n_neighbors': [i for i in range(1, 101, 5)],
    'knn__weights': ['uniform', 'distance'],
}

In [28]:
knn_tvec_gs = GridSearchCV(knn_pipe_tvec, param_grid=knn_pipe_tvec_params, cv=5)

In [29]:
knn_tvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [30]:
knn_tvec_gs.best_score_

0.7922138836772983

In [44]:
knn_tvec_gs.best_estimator_.score(X_train, y_train)

0.9985928705440901

In [31]:
knn_tvec_gs.best_estimator_.score(X_test, y_test)

0.7704918032786885

#### SVC

In [32]:
svc_pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('svc', SVC())
])

svc_pipe_tvec_params = {
    'tvec__min_df': [1, 2, 3],
    'tvec__ngram_range': [(1,1), (1,2)],
    'svc__gamma': ['scale'],
    'svc__C': [c**0.01 for c in range(1, 1001, 25)]
}

In [33]:
svc_tvec_gs = GridSearchCV(svc_pipe_tvec, param_grid=svc_pipe_tvec_params, cv=5)

In [34]:
svc_tvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [35]:
svc_tvec_gs.best_score_

0.8719512195121951

In [45]:
svc_tvec_gs.best_estimator_.score(X_train, y_train)

0.9840525328330206

In [36]:
svc_tvec_gs.best_estimator_.score(X_test, y_test)

0.8688524590163934

#### Adaboost

In [37]:
ada_pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('ada', AdaBoostClassifier(DecisionTreeClassifier(random_state=42), random_state=42))
])

ada_pipe_tvec_params = {
    'tvec__min_df': [1, 2, 3],
    'tvec__ngram_range': [(1,1), (1,2)],
    'ada__n_estimators': [i for i in range(50, 150, 10)],
    'ada__base_estimator__max_depth': [i for i in range(1, 5)],
}

In [38]:
ada_tvec_gs = GridSearchCV(ada_pipe_tvec, param_grid=ada_pipe_tvec_params, cv=5)

In [39]:
ada_tvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [40]:
ada_tvec_gs.best_score_

0.9197936210131332

In [46]:
ada_tvec_gs.best_estimator_.score(X_train, y_train)

0.950750469043152

In [41]:
ada_tvec_gs.best_estimator_.score(X_test, y_test)

0.9278688524590164

### Score table

In [48]:
scores = pd.DataFrame()

In [49]:
scores['Model Type'] = [
    'CountVectorization-KNN', 'CountVectorization-SVC', 'CountVectorization-Adaboost',
    'TFIDFVectorization-KNN', 'TFIDFVectorization-SVC', 'TFIDFVectorization-Adaboost'
]

scores['Train Accuracy'] = [
    knn_cvec_gs.best_estimator_.score(X_train, y_train), knn_tvec_gs.best_estimator_.score(X_train, y_train),
    svc_cvec_gs.best_estimator_.score(X_train, y_train), svc_tvec_gs.best_estimator_.score(X_train, y_train),
    ada_cvec_gs.best_estimator_.score(X_train, y_train), ada_tvec_gs.best_estimator_.score(X_train, y_train)
]

scores['Test Accuracy'] = [
    knn_cvec_gs.best_estimator_.score(X_test, y_test), knn_tvec_gs.best_estimator_.score(X_test, y_test),
    svc_cvec_gs.best_estimator_.score(X_test, y_test), svc_tvec_gs.best_estimator_.score(X_test, y_test),
    ada_cvec_gs.best_estimator_.score(X_test, y_test), ada_tvec_gs.best_estimator_.score(X_test, y_test)
]

scores['Accuracy Difference'] = scores['Train Accuracy'] - scores['Test Accuracy']

In [50]:
scores.sort_values(by='Accuracy Difference',ascending=True)

Unnamed: 0,Model Type,Train Accuracy,Test Accuracy,Accuracy Difference
5,TFIDFVectorization-Adaboost,0.95075,0.927869,0.022882
2,CountVectorization-Adaboost,0.982176,0.915847,0.066329
4,TFIDFVectorization-SVC,0.999531,0.921311,0.078219
0,CountVectorization-KNN,0.999531,0.906011,0.09352
3,TFIDFVectorization-KNN,0.984053,0.868852,0.1152
1,CountVectorization-SVC,0.998593,0.770492,0.228101
