# Testing out different Classifiers

    1. Random Forest
    2. XGBoost
    3. SVM
    4. Multinomial Naïve Bayes
    5. Logistic Regression


In [1]:

import pandas as pd
import seaborn as sns
from comet_ml import Experiment
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from xgboost import XGBRegressor, XGBClassifier

In [2]:
# Load the tweet dataset; the features are the 'text_tokenized' column and
# the target is the 'label' column.
# NOTE(review): presumably 'text_tokenized' holds pre-tokenized text as one
# string per row -- confirm against the preprocessing step.
df = pd.read_csv('../data/tweets_50.csv')
X = df['text_tokenized']
y = df['label']

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Compare several classifiers on identical TF-IDF features using 5-fold
# cross-validated accuracy.

# Seed for the estimators that draw random numbers, so that re-running the
# notebook reproduces the same CV scores (same value as the train/test split).
RANDOM_STATE = 0

pipe = Pipeline([
    ('vect', TfidfVectorizer(strip_accents='ascii', max_features=30_000, stop_words='english')),
    # Placeholder only: the grid below substitutes each candidate for this step.
    ('model', RandomForestClassifier(random_state=RANDOM_STATE))
])

# The only "hyper-parameter" searched is the estimator itself; every
# candidate is used with its default settings unless stated otherwise.
search_grid = {
    'model': [
            RandomForestClassifier(random_state=RANDOM_STATE),
            XGBClassifier(random_state=RANDOM_STATE),
            MultinomialNB(),
            # NOTE(review): `multi_class` is deprecated in recent scikit-learn
            # releases -- confirm against the pinned version before upgrading.
            LogisticRegression(multi_class='multinomial', solver='newton-cg'),
            LinearSVC(random_state=RANDOM_STATE),
            LogisticRegression(),
    ]
}

grid = GridSearchCV(pipe, search_grid, n_jobs=6, verbose=2, cv=5,
                    scoring='accuracy')

grid.fit(X_train, y_train)

# One row per candidate, holding fit/score timings and per-fold test scores.
results_cv = pd.DataFrame(grid.cv_results_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [None]:
# Persist the raw cross-validation table, then log every candidate as its
# own Comet experiment.
results_cv.to_csv('../models/sk_clf_vect_guess')

for _idx, row in results_cv.iterrows():
    # SECURITY: no API key in the notebook. Comet picks the key up from the
    # COMET_API_KEY environment variable or its config file when `api_key`
    # is not passed. The key that used to be committed here should be
    # revoked and rotated.
    exp = Experiment(workspace="henrystoll",
                     project_name="nlp-token-sklearn-clf")
    try:
        for k, v in row.items():
            # Aggregated timings/scores become metrics, searched parameters
            # become experiment parameters; per-split columns are skipped.
            if k.startswith(('mean_', 'std_')):
                exp.log_metric(k, v)
            elif k.startswith('param_'):
                exp.log_parameter(k, v)
    finally:
        # Always close the experiment, even if a logging call raises.
        exp.end()


In [None]:
# Score the held-out test set with the fitted grid search and print the
# per-class precision / recall / F1 report.
y_pred = grid.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Side-by-side view of training cost and CV accuracy for each candidate.
summary_columns = ['mean_fit_time', 'params', 'mean_test_score']
results_cv[summary_columns]