In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from functions import prepare_data

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, log_loss
from sklearn.pipeline import Pipeline

In [2]:
# Load the prepared dataset through the project helper.
# NOTE(review): assumed to expose 'text' and 'party' columns, as indexed below.
data = prepare_data()

# Hold-out split, stratified on the party label so both partitions keep the
# same class balance; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['party'],
    stratify=data['party'],
    random_state=1,
)

# NOTE(review): none of the cells visible here read `vectorizer` -- the pipeline
# cell constructs its own CountVectorizer. Kept in case a later cell uses it.
vectorizer = CountVectorizer()

In [3]:
# Hyperparameter grid for the random forest:
# 3 * 5 * 5 * 4 = 300 candidate combinations.
rf_parameters = dict(
    n_estimators=[100, 300, 500],
    max_depth=[3, 5, 8, 15, 25],
    min_samples_split=[2, 5, 10, 15, 50],
    min_samples_leaf=[1, 2, 5, 10],
)

# Base estimator handed to the grid search; seeded for repeatable forests.
rf_clf = RandomForestClassifier(random_state=1)

In [4]:
# Exhaustive 3-fold cross-validated search over `rf_parameters`, parallelised
# across all available cores (n_jobs=-1).
# No `scoring` is passed, so candidates are ranked by the estimator's own score.
grid_rf = GridSearchCV(rf_clf, rf_parameters, n_jobs=-1, cv=3)

In [5]:
# Two-stage model: bag-of-words counts feeding the grid-searched random forest.
# NOTE(review): because the vectorizer sits *before* the grid-search step, it is
# fitted once on all training texts and only the forest is cross-validated, so
# the vocabulary is learned from the validation folds too. Nesting the pipeline
# inside GridSearchCV would avoid that -- confirm whether this is intended.
pipe = Pipeline(steps=[
    ('process', CountVectorizer()),
    ('grid-cv', grid_rf),
])

In [6]:
# Fit the vectorizer and run the full grid search on the training split.
# With 300 candidates x 3 folds this is the expensive cell of the notebook.
pipe.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('process',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=...
                                                               min_weight_fraction_leaf=0.0,
                                                               n_estimators='warn',
                                                               n_jobs=None,
                                                               oob_score=False,
   

In [8]:
# Probability of the class in column 1 of predict_proba for each test text.
# NOTE(review): taking a single column assumes `party` is binary -- confirm.
y_prob = pipe.predict_proba(X_test)[:, 1]

# Hard class predictions for the same held-out texts.
y_pred = pipe.predict(X_test)

In [9]:
# Log loss of the column-1 probabilities against the held-out labels (lower is better).
log_loss(y_true=y_test, y_pred=y_prob)

0.6697839370894116

In [10]:
# Score of the fitted pipeline on the held-out split.
# NOTE(review): with no `scoring` set on the grid search this should be the
# classifier's default score (mean accuracy) -- confirm.
pipe.score(X=X_test, y=y_test)

0.6117216117216118