In [1]:
import pandas as pd
# Read the training and test tables (UTF-8 encoded CSV files in the
# working directory), training file first.
df_train, df_test = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('train.csv', 'test.csv')
)

In [2]:
import re
def preprocessor(text):
    """Lowercase ``text`` and squash each run of non-word characters to one space.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The lowercased text in which every maximal run of characters that
        are not letters, digits or underscores is replaced by a single
        space. Leading/trailing runs become a space too (nothing is
        stripped).
    """
    # Raw string literal: '\W' inside a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text = re.sub(r'\W+', ' ', text.lower())
    return text

In [3]:
# Clean the training text with preprocessor(), overwriting the 'text'
# column in place. Only df_train is cleaned here; df_test is left as loaded.
df_train['text'] = df_train['text'].apply(preprocessor)

In [4]:
from sklearn.preprocessing import LabelEncoder
# Replace the 'author' column with the integer codes produced by a
# LabelEncoder. class_le stays in scope, so the codes can be mapped back
# to the original labels (e.g. with class_le.inverse_transform).
# NOTE(review): the column is overwritten in place, so re-running this cell
# fits the encoder on the already-encoded values and the original author
# labels are no longer available from class_le - reload df_train first.
class_le = LabelEncoder()
df_train['author'] = class_le.fit_transform(df_train['author'].values)

In [5]:
from sklearn.model_selection import train_test_split
# Features are the values of the 'text' column, targets the values of the
# 'author' column.
X = df_train['text'].values
y = df_train['author'].values

# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions equal in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

In [6]:
import nltk
# Fetch the NLTK data packages used by this notebook. 'stopwords' feeds the
# list built below; 'wordnet' is presumably what WordNetLemmatizer relies
# on - confirm against the NLTK docs.
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords as sw
# English stop-word list; offered to the vectorizer via 'vect__stop_words'
# in the hyper-parameter search.
stopwords = sw.words('english')

[nltk_data] Downloading package stopwords to /home/gg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [8]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    """Whitespace-tokenize ``text`` and stem every token.

    Stemming is done with the module-level ``porter`` stemmer. Returns the
    stems as a list, in the same order as the tokens appear in ``text``.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [9]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
def tokenizer_lemma(text):
    """Whitespace-tokenize ``text`` and lemmatize every token.

    Lemmatization is done with the module-level ``lemma`` lemmatizer using
    its default arguments. Returns the lemmas as a list, in token order.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemma.lemmatize(token))
    return lemmas

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# The text has already been lower-cased and cleaned by preprocessor(), so
# the vectorizer's own lowercasing / preprocessing is switched off.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Search space: 1 * 1 * 3 tokenizers * 3 norms * 2 idf settings * 2 penalties
# * 1 C value = 36 combinations, of which the randomized search samples 25.
param_grid = {'vect__ngram_range': [(1, 1)],
              'vect__stop_words': [stopwords],
              'vect__tokenizer': [tokenizer,
                                  tokenizer_porter,
                                  tokenizer_lemma],
              'vect__norm': ['l1', 'l2', None],
              'vect__use_idf': [False, True],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [0.0005]}

pipe = Pipeline([
    ('vect', tfidf),
    # The search space contains both 'l1' and 'l2' penalties. liblinear
    # supports both, so name it explicitly instead of relying on the
    # library default, which differs between scikit-learn versions and may
    # not accept 'l1'.
    ('clf', LogisticRegression(random_state=42, solver='liblinear'))
])

# random_state fixes which 25 of the 36 combinations get sampled, so a
# Restart & Run All reproduces the same candidates and the same best model.
model = RandomizedSearchCV(pipe, param_grid,
                           scoring='neg_log_loss',
                           cv=10,
                           n_iter=25,
                           verbose=1,
                           n_jobs=-1,
                           random_state=42)
model.fit(X_train, y_train)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  1.7min finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...alty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=25, n_jobs=-1,
          param_distributions={'vect__ngram_range': [(1, 1)], 'vect__norm': ['l1', 'l2', None], 'vect__stop_words': [[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u"you're", u"you've", u"you'll", u"you'd", u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'... 0x7fe1ed8c3398>], 'vect__use_idf': [False, True], 'clf__C': [0.0005], 'clf__penalty': ['l1', 'l2']},
          pre_dispatch='2*

In [11]:
# Show the pipeline configured with the best parameter combination found
# by the search.
print(model.best_estimator_)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=None, preprocessor=None, smooth_idf=True...alty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])


In [12]:
# Show the sampled parameter combination with the best cross-validated score.
print(model.best_params_)

{'vect__ngram_range': (1, 1), 'vect__norm': None, 'vect__stop_words': [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u"you're", u"you've", u"you'll", u"you'd", u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u"she's", u'her', u'hers', u'herself', u'it', u"it's", u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u"that'll", u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'the

In [13]:
# Best cross-validated score. The search was set up with
# scoring='neg_log_loss', so this is a negated log loss: closer to 0 is better.
print(model.best_score_)

-0.7873685140334924


In [14]:
# Predict on the training split itself (the data passed to model.fit) to
# compare against the hold-out results further down.
y_pred_train = model.best_estimator_.predict(X_train)

In [15]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the training split; in scikit-learn's layout rows are
# the true classes and columns the predicted classes.
conf_matrix_train = confusion_matrix(y_true=y_train, y_pred=y_pred_train)
print(conf_matrix_train)

[[4948  250  332]
 [ 549 3211  184]
 [ 626  188 3417]]


In [16]:
from sklearn.metrics import accuracy_score
# Fraction of training-split rows whose class was predicted correctly.
accuracy_score(y_train, y_pred_train)

0.8446552353155783

In [17]:
# Predict on the hold-out split. X_test is the 30% carved out of train.csv
# by train_test_split, not the contents of test.csv (df_test).
y_pred_test = model.best_estimator_.predict(X_test)

In [18]:
# Confusion matrix on the hold-out split; in scikit-learn's layout rows are
# the true classes and columns the predicted classes.
conf_matrix_test = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
print(conf_matrix_test)

[[2018  160  192]
 [ 311 1257  123]
 [ 340  133 1340]]


In [19]:
from sklearn.metrics import accuracy_score
# Fraction of hold-out rows whose class was predicted correctly.
accuracy_score(y_test, y_pred_test)

0.785665645216207

In [21]:
import pickle
import os
# Persist the best pipeline (vectorizer + classifier) found by the search.
dest_dir = './pkl_objects'
# Create the output folder on demand so a fresh checkout does not fail here.
# An isdir() check is used instead of makedirs(exist_ok=True) so the cell
# also runs on Python 2.
if not os.path.isdir(dest_dir):
    os.makedirs(dest_dir)
# 'with' guarantees the file handle is flushed and closed, even if pickling
# raises part-way through.
with open(os.path.join(dest_dir, 'tfidf_1gram_lr_pipe.pkl'), 'wb') as pkl_file:
    # protocol=2 is the highest pickle protocol Python 2 can read.
    # NOTE: functions are pickled by reference, so whichever tokenizer
    # function the pipeline uses must be importable/defined under the same
    # name when this file is loaded.
    pickle.dump(model.best_estimator_, pkl_file, protocol=2)