In [21]:
import pandas as pd
# Read both CSV files as UTF-8 into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('train.csv', 'test.csv')
)

In [22]:
import re
def preprocessor(text):
    """Lower-case ``text`` and replace each run of non-word characters with one space.

    Parameters
    ----------
    text : str
        Raw input string.

    Returns
    -------
    str
        The lower-cased string in which every run of characters matched by
        ``\\W`` (anything that is not a letter, digit or underscore) has been
        replaced by a single space. Leading/trailing runs also become a
        single space; the result is not stripped.
    """
    # Raw string: '\W' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r'\W+', ' ', text.lower())

In [23]:
# Clean every training text with `preprocessor` and overwrite the 'text'
# column with the result.
df_train['text'] = df_train['text'].apply(preprocessor)

In [24]:
from sklearn.preprocessing import LabelEncoder
# Replace the 'author' column with integer class codes. `class_le` keeps the
# fitted mapping, so codes can be turned back into the original labels later.
class_le = LabelEncoder()
df_train['author'] = class_le.fit_transform(df_train['author'].values)

In [25]:
from sklearn.model_selection import train_test_split
# Features are the cleaned texts; targets are the encoded author labels.
X = df_train['text'].values
y = df_train['author'].values

# Hold out 30% of the rows, stratified on the label so both parts keep the
# same class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [26]:
import nltk
# Fetch the NLTK resources used below: the stop-word corpus and WordNet
# (the latter is needed by the WordNetLemmatizer-based tokenizer).
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords as sw
# English stop-word list; passed to the CountVectorizer through the grid.
stopwords = sw.words('english')

[nltk_data] Downloading package stopwords to /home/luce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/luce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Tokenize
Tokenize your sentences into separate words first.

In [27]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

#### Stemming and Lemming helper functions
These functions apply stemming and lemmatization, which reduce your words to their root form.
Runner -> run
Running -> run

In [28]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token.

    Uses the module-level ``porter`` stemmer instance.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [29]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
def tokenizer_lemma(text):
    """Split ``text`` on whitespace and return the lemma of each token.

    Uses the module-level ``lemma`` WordNet lemmatizer; ``lemmatize`` is
    called without a part-of-speech argument, so its default applies.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemma.lemmatize(token))
    return lemmas

#### Setup
We will use a Pipeline to better organize our code. A Pipeline will allow us to establish our workflow easily.
We will also use GridSearch or RandomSearch with k-fold to find optimized parameters for our model. We will use neg_log_loss instead of accuracy because that is what Kaggle is using to grade submissions.

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer. The texts were already lower-cased and stripped of
# punctuation by `preprocessor`, so the vectorizer's own normalisation is
# switched off; the tokenizer itself is chosen by the grid search below.
count = CountVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Hyper-parameter grid: 1 n-gram range x 1 stop-word list x 3 tokenizers
# x 2 penalties x 2 values of C = 12 candidates.
param_grid = {'vect__ngram_range': [(1,1)],
              'vect__stop_words': [stopwords],
              'vect__tokenizer': [tokenizer,
                                  tokenizer_porter,
                                  tokenizer_lemma],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [0.001, 0.01]}

pipe = Pipeline([
                 ('vect', count),
                 # The solver is pinned because the grid includes the 'l1'
                 # penalty. Newer scikit-learn releases default to 'lbfgs',
                 # which does not support 'l1', so those candidates would fail
                 # to fit. 'liblinear' supports both penalties and is the
                 # solver shown in this notebook's recorded output.
                 # NOTE(review): recent scikit-learn versions may warn about
                 # 'liblinear' on multiclass targets - confirm against the
                 # installed version.
                 ('clf', LogisticRegression(solver='liblinear', random_state=42))
                ])

# 10-fold cross-validated search scored by negative log loss (higher is
# better), using all available cores. With the default refit=True the best
# candidate is refit on the whole of X_train.
model = GridSearchCV(pipe, param_grid, scoring='neg_log_loss', cv=10, verbose=1, n_jobs=-1)
model.fit(X_train, y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  9.6min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...alty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__tokenizer': [<function tokenizer at 0x7eff507b2840>, <function tokenizer_porter at 0x7eff507b2ea0>, <function tokenizer_lemma at 0x7eff4fa6e268>], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your... "wouldn't"]], 'clf__C': [0.001, 0.01], 'vect__ngram_range': [(1, 1)], 'clf__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_t

In [31]:
# Show the pipeline that was refit with the best parameter combination.
print(model.best_estimator_)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'me'...alty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [32]:
# Show the winning parameter combination from the grid.
print(model.best_params_)

{'vect__tokenizer': <function tokenizer_porter at 0x7eff507b2ea0>, 'vect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'mos

In [33]:
# Mean cross-validated score of the best candidate. The scorer is
# 'neg_log_loss', so the value is negative and closer to 0 is better.
print(model.best_score_)

-0.8464387995927873


#### Check for overfitting
When you notice that the training data has a higher accuracy than the test data, that is one indicator of overfitting.
One way to address this is to tune the C parameter of your logistic regression model. The lower the C parameter, the more strongly the model's weights are penalized, because C is the inverse of the regularization strength.

You can use the confusion matrix to get true and false positives of your model.

In [34]:
# Predict on the training split with the refit best pipeline.
y_pred_train = model.best_estimator_.predict(X_train)

In [35]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the training split: rows are the true classes,
# columns are the predicted classes.
conf_matrix_train = confusion_matrix(y_train, y_pred_train)
print(conf_matrix_train)

[[5032  213  285]
 [1252 2444  248]
 [1380  206 2645]]


In [36]:
from sklearn.metrics import accuracy_score
# Accuracy on the training split, to compare against the held-out split.
accuracy_score(y_train, y_pred_train)

0.7384896023349142

In [37]:
# Predict on the held-out 30% split with the refit best pipeline.
y_pred_test = model.best_estimator_.predict(X_test)

In [38]:
# Confusion matrix for the held-out split: rows are the true classes,
# columns are the predicted classes.
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
print(conf_matrix_test)

[[2108  114  148]
 [ 601  957  133]
 [ 639  125 1049]]


In [39]:
from sklearn.metrics import accuracy_score
# Accuracy on the held-out split; a value well below the training accuracy
# would point to overfitting.
accuracy_score(y_test, y_pred_test)

0.700374531835206

#### Save the model
Save the model with pickle so that you can reuse this pickle object later for more fun! Ensemble learning!

WARNING: Loading a pickle file can execute arbitrary code, so only unpickle files you built yourself or that come from a source you trust.

In [40]:
import pickle
import os
# Persist the refit best pipeline to ./pkl_objects/bag_1gram_lr_pipe.pkl.
# Pickle stores plain functions by reference (module + name), so the
# tokenizer function held by the pipeline must be defined under the same name
# wherever this file is later unpickled.
pkl_dir = './pkl_objects'
# open(..., 'wb') raises FileNotFoundError if the target directory is missing.
os.makedirs(pkl_dir, exist_ok=True)
# The context manager guarantees the file is flushed and closed, even if
# pickling raises part-way through.
with open(os.path.join(pkl_dir, 'bag_1gram_lr_pipe.pkl'), 'wb') as pkl_file:
    pickle.dump(model.best_estimator_, pkl_file, protocol=4)