In [1]:
import zipfile
with zipfile.ZipFile('train.zip', 'r') as unzipped_file:
    unzipped_file.extractall()
with zipfile.ZipFile('test.zip', 'r') as unzipped_file:
    unzipped_file.extractall()

In [2]:
import pandas as pd
df_train = pd.read_csv('train.csv', encoding='utf-8')
df_test = pd.read_csv('test.csv', encoding='utf-8')

In [3]:
df_train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
df_train.shape

(19579, 3)

In [5]:
import re
def preprocessor(text):
    text = re.sub('[\W]+', ' ', text.lower())
    return text

In [6]:
df_train['text'] = df_train['text'].apply(preprocessor)

In [7]:
#df_train = df_train.drop(['id'], axis=1)

In [8]:
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
df_train['author'] = class_le.fit_transform(df_train['author'].values)

In [9]:
from sklearn.model_selection import train_test_split
X, y = df_train.iloc[:, df_train.columns.get_loc('text')].values, \
       df_train.iloc[:, df_train.columns.get_loc('author')].values
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.3,
                                                   random_state=42,
                                                   stratify=y)

In [10]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords as sw
stopwords = sw.words('english')

[nltk_data] Downloading package stopwords to /home/luce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/luce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
def tokenizer(text):
    return text.split()

In [12]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [13]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
def tokenizer_lemma(text):
    return [lemma.lemmatize(word) for word in text.split()]

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=False,
                       preprocessor=None)

param_grid = {'vect__ngram_range': [(2,2)],
              'vect__stop_words': [stopwords, None],
              'vect__tokenizer': [tokenizer,
                                  tokenizer_porter,
                                  tokenizer_lemma],
              'vect__norm': ['l1', 'l2', None],
              'vect__use_idf':[False, True],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [0.1, 1.0, 10.0, 100.0]}

lr_tfidf = Pipeline([
                   ('vect', tfidf),
                   ('clf', LogisticRegression(random_state=42))
                   ])

#gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_lr_tfidf = RandomizedSearchCV(lr_tfidf, param_grid, scoring='neg_log_loss', cv=5, n_iter=25, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...alty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=25, n_jobs=-1,
          param_distributions={'vect__ngram_range': [(2, 2)], 'vect__tokenizer': [<function tokenizer at 0x7fe30cea6d90>, <function tokenizer_porter at 0x7fe30ced8598>, <function tokenizer_lemma at 0x7fe30cea6f28>], 'vect__norm': ['l1', 'l2', None], 'vect__use_idf': [False, True], 'vect__stop_words': [['i', '..., 'weren', 'won', 'wouldn'], None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [0.1, 1.0, 10.0, 100.0]},
          pre_dispatch='2*n

In [15]:
print(gs_lr_tfidf.best_estimator_)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), norm=None, preprocessor=None, smooth_idf=True,
 ...alty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [16]:
print(gs_lr_tfidf.best_params_)

{'vect__ngram_range': (2, 2), 'vect__tokenizer': <function tokenizer_lemma at 0x7fe30cea6f28>, 'vect__norm': None, 'vect__use_idf': True, 'vect__stop_words': None, 'clf__penalty': 'l2', 'clf__C': 1.0}


In [17]:
print(gs_lr_tfidf.best_score_)

-0.720345628665


In [18]:
import pickle
import os
pickle.dump(gs_lr_tfidf,
           open(os.path.join('./pkl_objects', '2gram_tfidf_lr.pkl'), 'wb'),
           protocol=4)