In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string

%matplotlib inline

eng_stopwords = set(stopwords.words("english"))
pd.options.mode.chained_assignment = None

import warnings
warnings.filterwarnings('ignore')


import re

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split,KFold

from sklearn.metrics import confusion_matrix,roc_auc_score,log_loss, accuracy_score




In [2]:
train_data = pd.read_csv("train.csv")
print('Train Data \n',train_data.head(),'\n')


Train Data 
         id                                               text author
0  id26305  This process, however, afforded me no means of...    EAP
1  id17569  It never once occurred to me that the fumbling...    HPL
2  id11008  In his left hand was a gold snuff box, from wh...    EAP
3  id27763  How lovely is spring As we looked from Windsor...    MWS
4  id12958  Finding nothing else, not even gold, the Super...    HPL 



In [3]:
print('Class Distribution in Data\n', train_data['author'].value_counts())

Class Distribution in Data
 EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64


In [4]:
## Text/Data Cleaning

def clean_text(df):
    ps = PorterStemmer()
    corpus = []
    for i in range(0, len(df)):        
        review = re.sub('[^A-Za-z0-9]'," ",df['text'][i])
        review = word_tokenize(review)        
        review = [word for word in review if word.lower() not in set(stopwords.words('english')) \
                                                     and not word in string.punctuation \
                                                     and not word in ("''","'s","``")]
        review = [ps.stem(word) for word in review]
        review = ' '.join(review)
        corpus.append(review)
    
    return corpus
    
    

In [5]:

train_data['clean_text'] = clean_text(train_data)

train_data['author_num'] = train_data.author.map({'EAP':0, 'HPL':1, 'MWS':2})
print(train_data.head())

        id                                               text author  \
0  id26305  This process, however, afforded me no means of...    EAP   
1  id17569  It never once occurred to me that the fumbling...    HPL   
2  id11008  In his left hand was a gold snuff box, from wh...    EAP   
3  id27763  How lovely is spring As we looked from Windsor...    MWS   
4  id12958  Finding nothing else, not even gold, the Super...    HPL   

                                          clean_text  author_num  
0  process howev afford mean ascertain dimens dun...           0  
1                never occur fumbl might mere mistak           1  
2  left hand gold snuff box caper hill cut manner...           0  
3  love spring look windsor terrac sixteen fertil...           2  
4  find noth els even gold superintend abandon at...           1  


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_data['clean_text'], train_data['author_num'],\
                                                                 test_size=0.3, random_state=123)

In [8]:
from sklearn import metrics
tfidf_vec = TfidfVectorizer(token_pattern=r'\w{1,}', ngram_range=(1, 10), analyzer = 'word',max_features=5000)

x_train_tfidf = tfidf_vec.fit_transform(X_train).toarray()
x_test_tfidf = tfidf_vec.fit_transform(X_test).toarray()

## Model Fitting

log_reg = LogisticRegression()

log_reg.fit(x_train_tfidf, y_train)

# Prediction
y_pred = log_reg.predict(x_test_tfidf)

# Model Evaluation
print("\nTrain Data Confusion matrix\n", metrics.confusion_matrix(y_test, y_pred))
print("\nAccuracy on Test Data %: \n", accuracy_score(y_test, y_pred)*100)



Train Data Confusion matrix
 [[1590  437  326]
 [ 949  501  231]
 [1122  366  352]]

Accuracy on Test Data %: 
 41.59005788219272


In [18]:
## Using Grid Search Cross validation 
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [1, 10, 100, 1000]}#, 'kernel': ['rbf']}

GS_clf = GridSearchCV(log_reg, param_grid=param_grid, scoring='accuracy', cv=5)
GS_clf.fit(x_train_tfidf, y_train)

## Prediction
GS_y_pred = GS_clf.predict(x_test_tfidf)

## Model Evaluation
print("\nTrain Data Confusion matrix\n", metrics.confusion_matrix(y_test, GS_y_pred))
print("\nAccuracy on Test Data %: \n", accuracy_score(y_test, GS_y_pred)*100)


Train Data Confusion matrix
 [[1180  693  480]
 [ 703  675  303]
 [ 864  518  458]]

Accuracy on Test Data %: 
 39.376915219611845


ValueError: 'confusion_matrix' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']