<a href="https://colab.research.google.com/github/kajsanorin/nlp-hackathon/blob/master/spooky_author.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spooky Author Identification
### In this task we will identify which of three classic horror fiction authors (Edgar Allan Poe, Mary Shelley, H. P. Lovecraft) wrote a given sentence.

### Import numpy, pandas and scikit-learn - almost all the libraries you need in ML :)


In [1]:
!pip install scikit-learn -U

Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.6/dist-packages (0.22)


In [0]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC

### Import data from GitHub

In [3]:
# Location of the labelled training sentences in the hackathon repository.
url = "https://raw.githubusercontent.com/kajsanorin/nlp-hackathon/master/train.csv"

# Read the CSV straight from the URL into a DataFrame.
data = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows (id, text, author).
data.head(n=5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


### Check how many authors we have


In [4]:
# Distinct author labels present in the training data.
data["author"].unique()

array(['EAP', 'HPL', 'MWS'], dtype=object)

### Create a new column with numbers representing the authors

In [0]:
# Map each author label to an integer code and store the codes in a new
# column; `le` is kept so the codes can be mapped back to labels later.
le = preprocessing.LabelEncoder()
data["author_encoded"] = le.fit_transform(data["author"])

In [6]:
# Confirm that the author_encoded column is now part of the frame.
data.head(n=5)

Unnamed: 0,id,text,author,author_encoded
0,id26305,"This process, however, afforded me no means of...",EAP,0
1,id17569,It never once occurred to me that the fumbling...,HPL,1
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1


### Split the data into train and test sets

In [0]:
# Hold out 20% of the sentences for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["author_encoded"],
    test_size=0.2,
    random_state=42,
)

### Make a TF-IDF model. Try playing around with these parameters:

```
ngram_range
strip_accents
min_df
lowercase
```

In [0]:
# TF-IDF features over single words and word pairs.
tfidf_model = TfidfVectorizer(
    lowercase=True,           # fold case before tokenizing
    strip_accents='unicode',  # normalize accented characters
    ngram_range=(1, 2),       # unigrams and bigrams
    min_df=0.0001,            # float => minimum document-frequency proportion
)

# Learn the vocabulary / IDF weights on the training sentences only, and
# vectorize them in the same pass.
transformed_train = tfidf_model.fit_transform(X_train)

### Make and train a Support Vector Machine classifier to identify which text strings belong to which author

In [9]:
# Linear SVM on the TF-IDF features; random_state is fixed for
# reproducibility and tol is tightened from the library default.
clf = LinearSVC(tol=1e-5, random_state=0)

# fit() returns the estimator itself, so the cell displays its settings.
clf.fit(transformed_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=1e-05,
          verbose=0)

In [0]:
# Vectorize the held-out sentences with the vocabulary learned on the
# training set (transform only -- no re-fitting on test data).
transformed_test = tfidf_model.transform(raw_documents=X_test)

### Predict the author of the "unknown" sentences

In [0]:
# Predicted author codes for every held-out sentence.
y_pred = clf.predict(X=transformed_test)

### Let's see how good we are... :)

In [12]:
# Ground-truth author codes for the held-out sentences; `y_true` is reused
# by the confusion-matrix cells that follow.
y_true = y_test

# Fraction of held-out sentences whose author was predicted correctly.
accuracy_score(y_true, y_pred)

0.8398876404494382

In [13]:
# Raw counts per (true author, predicted author) pair: rows are the true
# classes, columns are the predicted classes.
confusion_matrix(y_true, y_pred)

array([[1342,  105,  123],
       [ 138,  865,   68],
       [ 131,   62, 1082]])

In [14]:
# Same matrix normalized over the true labels, so each row sums to 1 and the
# diagonal gives the per-author recall.
confusion_matrix(y_true=y_true, y_pred=y_pred, normalize='true')

array([[0.85477707, 0.06687898, 0.07834395],
       [0.12885154, 0.8076564 , 0.06349206],
       [0.1027451 , 0.04862745, 0.84862745]])