In [104]:
import base64
import numpy as np
import pandas as pd

# Plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import cufflinks as cf

# Other imports
from collections import Counter
from scipy.misc import imread
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from matplotlib import pyplot as plt
%matplotlib inline

In [124]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/train.csv')

In [125]:
# Preview the first rows of the raw data (id, text, author).
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [108]:
# (rows, columns) of the loaded frame.
df.shape

(19579, 3)

In [109]:
# Author codes used in the 'author' column -> display names for the x axis.
z = {'EAP': 'Edgar Allan Poe', 'MWS': 'Mary Shelley', 'HPL': 'HP Lovecraft'}

# Labels and bar heights must come from the SAME value_counts() result.
# value_counts() is ordered by descending count, whereas .unique() is ordered by
# first appearance in the frame, so pairing the two can attach a count to the
# wrong author.
author_counts = df['author'].value_counts()
# Fall back to the raw code if a code has no display name in z.
author_labels = [z.get(code, code) for code in author_counts.index]

trace = [go.Bar(
    x=author_labels,
    y=author_counts.values,
    marker=dict(colorscale='Viridis',
                color=author_counts.values,
                line=dict(color='rgb(8,48,107)', width=1.5)),
    text='Text entries attributed to Author'
    )]
layout = go.Layout(title='Text Distribution')
fig = go.Figure(data=trace, layout=layout)
py.iplot(fig)

**Plot the most frequent words**

In [126]:
# Whitespace-tokenise every text, flatten all tokens into one Series, and count
# how often each distinct token occurs (most frequent first).
words = (
    df['text']
    .str.split(expand=True)
    .stack()
    .value_counts()
)
words.head()

the    33296
of     20851
and    17059
to     12615
I      10382
dtype: int64

In [111]:
# Slice once so labels, bar heights and colour values all come from the same rows.
top_raw_words = words.iloc[:51]

trace = [
    go.Bar(
        x=top_raw_words.index,
        y=top_raw_words.values,
        marker=dict(colorscale='Viridis', color=top_raw_words.values),
        text='Word counts',
    )
]

# Tilt the tick labels so the words on the x axis do not overlap.
layout = go.Layout(
    title='Top Words Distribution (Before cleaning)',
    xaxis=dict(tickangle=-45),
)
fig = go.Figure(data=trace, layout=layout)
py.iplot(fig)

**Cleaning words (Remove Punctuation and Stopwords)**

In [133]:
import string
import nltk
# Fetches the NLTK stop-word corpus, which requires network access. If the download
# fails, the later calls to stopwords.words('english') only work when the corpus is
# already available locally.
nltk.download('stopwords')
from nltk.corpus import stopwords 

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


**Remove Punctuation**

In [128]:
# Built once: the table depends only on string.punctuation, so rebuilding it for
# every row passed through .apply() is wasted work.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Characters are removed, not replaced by spaces, so "don't" becomes "dont"
    and "well-known" becomes "wellknown". Only the ASCII punctuation listed in
    ``string.punctuation`` is affected.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [131]:
# Overwrites the 'text' column in place: the raw text is no longer available from df
# after this cell (reload the CSV to get it back).
df['text'] = df['text'].apply(remove_punctuation)

**Remove Stopwords**

In [139]:
# Cache of stop-word sets keyed by language, filled on first use.
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Return the NLTK stop words for ``language`` as a frozenset, loading them once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def remove_stopwords(text):
    """Lower-case ``text`` and drop every English stop word.

    The stop-word list is fetched from NLTK once and kept as a set, instead of
    calling ``stopwords.words('english')`` (and scanning the returned list) for
    every single word of every row.

    Parameters
    ----------
    text : str
        Whitespace-separated text.

    Returns
    -------
    str
        The remaining words, lower-cased and joined by single spaces.
    """
    english = _stopword_set('english')
    # Lower-case each word once and reuse it for both the test and the output.
    lowered = (word.lower() for word in text.split())
    return " ".join(word for word in lowered if word not in english)

In [140]:
# Lower-cases the text and drops English stop words, overwriting 'text' in place.
df['text'] = df['text'].apply(remove_stopwords)

**Stemming**

In [142]:
from nltk.stem.snowball import SnowballStemmer
# One shared English Snowball stemmer instance, used by stemming() below.
stemmer = SnowballStemmer("english")
def stemming(text):
    """Stem every whitespace-separated word of ``text`` with the shared ``stemmer``.

    Returns the stemmed words joined by single spaces.
    """
    return " ".join(map(stemmer.stem, text.split()))

In [143]:
# Replaces 'text' with its stemmed form in place. Re-running this cell stems the
# already-stemmed text a second time, so run it once per fresh load of df.
df['text'] = df['text'].apply(stemming)

In [145]:
# Token frequencies of the cleaned text: split on whitespace, flatten all tokens
# into one Series, then count each distinct token (most frequent first).
word = (
    df['text']
    .str.split(expand=True)
    .stack()
    .value_counts()
)
word.head()

one      1677
upon     1411
could    1316
would    1241
time      874
dtype: int64

In [147]:
# Slice once so labels, bar heights and colour values all come from the same rows.
top_clean_words = word.iloc[:51]

trace = [
    go.Bar(
        x=top_clean_words.index,
        y=top_clean_words.values,
        marker=dict(colorscale='Viridis', color=top_clean_words.values),
        text='Word counts',
    )
]

# Tilt the tick labels so the words on the x axis do not overlap.
layout = go.Layout(
    title='Top Words Distribution (After cleaning)',
    xaxis=dict(tickangle=-45),
)
fig = go.Figure(data=trace, layout=layout)
py.iplot(fig)

**Applying Latent Dirichlet Allocation**

In [151]:
from sklearn.feature_extraction.text import CountVectorizer
# Build the document-term matrix. max_df=0.9 drops terms that occur in more than 90%
# of the documents; min_df=2 drops terms that occur in fewer than 2 documents.
cv = CountVectorizer(max_df=0.9, min_df=2)
# Alternative if remove_stopwords() above was NOT applied: let the vectorizer filter
# stop words itself (the keyword is `stop_words`, not `stopwords`).
#cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
dtm = cv.fit_transform(df['text'])
dtm

<19579x10074 sparse matrix of type '<class 'numpy.int64'>'
	with 244037 stored elements in Compressed Sparse Row format>

In [152]:
from sklearn.decomposition import LatentDirichletAllocation
# Topic model with 7 topics; random_state pins the seed so repeated runs use the
# same random initialisation.
LDA = LatentDirichletAllocation(n_components=7, random_state=42)

In [153]:
# Fit the topic model on the document-term matrix built above.
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [154]:
# Number of terms in the fitted vocabulary.
len(cv.get_feature_names())

10074

In [155]:
# Check which container type get_feature_names() returns.
type(cv.get_feature_names())

list

In [156]:
# Spot-check a single vocabulary entry by its column index.
cv.get_feature_names()[5000]

'lark'

In [157]:
# components_ has one row per topic, so this should match n_components.
len(LDA.components_)

7

In [162]:
# Fetch the vocabulary once: get_feature_names() builds the full term list on every
# call, so calling it inside the loop repeats that work for each printed word.
# NOTE: newer scikit-learn releases replace this method with get_feature_names_out().
feature_names = cv.get_feature_names()

# Term weights of the first topic.
single_topic = LDA.components_[0]
# argsort() is ascending, so the last 20 indices are the 20 highest-weighted terms,
# printed from the 20th-highest up to the highest.
top_twenty_words = single_topic.argsort()[-20:]
for index in top_twenty_words:
    print(feature_names[index])

great
call
place
come
thought
human
tone
never
mani
would
death
came
found
like
voic
could
thing
said
night
upon


**Check the result of topic labeling**

In [166]:
# Fetch the vocabulary once instead of rebuilding the full term list for every word
# of every topic inside the comprehension.
# NOTE: newer scikit-learn releases replace this method with get_feature_names_out().
feature_names = cv.get_feature_names()

for i, topic in enumerate(LDA.components_):
    print(f'The top 20 words for topic #{i}')
    # argsort() is ascending: the list runs from the 20th-highest-weighted term to
    # the highest-weighted one.
    print([feature_names[index] for index in topic.argsort()[-20:]])
    print('\n')


The top 20 words for topic #0
['great', 'call', 'place', 'come', 'thought', 'human', 'tone', 'never', 'mani', 'would', 'death', 'came', 'found', 'like', 'voic', 'could', 'thing', 'said', 'night', 'upon']


The top 20 words for topic #1
['direct', 'stood', 'place', 'hill', 'eye', 'toward', 'day', 'light', 'two', 'side', 'feet', 'turn', 'tree', 'could', 'hous', 'window', 'wall', 'open', 'one', 'upon']


The top 20 words for topic #2
['shall', 'natur', 'raymond', 'word', 'father', 'thought', 'said', 'hope', 'never', 'even', 'must', 'might', 'heart', 'feel', 'one', 'yet', 'could', 'life', 'love', 'would']


The top 20 words for topic #3
['observ', 'first', 'matter', 'appear', 'mr', 'well', 'mean', 'may', 'made', 'howev', 'much', 'would', 'said', 'time', 'us', 'great', 'could', 'say', 'one', 'upon']


The top 20 words for topic #4
['strang', 'still', 'made', 'two', 'place', 'knew', 'seem', 'three', 'came', 'room', 'must', 'street', 'day', 'night', 'thing', 'man', 'hous', 'year', 'one', 'old

In [168]:
# 'text' now holds the cleaned (punctuation-free, stop-word-free, stemmed) tokens.
df.head()

Unnamed: 0,id,text,author
0,id26305,process howev afford mean ascertain dimens dun...,EAP
1,id17569,never occur fumbl might mere mistak,HPL
2,id11008,left hand gold snuff box caper hill cut manner...,EAP
3,id27763,love spring look windsor terrac sixteen fertil...,MWS
4,id12958,find noth els even gold superintend abandon at...,HPL


**Create a new column with each text's dominant topic**

In [169]:
# Topic weights per document: one row per document, one column per topic.
top_result = LDA.transform(dtm)

In [170]:
# Label each document with the index of its highest-weighted topic.
df['Topic'] = top_result.argmax(axis=1)

In [172]:
# Inspect the first 15 rows together with the new 'Topic' column.
df.head(15)

Unnamed: 0,id,text,author,Topic
0,id26305,process howev afford mean ascertain dimens dun...,EAP,3
1,id17569,never occur fumbl might mere mistak,HPL,2
2,id11008,left hand gold snuff box caper hill cut manner...,EAP,1
3,id27763,love spring look windsor terrac sixteen fertil...,MWS,6
4,id12958,find noth els even gold superintend abandon at...,HPL,5
5,id22965,youth pass solitud best year spent gentl femin...,MWS,2
6,id09674,astronom perhap point took refug suggest non l...,EAP,3
7,id13515,surcingl hung riband bodi,EAP,1
8,id19322,knew could say stereotomi without brought thin...,EAP,3
9,id00912,confess neither structur languag code govern p...,MWS,2
