In [79]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

import pandas as pd
import numpy as np

In [80]:
df = pd.read_csv('movie_data.csv.gz')

In [81]:
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review       50000 non-null object
sentiment    50000 non-null int64
dtypes: int64(1), object(1)
memory usage: 781.3+ KB


In [83]:
np.random.seed(42)
idx = np.random.randint(0, 50000, size=5000)
df_reduced = df.iloc[idx]

In [125]:
df_reduced.shape

(5000, 2)

In [135]:
keywords = [
    'fight',
    'action',
    'war',
    'car'
]

In [136]:
vec = CountVectorizer(stop_words='english')
docs = df_reduced['review']

sparse_matrix = vec.fit_transform(docs)

In [86]:
dense_matrix = sparse_matrix.toarray()

In [87]:
df_docs  = pd.DataFrame(dense_matrix, columns=vec.get_feature_names())
df_docs['docs'] = docs

In [88]:
df_docs

Unnamed: 0,shining,sun,sweet,weather,docs
0,1,1,0,0,The sun is shining
1,0,0,1,1,The weather is sweet
2,1,1,1,1,"The sun is shining, the weather is sweet, and ..."


In [89]:
cosine_distances(dense_matrix)

array([[0.        , 1.        , 0.29289322],
       [1.        , 0.        , 0.29289322],
       [0.29289322, 0.29289322, 0.        ]])

In [121]:
count = CountVectorizer(stop_words='english')
X = count.fit_transform(df_reduced['review'])


In [104]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(random_state=42, learning_method='batch')
X_topics = lda.fit_transform(X)

In [109]:
lda.components_.shape

(10, 5000)

In [110]:
n_top_words = 5
feature_names = count.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
                    for i in topic.argsort()\
                        [:-n_top_words - 1:-1]]))

Topic 1:
horror game video fun day
Topic 2:
michael plays girl john role
Topic 3:
series dvd book tv version
Topic 4:
worst minutes guy money stupid
Topic 5:
war gets episode guy joe
Topic 6:
performance beautiful role performances wonderful
Topic 7:
woman family girl kids mother
Topic 8:
effects killer horror place fi
Topic 9:
war gets action disney comedy
Topic 10:
horror music action original effects
