## Importing Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

## Exploring the Dataset

In [2]:
ROOT = '../input/aclimdb-v1/aclImdb/train/pos'

In [3]:
reviews = []

for file in os.listdir(ROOT):
    path = os.path.join(ROOT, file)
    if os.path.isfile(path):
        with open(path, 'r') as fin:
            reviews.append(fin.read())

In [4]:
len(reviews)

12500

In [5]:
for i in range(3):
    print(reviews[i])
    print("=" * 150)

This was one of those wonderful rare moments in T.V. that I wished I'd captured forever on VHS. Won't it ever air again? <br /><br />It was so creative and I remember it was aired once a week and the wait for the next episode was excruciating. I want to see it all again. I want to buy it. I want what I can't have. Not even on EBAY. <br /><br />So, having ranted enough it was, by far, one of the best series the 80's put out. It should be considered a classic but is lost in space. At least this website and Wikipedia mention it. Sob.<br /><br />It was utterly appealing, funny, flirtatious, and original. Maybe not like Sherlock Holmes original, I actually think Quintin is far more attractive and has a better chance with his leading lady than the stiff and chalky Holmes ever could.
Have you seen The Graduate? It was hailed as the movie of its generation. But A River Runs Through It is the story about all generations. Long before Dustin Hoffman's character got all wrapped up in the traps of 

## Feature Extraction

In [6]:
vect = TfidfVectorizer(stop_words="english")
X = vect.fit_transform(reviews)

pd.DataFrame(X.toarray(), columns=vect.get_feature_names())

Unnamed: 0,00,000,000s,003830,006,007,0079,0080,0083,0093638,...,élan,émigré,émigrés,était,état,étc,êxtase,ís,østbye,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## NMF Decomposition

In [7]:
N_TOPICS = 15
nmf = NMF(n_components=N_TOPICS)
W = nmf.fit_transform(X) # Document-topic matrix
H = nmf.components_      # Topic-term matrix



In [8]:
# Top 10 words per topic
words = np.array(vect.get_feature_names())
topic_words = pd.DataFrame(np.zeros((N_TOPICS, 10)), index = [f'Topic {i + 1}' for i in range(N_TOPICS)], 
                           columns = [f'Word {i + 1}' for i in range(10)]).astype(str)

for i in range(N_TOPICS):
    ix = H[i].argsort()[::-1][:10]
    topic_words.iloc[i] = words[ix]
    
topic_words

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10
Topic 1,br,10,ll,spoilers,end,simply,yes,spoiler,quite,just
Topic 2,movie,movies,watch,recommend,10,seen,saw,actors,definitely,acting
Topic 3,film,films,director,characters,cinema,seen,festival,work,scenes,art
Topic 4,series,episode,episodes,season,tv,characters,trek,seasons,shows,television
Topic 5,man,role,character,performance,best,plays,john,played,does,actor
Topic 6,good,pretty,story,bad,acting,really,job,liked,nice,actors
Topic 7,war,world,documentary,people,american,history,soldiers,men,women,hitler
Topic 8,funny,comedy,laugh,hilarious,eddie,fun,jokes,humor,funniest,murphy
Topic 9,like,think,really,just,don,people,know,say,didn,lot
Topic 10,time,years,saw,seen,dvd,old,ve,remember,music,disney


In [9]:
# Create a topic mapping
topic_mapping = {
    'Topic 4': 'TV',
    'Topic 7': 'War',
    'Topic 8': 'Comedy',
    'Topic 12': 'Book Adaptation',
    'Topic 13': 'Horror'
}

In [10]:
# Recall the document-topic matrix, W
W = pd.DataFrame(W, columns=[f'Topic {i + 1}' for i in range(N_TOPICS)])
W['max_topic'] = W.apply(lambda x: topic_mapping.get(x.idxmax()), axis=1)
W[pd.notnull(W['max_topic'])].head(10)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,max_topic
0,0.044766,0.000316,0.0,0.051721,0.0,0.0,0.0,0.015996,0.012662,0.014747,0.0,0.014161,0.001374,0.000677,5.8e-05,TV
2,0.027432,0.00091,0.0,0.010477,0.0,0.00368,0.008697,0.046995,0.020005,0.0,0.005395,0.0,0.0,0.008901,0.0,Comedy
12,0.000151,0.000659,0.001277,0.0,0.014545,0.00227,0.0,0.032228,1.4e-05,0.000442,0.009852,0.0,0.009043,0.0,0.00014,Comedy
19,9.3e-05,0.0,0.002369,0.035819,0.004032,0.0,0.0,0.049566,0.012959,0.006171,0.0,0.000177,0.0,0.0,0.0,Comedy
20,0.024645,0.0,0.036125,0.000281,0.036516,0.006029,0.011837,0.003237,0.001003,0.003613,0.0,0.057827,0.0,0.009457,0.010612,Book Adaptation
21,0.0,0.0,0.015057,0.0,0.025955,0.019117,0.0,0.034952,0.0,0.0,0.0,0.0,0.0,0.000583,0.0,Comedy
26,0.021455,0.028193,0.009407,0.0,0.0,0.0,0.007225,0.009767,0.003179,0.0,0.009246,0.06684,0.0,0.0,0.0,Book Adaptation
31,0.0,0.0,0.0,0.182693,0.0,0.0,0.0,0.0,0.007127,0.014455,0.0,0.0,0.0,0.00453,0.0,TV
33,0.021534,0.019068,0.018599,0.002601,0.025494,0.005873,0.0,0.0,0.019123,0.001056,0.0111,0.050678,0.054768,0.000537,0.0,Horror
38,0.000536,0.0,0.0,0.038005,0.021327,0.0,0.003383,0.0,0.009477,0.003164,0.002713,0.0,0.004862,0.0,0.003186,TV


In [11]:
reviews[2]

'I don\'t watch a lot of TV, except for The Office, Weeds, Entourage and E!\'s Soup. I think I hold this show in good company.<br /><br />I love the scathing review of pop culture that this show gives. Soup also helps me stay on top of what people in the office are referring to when talking about a Sanjaya or Heidi Montag (sp?).<br /><br />The best part is that Soup shows clips of the highlights of these shows, which are usually the funniest or most controversial moments (c\'mon, most people get hooked into watching American Idol because of the freak show that are the auditions), which is why most people claim to watch. And that means, I don\'t have to suffer through the other 98% of these mind numbing talk shows or "reality" shows, for one nugget of "funny" or "shock." The only reason why Soup doesn\'t get a 10 in my opinion are sometime the sketches are not that funny, and on an even rarer occasion, the commentary isn\'t always up to par. But they can\'t all be home runs either, if s