## Importing Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

## Exploring the Dataset

In [2]:
ROOT = '../input/aclimdb-v1/aclImdb/train/pos'

In [3]:
reviews = []

for file in os.listdir(ROOT):
    path = os.path.join(ROOT, file)
    if os.path.isfile(path):
        with open(path, 'r') as fin:
            reviews.append(fin.read())

In [4]:
len(reviews)

12500

In [5]:
for i in range(3):
    print(reviews[i])
    print("=" * 150)

This was one of those wonderful rare moments in T.V. that I wished I'd captured forever on VHS. Won't it ever air again? <br /><br />It was so creative and I remember it was aired once a week and the wait for the next episode was excruciating. I want to see it all again. I want to buy it. I want what I can't have. Not even on EBAY. <br /><br />So, having ranted enough it was, by far, one of the best series the 80's put out. It should be considered a classic but is lost in space. At least this website and Wikipedia mention it. Sob.<br /><br />It was utterly appealing, funny, flirtatious, and original. Maybe not like Sherlock Holmes original, I actually think Quintin is far more attractive and has a better chance with his leading lady than the stiff and chalky Holmes ever could.
Have you seen The Graduate? It was hailed as the movie of its generation. But A River Runs Through It is the story about all generations. Long before Dustin Hoffman's character got all wrapped up in the traps of 

## Feature Extraction

In [6]:
vect = TfidfVectorizer(stop_words="english")
X = vect.fit_transform(reviews)

pd.DataFrame(X.toarray(), columns=vect.get_feature_names())

Unnamed: 0,00,000,000s,003830,006,007,0079,0080,0083,0093638,...,élan,émigré,émigrés,était,état,étc,êxtase,ís,østbye,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## NMF Decomposition

In [7]:
N_TOPICS = 15
nmf = NMF(n_components=N_TOPICS)
W = nmf.fit_transform(X) # Document-topic matrix
H = nmf.components_      # Topic-term matrix



In [8]:
# Top 10 words per topic
words = np.array(vect.get_feature_names())
topic_words = pd.DataFrame(np.zeros((N_TOPICS, 10)), index = [f'Topic {i + 1}' for i in range(N_TOPICS)], 
                           columns = [f'Word {i + 1}' for i in range(10)]).astype(str)

for i in range(N_TOPICS):
    ix = H[i].argsort()[::-1][:10]
    topic_words.iloc[i] = words[ix]
    
topic_words

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10
Topic 1,br,10,ll,spoilers,world,end,simply,yes,just,spoiler
Topic 2,movie,movies,watch,recommend,saw,10,definitely,makes,watching,enjoyed
Topic 3,film,films,scenes,director,cinema,festival,plot,characters,work,art
Topic 4,like,just,think,really,don,know,people,say,didn,watch
Topic 5,man,character,role,performance,john,plays,does,played,scene,woman
Topic 6,good,pretty,action,story,bad,acting,really,plot,nice,scenes
Topic 7,series,episode,episodes,season,tv,characters,trek,seasons,shows,television
Topic 8,funny,comedy,laugh,hilarious,fun,eddie,jokes,humor,funniest,love
Topic 9,story,life,love,people,real,characters,world,true,beautiful,way
Topic 10,family,kids,old,children,disney,years,little,father,saw,child


In [9]:
# Create a topic mapping
topic_mapping = {
    'Topic 4': 'TV',
    'Topic 7': 'War',
    'Topic 8': 'Comedy',
    'Topic 12': 'Book Adaptation',
    'Topic 13': 'Horror'
}

In [10]:
# Recall the document-topic matrix, W
W = pd.DataFrame(W, columns=[f'Topic {i + 1}' for i in range(N_TOPICS)])
W['max_topic'] = W.apply(lambda x: topic_mapping.get(x.idxmax()), axis=1)
W[pd.notnull(W['max_topic'])].head(10)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,max_topic
2,0.027248,0.00063,0.0,0.024779,0.0,0.002437,0.008404,0.046157,0.01106,0.0,0.0,0.0,0.0,0.0077,0.004959,Comedy
12,6.4e-05,0.000462,0.001315,0.000453,0.014312,0.001915,0.0,0.03211,0.002269,0.009591,0.005676,0.0,0.008293,0.0,0.0,Comedy
19,0.0,0.0,0.0,0.01269,0.0016,0.0,0.028356,0.047079,0.0,0.0,0.0,0.0,0.0,0.0,0.032527,Comedy
20,0.024588,0.0,0.037042,0.001009,0.037227,0.008481,0.000898,0.003328,0.00198,0.0,0.0,0.060257,0.0,0.00991,0.010345,Book Adaptation
21,0.0,0.0,0.015199,0.0,0.024056,0.019019,0.0,0.034305,0.0,0.0,0.0,0.0,0.0,0.002704,0.0,Comedy
26,0.021267,0.027996,0.008154,0.003417,0.00135,0.0,0.0,0.008976,0.012548,0.0,0.0,0.063981,0.0,0.0,0.005076,Book Adaptation
31,0.0,0.0,0.0,0.012212,0.0,0.0,0.154383,0.0,0.0,0.001579,0.0,0.0,0.0,0.006126,0.0035,War
33,0.021323,0.018183,0.018124,0.025069,0.024701,0.005761,9e-05,0.0,0.007766,0.004143,0.0,0.047957,0.055309,0.00012,0.009621,Horror
38,0.00053,0.0,0.000119,0.013458,0.021478,0.0,0.032299,0.0,0.001239,0.002516,0.008671,0.0,0.004663,0.0,0.0,War
40,0.0,0.0,0.012413,0.027206,0.0,0.0,0.000774,0.067398,0.0,0.0,0.00381,6.8e-05,0.000313,0.0,0.002702,Comedy


In [11]:
reviews[33]

"Okay, okay, maybe not THE greatest. I mean, The Exorcist and Psycho and a few others are hard to pass up, but The Shining is way up there. It is, however, by far the best Stephen King story that has been made into a movie. It's better than The Stand, better than Pet Sematary (if not quite as scary), better than Cujo, better than The Green Mile, better the Dolores Claiborne, better than Stand By Me (just barely, though), and yes, it's better than The Shawshank Redemption (shut up, it's better), I don't care WHAT the IMDb Top 250 says. <br /><br />I read that, a couple of decades ago, Stanley Kubrick was sorting through novels at his home trying to find one that might make a good movie, and from the other room, his wife would hear a pounding noise every half hour or so as he threw books against the wall in frustration. Finally, she didn't hear any noise for almost two hours, and when she went to check and see if he had died in his chair or something (I tell this with all due respect, of