In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA, NMF

from sklearn.linear_model import SGDClassifier, LogisticRegression, Ridge, Lasso
from sklearn.svm import SVR, SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectFromModel, SelectPercentile

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS, TfidfVectorizer

In [12]:
# Load the IMDb training reviews, restricted to the 'unsup' category,
# decoding every file as UTF-8 text.
reviews = load_files(
    'aclImdb/train',
    encoding = 'utf-8',
    categories = ['unsup'],
)

In [14]:
# Quick inspection of the loaded bunch: its keys, the first two raw reviews,
# the first ten targets, and the dataset description.
# Each section is printed with a trailing '\n' argument so a blank line
# separates it from the next one.
for section in (
    reviews.keys(),
    '\n'.join(reviews.data[:2]),
    reviews.target[:10],
    reviews.DESCR,
):
    print(section, '\n')

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR']) 

this is a passive movie by ace director anthony minghella, the movie has an awesome star cast and they all give competent performances early in their careers.<br /><br />the movie does not have much of a plot and though the story seems veers to a cliched and predictable end, there are enough minor twists that abound in the movie, making it quite an enjoyable watch. the standout features of the movie include its tight script, terrific lines and smart performances.<br /><br />the plot in itself is no great shakes but this movie is a fun watch for a relaxed evening.<br /><br />an enjoyable and pleasant 7!
"Confusion of Genders" is all about Alain, a wishy-washy lawyer and mostly gay bisexual who has a male lover, a female fiancé, and another female who is lusting after him for reasons unknown. Although the film is well crafted with believable performances and solid production value, the story is a depressingly misanthro

In [18]:
# The raw reviews contain HTML line breaks; swap every '<br />' for a single
# space so the markup does not end up in the vocabulary.
text = list(map(lambda review: review.replace('<br />', ' '), reviews.data))

# Preview the first two cleaned reviews, one per line.
print(*text[:2], sep='\n')

this is a passive movie by ace director anthony minghella, the movie has an awesome star cast and they all give competent performances early in their careers.  the movie does not have much of a plot and though the story seems veers to a cliched and predictable end, there are enough minor twists that abound in the movie, making it quite an enjoyable watch. the standout features of the movie include its tight script, terrific lines and smart performances.  the plot in itself is no great shakes but this movie is a fun watch for a relaxed evening.  an enjoyable and pleasant 7!
"Confusion of Genders" is all about Alain, a wishy-washy lawyer and mostly gay bisexual who has a male lover, a female fiancé, and another female who is lusting after him for reasons unknown. Although the film is well crafted with believable performances and solid production value, the story is a depressingly misanthropic satire in which no one has a good word to say to anyone during the entire run leaving us, the au

In [26]:
# Bag-of-words representation of the cleaned reviews.
#   max_df = 0.15        -> drop terms that occur in more than 15% of documents
#   max_features = 10000 -> cap the vocabulary at 10,000 terms
vect = CountVectorizer(
    max_features = 10000,
    max_df = 0.15,
)

# Learn the vocabulary and build the document-term count matrix in one pass.
X = vect.fit_transform(text)

In [27]:
from sklearn.decomposition import LatentDirichletAllocation

# Ten-topic LDA model using the online learning method; random_state is
# pinned so repeated runs give the same topics.
lda = LatentDirichletAllocation(
    n_components = 10,
    max_iter = 20,
    learning_method = 'online',
    random_state = 0,
)

In [28]:
# WARNING: slow cell -- fitting the LDA model takes a long time.
# Fits `lda` on the count matrix X and keeps the transformed result in
# `topics` (presumably one row of topic weights per review -- confirm
# against the scikit-learn documentation).
topics = lda.fit_transform(X)

In [29]:
# Sanity check on the fitted model: components_ should have one row per topic
# and one column per vocabulary term.
print(np.shape(lda.components_))

(10, 10000)


In [34]:
# For every topic (row of components_), order the vocabulary indices from the
# highest-weighted term to the lowest: argsort is ascending, so each row is
# reversed.
sorting = np.argsort(lda.components_, axis = 1)[:, ::-1]
print(sorting)

# Vocabulary terms indexed by column id, as a NumPy string array so it can be
# fancy-indexed with `sorting`.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on installations that predate it.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = np.asarray(vect.get_feature_names_out(), dtype = str)
else:
    feature_names = np.array(vect.get_feature_names())
print(feature_names)

[[1446 6060 8078 ... 4805 2490 6168]
 [9683 9899  444 ... 4009 7829 7580]
 [4366 9910 6143 ...  680 6153 3145]
 ...
 [2613 9893 5939 ... 2716 1487 3901]
 [4883 7587 6676 ... 7685 4739 8430]
 [2868 7932 9541 ... 5617 5956 3778]]
['00' '000' '10' ... 'zoom' 'zorro' 'zucco']


In [35]:
import mglearn

In [58]:
# Print the 20 highest-ranked words of each of the 10 topics,
# laid out five topics per chunk.
mglearn.tools.print_topics(
    topics=range(10),
    feature_names=feature_names,
    sorting=sorting,
    topics_per_chunk=5,
    n_words=20,
)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
cast          war           horror        around        action        
new           world         worst         girl          down          
show          american      nothing       sex           shot          
role          our           effects       house         gun           
series        us            budget        dead          fight         
comedy        family        script        guy           western       
performance   father        awful         down          car           
performances  own           low           town          cop           
excellent     years         original      gets          city          
episode       mother        terrible      women         keep          
actors        history       minutes       re            guy           
always        america       actors        doesn         road          
wonder

In [68]:
# Table of the top-ranked terms per topic: row r holds every topic's r-th
# ranked term, column t belongs to topic t.
n_words = 20

# sorting[:, :n_words] selects the first n_words vocabulary indices of every
# topic (one row per topic); indexing feature_names with it looks up all terms
# at once, and the transpose puts ranks on rows and topics on columns.
# This replaces a nested loop that filled a fixed 'U20' buffer, which silently
# truncated any term longer than 20 characters and hard-coded the topic count.
words = np.asarray(feature_names)[sorting[:, :n_words]].T
display(pd.DataFrame(words))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,cast,war,horror,around,action,show,woman,director,john,dvd
1,new,world,worst,girl,down,funny,murder,work,role,series
2,show,american,nothing,sex,shot,didn,killer,music,played,version
3,role,our,effects,house,gun,thought,death,real,plays,space
4,series,us,budget,dead,fight,saw,police,interesting,james,fi
5,comedy,family,script,guy,western,now,young,feel,performance,sci
6,performance,father,awful,down,car,want,wife,may,robert,star
7,performances,own,low,town,cop,got,crime,audience,wife,tv
8,excellent,years,original,gets,city,going,prison,between,star,action
9,episode,mother,terrible,women,keep,10,black,quite,george,special
