In [1]:
from utils import *
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models, similarities
from pprint import pprint
import numpy as np
import gensim



In [10]:
# replace , . - ( ) / { } | _ to space
pos = readdata('data/pos')
pos = [replacePunctual(doc) for doc in pos]
neg = readdata('data/neg')
neg = [replacePunctual(doc) for doc in neg]

In [3]:
# Normalize to lower
pos = [doc.lower() for doc in pos]
neg = [doc.lower() for doc in neg]

# Tokenize
pos_tokens = [nltk.word_tokenize(doc) for doc in pos]
neg_tokens = [nltk.word_tokenize(doc) for doc in neg]

# lemmatizer
wnl = nltk.WordNetLemmatizer()
pos_tokens = [[wnl.lemmatize(word) for word in doc_token] for doc_token in pos_tokens]
neg_tokens = [[wnl.lemmatize(word) for word in doc_token] for doc_token in neg_tokens]

# Get training and testing data
trainingX, trainingY, testingX, testingY = split(pos_tokens, neg_tokens)

# Get valid words, including Freq(w) > 2 and non-stopwords and satisfy re = [a-zA-Z\']*
# Attention: only training data can be used 
validwords = genValidWords(trainingX)

# Filter invalid words and use a flag to mark new words or invalid words
trainingX = [[w if w in validwords else "#" for w in doc] for doc in trainingX]
testingX = [[w if w in validwords else "#" for w in doc] for doc in testingX]


In [4]:
# use gensim to build dictionary
dictionary = corpora.Dictionary(trainingX)
numOfTerms = len(dictionary.token2id)
print("diction length is %d" % (numOfTerms))

# print(dictionary.token2id)

diction length is 15037


In [5]:
# convert to BOW representation
trainingBOW = [dictionary.doc2bow(doc) for doc in trainingX]
testingBOW = [dictionary.doc2bow(doc) for doc in testingX]
# print(trainingBOW[0])


In [11]:
# TF-IDF
tfidf = models.TfidfModel(trainingBOW)
trainingTFIDF = tfidf[trainingBOW]
testingTFIDF = tfidf[testingBOW]
print(trainingTFIDF[0])

[(0, 0.24568427101122933), (1, 0.06349722212399098), (2, 0.11326959562795012), (3, 0.02236969954698247), (4, 0.09105761863369047), (5, 0.004218948640747564), (6, 0.02885693845163563), (7, 0.017476616125373136), (8, 0.01318212224677471), (9, 0.05631703816311842), (10, 0.012557280651071962), (11, 0.017039134706569443), (12, 0.021086164576462316), (13, 0.0204606000135045), (14, 0.03544502893251888), (15, 0.02173786862550863), (16, 0.060032551059063015), (17, 0.152193395204837), (18, 0.042858005067506026), (19, 0.02339616395423827), (20, 0.012838996341179494), (21, 0.016479806044053826), (22, 0.022112324424430934), (23, 0.02849763714576092), (24, 0.03018449799096328), (25, 0.027277544375853585), (26, 0.04615680860676326), (27, 0.01978811906883763), (28, 0.0959530297369103), (29, 0.0031013595389703602), (30, 0.026867240610606663), (31, 0.05472021966641885), (32, 0.03435361875896172), (33, 0.03567782110663347), (34, 0.007060600343745879), (35, 0.05472021966641885), (36, 0.002895198159672448)

In [7]:
lsiNumOfTopics = 120
# LSI
lsi = models.LsiModel(trainingTFIDF, id2word=dictionary, num_topics=lsiNumOfTopics)
trainingLSI = lsi[trainingTFIDF]
testingLSI = lsi[testingTFIDF]

trainingLSI = gensim.matutils.corpus2dense(trainingLSI, num_terms = lsiNumOfTopics).transpose()
testingLSI = gensim.matutils.corpus2dense(testingLSI, num_terms = lsiNumOfTopics).transpose()
lsi.print_topics(10)


[(0,
  '0.084*"!" + 0.079*"?" + 0.070*"alien" + 0.067*"\'" + 0.061*"action" + 0.060*"wa" + 0.059*"movie" + 0.058*"bad" + 0.056*"scene" + 0.055*"really"'),
 (1,
  '-0.517*"alien" + 0.195*"jackie" + -0.182*"ripley" + -0.158*"ship" + 0.138*"chan" + -0.133*"war" + -0.130*"jedi" + 0.126*"truman" + -0.125*"planet" + -0.121*"phantom"'),
 (2,
  '0.474*"jackie" + -0.469*"truman" + 0.351*"chan" + -0.216*"carrey" + 0.132*"tarantino" + 0.125*"ordell" + 0.104*"brown" + 0.080*"martial" + 0.079*"tucker" + -0.076*"christof"'),
 (3,
  '-0.502*"truman" + 0.353*"scream" + -0.230*"jackie" + -0.227*"carrey" + 0.176*"horror" + -0.175*"chan" + 0.123*"killer" + 0.117*"sidney" + 0.098*"craven" + 0.098*"williamson"'),
 (4,
  '0.333*"alien" + 0.304*"truman" + 0.304*"scream" + 0.214*"jackie" + 0.151*"chan" + 0.141*"carrey" + 0.138*"horror" + 0.132*"ripley" + -0.121*"war" + -0.114*"phantom"'),
 (5,
  '0.310*"scream" + -0.263*"alien" + 0.211*"phantom" + 0.211*"jedi" + 0.200*"lucas" + 0.181*"menace" + 0.181*"war" + 

In [9]:
ldaNumOfTopics = 50
#LDA
ldaModel = models.LdaModel(trainingBOW, id2word=dictionary, num_topics=ldaNumOfTopics)
trainingLDA = ldaModel[trainingBOW]
testingLDA = ldaModel[testingBOW]

trainingLDA = gensim.matutils.corpus2dense(trainingLDA, num_terms = ldaNumOfTopics).transpose()
testingLDA = gensim.matutils.corpus2dense(testingLDA, num_terms = ldaNumOfTopics).transpose()

ldaModel.print_topics(10)

[(38,
  '0.486*"#" + 0.011*"\'s" + 0.008*"film" + 0.005*"n\'t" + 0.004*"wa" + 0.004*"one" + 0.004*"ha" + 0.003*"?" + 0.003*"movie" + 0.003*"like"'),
 (9,
  '0.250*"#" + 0.012*"\'s" + 0.008*"film" + 0.004*"wa" + 0.003*"n\'t" + 0.003*"movie" + 0.002*"ha" + 0.002*"!" + 0.002*"character" + 0.002*"like"'),
 (48,
  '0.000*"claiming" + 0.000*"uncaring" + 0.000*"marked" + 0.000*"dano" + 0.000*"aficionado" + 0.000*"conduct" + 0.000*"hershey" + 0.000*"rhetoric" + 0.000*"obscured" + 0.000*"obscurity"'),
 (35,
  '0.000*"claiming" + 0.000*"uncaring" + 0.000*"marked" + 0.000*"dano" + 0.000*"aficionado" + 0.000*"conduct" + 0.000*"hershey" + 0.000*"rhetoric" + 0.000*"obscured" + 0.000*"obscurity"'),
 (30,
  '0.000*"claiming" + 0.000*"uncaring" + 0.000*"marked" + 0.000*"dano" + 0.000*"aficionado" + 0.000*"conduct" + 0.000*"hershey" + 0.000*"rhetoric" + 0.000*"obscured" + 0.000*"obscurity"'),
 (40,
  '0.000*"claiming" + 0.000*"uncaring" + 0.000*"marked" + 0.000*"dano" + 0.000*"aficionado" + 0.000*"condu

In [13]:
# data Preparation
trainingTFIDF_m = gensim.matutils.corpus2dense(trainingTFIDF, num_terms = numOfTerms).transpose()
testingTFIDF_m = gensim.matutils.corpus2dense(testingTFIDF, num_terms = numOfTerms).transpose()

trainingBOW_m = gensim.matutils.corpus2dense(trainingBOW, num_terms = numOfTerms).transpose()
testingBOW_m = gensim.matutils.corpus2dense(testingBOW, num_terms = numOfTerms).transpose()
trainingBOW_m = np.where(trainingBOW_m>1, trainingBOW_m, 1)
testingBOW_m = np.where(testingBOW_m>1, testingBOW_m, 1)

trainingY = np.array(trainingY)
testingY = np.array(testingY)

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
targetNames = ['0', '1']



In [15]:
# chi2 feature importance
from pprint import pprint
id2token = {v: k for k, v in dictionary.token2id.items()}

# from sklearn import feature_selection
# chi2, pval = feature_selection.chi2(trainingBOW_m, trainingY)
# sorted_chi2 = sorted(enumerate(chi2), key=lambda x: x[1], reverse = True)
# pprint([(id2token[p[0]], p[1]) for (i,p) in enumerate(sorted_chi2) if i < 100 ])


from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier()
clf = clf.fit(trainingBOW_m, trainingY)
sorted_fi = sorted(enumerate(clf.feature_importances_), key=lambda x: x[1], reverse = True)
pprint([(id2token[p[0]], p[1]) for (i,p) in enumerate(sorted_fi) if i < 100 ])



[('bad', 0.0083522523002254757),
 ('often', 0.0056484275080608774),
 ("n't", 0.0046324853457242032),
 ('life', 0.0044796634610995371),
 ('#', 0.0043827111477590126),
 ('could', 0.004156346726239369),
 ('great', 0.0041413602484206922),
 ('many', 0.0039758838831258064),
 ('love', 0.003706629040612106),
 ('?', 0.0034742759607361909),
 ('family', 0.0032571502195385951),
 ('small', 0.0031835848974424499),
 ('people', 0.0031572873207524567),
 ('man', 0.0029853527754344061),
 ('bit', 0.0029501901011737544),
 ('always', 0.0029479341989952735),
 ('wonderful', 0.0028538839180434041),
 ('also', 0.0028116114810701033),
 ('make', 0.0027514020546252358),
 ('scene', 0.0026555969307502049),
 ('seen', 0.0026317650221137621),
 ('ha', 0.0026254911402508224),
 ('obvious', 0.002528769808314864),
 ('several', 0.0025071993107617435),
 ('film', 0.0024898176925192347),
 ('men', 0.0024776334705439648),
 ('go', 0.0024467061618328378),
 ('even', 0.0024417032508519893),
 ('good', 0.0024178874115670379),
 ('well', 

In [61]:
# 1. LR for BOW
clf_lr_tfidf = LogisticRegression(penalty='l2', tol=0.001)
clf_lr_tfidf.fit(trainingBOW_m, trainingY)
predicted_lr_tfidf = clf_lr_tfidf.predict(testingBOW_m)
print(classification_report(testingY, predicted_lr_tfidf, target_names = targetNames))
print(confusion_matrix(testingY, predicted_lr_tfidf))

             precision    recall  f1-score   support

          0       0.78      0.86      0.82       283
          1       0.86      0.78      0.82       314

avg / total       0.82      0.82      0.82       597

[[243  40]
 [ 68 246]]


In [62]:
# 2. LR for TF-IDF
clf_lr_tfidf = LogisticRegression(penalty='l2', tol=0.001)
clf_lr_tfidf.fit(trainingTFIDF_m, trainingY)
predicted_lr_tfidf = clf_lr_tfidf.predict(testingTFIDF_m)
print(classification_report(testingY, predicted_lr_tfidf, target_names = targetNames))
print(confusion_matrix(testingY, predicted_lr_tfidf))


             precision    recall  f1-score   support

          0       0.77      0.84      0.80       283
          1       0.85      0.77      0.81       314

avg / total       0.81      0.81      0.81       597

[[239  44]
 [ 72 242]]


In [64]:
# 3. LR for LSI
clf_lr_tfidf = LogisticRegression(penalty='l2', tol=0.001)
clf_lr_tfidf.fit(trainingLSI, trainingY)
# predicted_lr_tfidf = clf_lr_tfidf.predict(testingLSI)
print(classification_report(testingY, predicted_lr_tfidf, target_names = targetNames))

             precision    recall  f1-score   support

          0       0.69      0.80      0.74       283
          1       0.79      0.68      0.73       314

avg / total       0.74      0.74      0.74       597



In [65]:
# 4. LR for LDA
clf_lr_tfidf = LogisticRegression(penalty='l2', tol=0.001)
clf_lr_tfidf.fit(trainingLDA, trainingY)
predicted_lr_tfidf = clf_lr_tfidf.predict(testingLDA)
print(classification_report(testingY, predicted_lr_tfidf, target_names = targetNames))
print(confusion_matrix(testingY, predicted_lr_tfidf))

             precision    recall  f1-score   support

          0       0.48      0.90      0.63       283
          1       0.58      0.12      0.20       314

avg / total       0.53      0.49      0.41       597

[[255  28]
 [275  39]]
