In [41]:
from utils import *
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models, similarities
from pprint import pprint
import numpy as np
import gensim

In [42]:
# Read the positive and negative corpora from disk and clean each document
# with replacePunctual (from utils). Per the original note, this replaces the
# characters , . - ( ) / { } | _ with a space.
pos = [replacePunctual(doc) for doc in readdata('data/pos')]
neg = [replacePunctual(doc) for doc in readdata('data/neg')]

In [43]:
# Normalize every document to lower case
pos = [doc.lower() for doc in pos]
neg = [doc.lower() for doc in neg]

# Tokenize each document into a list of tokens
pos_tokens = [nltk.word_tokenize(doc) for doc in pos]
neg_tokens = [nltk.word_tokenize(doc) for doc in neg]

# Lemmatize every token of every document
wnl = nltk.WordNetLemmatizer()
pos_tokens = [[wnl.lemmatize(word) for word in doc_token] for doc_token in pos_tokens]
neg_tokens = [[wnl.lemmatize(word) for word in doc_token] for doc_token in neg_tokens]

# Split the tokenized documents into training and testing sets (split comes from utils)
trainingX, trainingY, testingX, testingY = split(pos_tokens, neg_tokens)

# Collect the valid words. Per the original note these are non-stopwords with
# Freq(w) > 2 that match the pattern [a-zA-Z\']* (genValidWords comes from utils).
# Attention: only the training data may be used here, so that no information
# from the test set leaks into the vocabulary.
validwords = genValidWords(trainingX)

# The membership test below runs once per token of the whole corpus. Build a
# hash set once so each lookup is O(1) whatever collection type genValidWords
# returns; `validwords` itself is left untouched for any later use.
validwordSet = set(validwords)

# Replace every invalid or previously unseen word with the placeholder "#"
trainingX = [[w if w in validwordSet else "#" for w in doc] for doc in trainingX]
testingX = [[w if w in validwordSet else "#" for w in doc] for doc in testingX]


In [44]:
# Build the gensim dictionary from the training documents only, and report
# how many distinct tokens it holds.
dictionary = corpora.Dictionary(trainingX)
numOfTerms = len(dictionary.token2id)
print("diction length is %d" % numOfTerms)
# print(dictionary.token2id)

diction length is 15325


In [45]:
# Convert every tokenized document to its bag-of-words representation,
# using the dictionary built from the training data for both splits.
trainingBOW = list(map(dictionary.doc2bow, trainingX))
testingBOW = list(map(dictionary.doc2bow, testingX))
# print(trainingBOW[0])


In [46]:
# TF-IDF: fit the weighting on the training BOW corpus only, then apply the
# fitted model to both the training and the testing corpus.
tfidf = models.TfidfModel(trainingBOW)
trainingTFIDF, testingTFIDF = tfidf[trainingBOW], tfidf[testingBOW]
# print(trainingTFIDF[0])

In [58]:
# LSI: fit a latent semantic indexing model on the training TF-IDF corpus.
lsiNumOfTopics = 120
lsi = models.LsiModel(trainingTFIDF, id2word=dictionary, num_topics=lsiNumOfTopics)

# Project both splits into topic space and densify; the transpose turns the
# corpus2dense result into one row per document for the classifier.
trainingLSI = gensim.matutils.corpus2dense(lsi[trainingTFIDF], num_terms=lsiNumOfTopics).T
testingLSI = gensim.matutils.corpus2dense(lsi[testingTFIDF], num_terms=lsiNumOfTopics).T
# lsi.print_topics(10)


In [59]:
ldaNumOfTopics = 50
# LDA: fit a topic model on the training BOW corpus.
# LDA training is stochastic; a fixed random_state makes the topic features,
# and therefore the classifier scores computed from them, reproducible
# across kernel restarts (same seed value as the RBM cell).
ldaModel = models.LdaModel(trainingBOW, id2word=dictionary, num_topics=ldaNumOfTopics,
                           random_state=0)
trainingLDA = ldaModel[trainingBOW]
testingLDA = ldaModel[testingBOW]

# Densify the per-document topic distributions; the transpose gives one row
# per document. NOTE(review): indexing the model omits topics below its
# minimum_probability threshold, so those entries are 0 in the dense matrix.
trainingLDA = gensim.matutils.corpus2dense(trainingLDA, num_terms = ldaNumOfTopics).transpose()
testingLDA = gensim.matutils.corpus2dense(testingLDA, num_terms = ldaNumOfTopics).transpose()

# ldaModel.print_topics(10)

In [None]:
rbmNumOfTopics = 100
# RBM
from sklearn.neural_network import BernoulliRBM

# The original cell fitted on an undefined name `X` and ended with a bare
# `get_params`, both of which raise NameError on a fresh kernel.
# NOTE(review): the dense training TF-IDF matrix is assumed to be the intended
# input. BernoulliRBM expects values in [0, 1]; gensim's TF-IDF vectors are
# presumably unit-normalized (library default) - confirm before relying on it.
rbmTrainingInput = gensim.matutils.corpus2dense(trainingTFIDF, num_terms=numOfTerms).transpose()

rbm = BernoulliRBM(random_state=0, verbose=True, n_components=rbmNumOfTopics)
rbm.fit(rbmTrainingInput)

# Display the hyper-parameters the model was fitted with
rbm.get_params()

In [60]:
# data Preparation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Labels as numpy arrays
trainingY = np.array(trainingY)
testingY = np.array(testingY)

# Dense bag-of-words matrices over the full dictionary, transposed so that
# each row is one document
trainingBOW_m = gensim.matutils.corpus2dense(trainingBOW, num_terms=numOfTerms).T
testingBOW_m = gensim.matutils.corpus2dense(testingBOW, num_terms=numOfTerms).T

# Dense TF-IDF matrices with the same layout
trainingTFIDF_m = gensim.matutils.corpus2dense(trainingTFIDF, num_terms=numOfTerms).T
testingTFIDF_m = gensim.matutils.corpus2dense(testingTFIDF, num_terms=numOfTerms).T

# Class names shown in the classification reports
targetNames = ['0', '1']


In [61]:
# 1. LR for BOW
# L2-regularized logistic regression on the dense bag-of-words features.
# NOTE: the *_tfidf variable names are reused by every classifier cell,
# whatever feature set is actually being fitted.
clf_lr_tfidf = LogisticRegression(penalty='l2', tol=0.001).fit(trainingBOW_m, trainingY)
predicted_lr_tfidf = clf_lr_tfidf.predict(testingBOW_m)

# Per-class precision / recall / F1, then the confusion matrix
print(classification_report(testingY, predicted_lr_tfidf, target_names=targetNames))
print(confusion_matrix(testingY, predicted_lr_tfidf))

             precision    recall  f1-score   support

          0       0.78      0.86      0.82       283
          1       0.86      0.78      0.82       314

avg / total       0.82      0.82      0.82       597

[[243  40]
 [ 68 246]]


In [62]:
# 2. LR for TF-IDF
# L2-regularized logistic regression on the dense TF-IDF features.
clf_lr_tfidf = LogisticRegression(penalty='l2', tol=0.001).fit(trainingTFIDF_m, trainingY)
predicted_lr_tfidf = clf_lr_tfidf.predict(testingTFIDF_m)

# Per-class precision / recall / F1, then the confusion matrix
print(classification_report(testingY, predicted_lr_tfidf, target_names=targetNames))
print(confusion_matrix(testingY, predicted_lr_tfidf))


             precision    recall  f1-score   support

          0       0.77      0.84      0.80       283
          1       0.85      0.77      0.81       314

avg / total       0.81      0.81      0.81       597

[[239  44]
 [ 72 242]]


In [64]:
# 3. LR for LSI
# L2-regularized logistic regression on the LSI topic-space features.
clf_lr_tfidf = LogisticRegression(penalty='l2', tol=0.001).fit(trainingLSI, trainingY)
predicted_lr_tfidf = clf_lr_tfidf.predict(testingLSI)

# Per-class precision / recall / F1
print(classification_report(testingY, predicted_lr_tfidf, target_names=targetNames))

             precision    recall  f1-score   support

          0       0.69      0.80      0.74       283
          1       0.79      0.68      0.73       314

avg / total       0.74      0.74      0.74       597



In [65]:
# 4. LR for LDA
# L2-regularized logistic regression on the LDA topic-distribution features.
clf_lr_tfidf = LogisticRegression(penalty='l2', tol=0.001).fit(trainingLDA, trainingY)
predicted_lr_tfidf = clf_lr_tfidf.predict(testingLDA)

# Per-class precision / recall / F1, then the confusion matrix
print(classification_report(testingY, predicted_lr_tfidf, target_names=targetNames))
print(confusion_matrix(testingY, predicted_lr_tfidf))

             precision    recall  f1-score   support

          0       0.48      0.90      0.63       283
          1       0.58      0.12      0.20       314

avg / total       0.53      0.49      0.41       597

[[255  28]
 [275  39]]
