In [275]:
#imports
import os
import re
import string
import nltk
from nltk import FreqDist
from nltk.collocations import *

In [199]:
#defining file names
fl1 = "reviews1.txt"
fl2 = "reviews2.txt"

In [200]:
#method to combine data from different files to one file
def dataset_join(path, new_file_name):
    fileToWrite = open(new_file_name, "w")
    for directory in os.listdir(path):
        sub_directory = os.path.join(path, directory)
        if os.path.isdir(sub_directory):
            for filename in os.listdir(sub_directory):
                if not filename.startswith('.'):
                    sub_directory2 = os.path.join(sub_directory, filename)
                    for txt_file in os.listdir(sub_directory2):
                        with open(os.path.join(sub_directory2, txt_file)) as f:
                            txt_file = f.read()
                            fileToWrite.write(txt_file)
    fileToWrite.close()

In [229]:
#method to perform preprocessing
def text_cleaning(text):
    '''
    Make text lowercase, remove text in square brackets,remove links,remove special characters
    and remove words containing numbers.
    '''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[$@&]','', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [230]:
#method to extract tokens from input data
def extract_tokens(input_data): 
    pattern = r''' (?x)
         (?:[A-Z][a-z]*[a-z]\.)+           # titles
        | \d+\.\d+                        # number with a decimal
        | (?:\d+,)+?\d{3}(?=(?:[^,]|$))   # number with a comma
        | (?:[A-Z]\.)+                    # simple abbreviations
        | \$?\d+(?:\.\d+)?%?              # currency and percentages, $12.40, 50%
        |(?:https?://|www)\S+             # simple URLs
        |\w+(?:-\w+)*                     # single hypen
        |(?:[w]\/)                        # w/
        |\w+(?:'\w+)*                     # single apostrophe
        |\#\w+                            # hashtags
        | @\w+                            # mentions
        '''
    tokens = nltk.regexp_tokenize(input_data, pattern)
    return tokens

In [231]:
#data from review dataset
dataset_join('op_spam_v1.4/negative_polarity', fl1)
dataset_join('op_spam_v1.4/positive_polarity', fl2)
file1 =  open(fl1).read()
file2 =  open(fl2).read()
file1_cleaned = text_cleaning(file1)
file2_cleaned = text_cleaning(file2)

In [232]:
#generating tokens
tokens1 = extract_tokens(file1_cleaned)
tokens2 = extract_tokens(file2_cleaned)

In [233]:
#examine the NLTK stopword list
nltkstopwords = nltk.corpus.stopwords.words('english')
#defined stop words
morestopwords = ['could','would','might','must','need','sha','wo','y',"'s","'d","'ll","'t","'m","'re","'ve"]
#total stopwords
stopwords = nltkstopwords + morestopwords

In [234]:
#filter shortwords list to not include any stopwords that we defined above
stopped_file_words1 = [w for w in tokens1 if not w in stopwords]
print(len(stopped_file_words1))
#filter shortwords list to not include any stopwords that we defined above
stopped_file_words2 = [w for w in tokens2 if not w in stopwords]
print(len(stopped_file_words2))

27605
18748


In [242]:
#frequency distribution with new filtered word list
dist_words1 = FreqDist(stopped_file_words1)
file_items1 = dist_words1.most_common(50)
total_words_in_doc1 = len(stopped_file_words1)
#finding the normalized frequency of top 50 common words
normalized_freq_words1 = [(word, freq / total_words_in_doc1 * 100) for (word, freq) in file_items1]
#printing top 50 words along with normalized frequency
for pair in normalized_freq_words1:
    print(pair)

('room', 2.5756203586306827)
('hotel', 2.3908712189820687)
('stay', 0.9092555696431806)
('chicago', 0.8694077159934794)
('service', 0.7824669443941316)
('one', 0.6339431262452454)
('staff', 0.6013403368954899)
('desk', 0.5578699510958159)
('like', 0.5216446295960877)
('us', 0.5107770331461692)
('night', 0.5071545009961964)
('rooms', 0.5071545009961964)
('get', 0.4999094366962507)
('even', 0.4745517116464409)
('stayed', 0.4745517116464409)
('front', 0.4528165187466039)
('bed', 0.3839884078971201)
('time', 0.38036587574714725)
('didnt', 0.37674334359717443)
('first', 0.3513856185473646)
('next', 0.3513856185473646)
('made', 0.3296504256475276)
('never', 0.3296504256475276)
('two', 0.3296504256475276)
('got', 0.3260278934975548)
('good', 0.32240536134758196)
('also', 0.32240536134758196)
('back', 0.31516029704763626)
('experience', 0.3079152327476906)
('day', 0.3042927005977178)
('nice', 0.30067016844774497)
('called', 0.29704763629777214)
('arrived', 0.2934251041477993)
('bathroom', 0.29

In [243]:
#frequency distribution with new filtered word list
dist_words2 = FreqDist(stopped_file_words2)
file_items2 = dist_words2.most_common(50)
total_words_in_doc2 = len(stopped_file_words2)
#finding the normalized frequency of top 50 common words
normalized_freq_words2 = [(word, freq / total_words_in_doc2 * 100) for (word, freq) in file_items2]
#printing top 50 words along with normalized frequency
for pair in normalized_freq_words2:
    print(pair)

('hotel', 3.269682099423939)
('room', 1.920204821847664)
('chicago', 1.5788350757414125)
('stay', 1.4241519095370172)
('great', 1.328141668444634)
('staff', 1.024109238318754)
('service', 0.8587582675485385)
('location', 0.7254107104757841)
('stayed', 0.693407296778323)
('rooms', 0.6667377853637722)
('nice', 0.5547258374226584)
('clean', 0.5493919351397483)
('comfortable', 0.5387241305739279)
('friendly', 0.512054619159377)
('one', 0.5013868145935566)
('also', 0.480051205461916)
('us', 0.4747173031790058)
('well', 0.45871559633027525)
('really', 0.4427138894815447)
('place', 0.4373799871986345)
('good', 0.4320460849157244)
('time', 0.41071047578408365)
('bed', 0.3680392575208022)
('like', 0.36270535523789205)
('recommend', 0.36270535523789205)
('helpful', 0.36270535523789205)
('city', 0.35737145295498185)
('even', 0.3467036483891615)
('best', 0.34136974610625137)
('michigan', 0.34136974610625137)
('hotels', 0.3360358438233412)
('definitely', 0.3360358438233412)
('excellent', 0.32536803

In [237]:
#variable for the bigram measures
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [238]:
#filter out bigrams in which the first word’s length is less than 3
finder1 = BigramCollocationFinder.from_words(tokens1)
finder1.apply_word_filter(lambda w: w in stopwords)
finder1.apply_freq_filter(5)
finder1.apply_ngram_filter(lambda w1, w2: len(w1) < 3 and len(w2) < 3)
scored_bigram1 = finder1.score_ngrams(bigram_measures.raw_freq)
for bscore in scored_bigram1[:50]:
    print (bscore)

(('front', 'desk'), 0.0019554991808043974)
(('room', 'service'), 0.0012331976815883586)
(('hard', 'rock'), 0.0007751528284269683)
(('rock', 'hotel'), 0.0004932790726353435)
(('customer', 'service'), 0.00045804485316139034)
(('ambassador', 'east'), 0.0004404277434244138)
(('looked', 'like'), 0.00042281063368743724)
(('next', 'morning'), 0.00038757641421348414)
(('hilton', 'chicago'), 0.00035234219473953104)
(('even', 'though'), 0.0003347250850025545)
(('hotel', 'monaco'), 0.00031710797526557794)
(('talbott', 'hotel'), 0.00031710797526557794)
(('desk', 'staff'), 0.0002994908655286014)
(('east', 'hotel'), 0.0002994908655286014)
(('new', 'room'), 0.0002994908655286014)
(('another', 'room'), 0.00028187375579162484)
(('didnt', 'work'), 0.0002642566460546483)
(('recently', 'stayed'), 0.0002642566460546483)
(('desk', 'clerk'), 0.00024663953631767175)
(('never', 'stay'), 0.00024663953631767175)
(('next', 'time'), 0.00024663953631767175)
(('water', 'tower'), 0.00024663953631767175)
(('chicago', 

In [239]:
#filter out bigrams in which the first word’s length is less than 3
finder2 = BigramCollocationFinder.from_words(tokens2)
finder2.apply_word_filter(lambda w: w in stopwords)
finder2.apply_freq_filter(5)
finder2.apply_ngram_filter(lambda w1, w2: len(w1) < 3 and len(w2) < 3)
scored_bigram2 = finder2.score_ngrams(bigram_measures.raw_freq)
for bscore in scored_bigram2[:50]:
    print (bscore)

(('hard', 'rock'), 0.0012640185030793642)
(('ambassador', 'east'), 0.0011833364709679154)
(('front', 'desk'), 0.0011026544388564666)
(('room', 'service'), 0.001048866417448834)
(('rock', 'hotel'), 0.0009143963639297528)
(('michigan', 'ave'), 0.0007799263104106716)
(('east', 'hotel'), 0.000726138289003039)
(('highly', 'recommend'), 0.0006992442782992228)
(('hotel', 'monaco'), 0.0006992442782992228)
(('downtown', 'chicago'), 0.0006454562568915902)
(('water', 'tower'), 0.0006454562568915902)
(('pump', 'room'), 0.000618562246187774)
(('walking', 'distance'), 0.000618562246187774)
(('definitely', 'stay'), 0.000510986203372509)
(('magnificent', 'mile'), 0.000510986203372509)
(('michigan', 'avenue'), 0.0004571981819648764)
(('hilton', 'chicago'), 0.0004034101605572439)
(('next', 'time'), 0.0004034101605572439)
(('chicago', 'water'), 0.0003496221391496114)
(('felt', 'like'), 0.0003496221391496114)
(('great', 'hotel'), 0.0003496221391496114)
(('great', 'view'), 0.0003496221391496114)
(('sofitel

In [240]:
#applying frequency filter first to finder which has all stopwords and regex filtered
pmi_finder1 = BigramCollocationFinder.from_words(tokens1)
pmi_finder1.apply_word_filter(lambda w: w in stopwords)
pmi_finder1.apply_freq_filter(5)
pmi_finder1.apply_ngram_filter(lambda w1, w2: len(w1) < 3 and len(w2) < 3)
scored1 = pmi_finder1.score_ngrams(bigram_measures.pmi)
for bscore1 in scored1[:50]:
    print (bscore1)

(('michigan', 'ave'), 11.833305203810799)
(('american', 'girl'), 11.371199450875178)
(('fitness', 'center'), 11.355257907006157)
(('air', 'conditioner'), 10.934682224185885)
(('air', 'conditioning'), 10.934682224185885)
(('ambassador', 'east'), 10.720557418833035)
(('trip', 'advisor'), 10.470735124426096)
(('credit', 'card'), 10.45479358055707)
(('shower', 'curtain'), 10.32234328453342)
(('wireless', 'internet'), 10.253504408205425)
(('upon', 'entering'), 10.242466136752975)
(('looking', 'forward'), 10.148807029538732)
(('toilet', 'paper'), 10.104607225628193)
(('water', 'pressure'), 10.011303505788796)
(('pet', 'friendly'), 9.985308297255854)
(('year', 'old'), 9.934682224185881)
(('weekend', 'getaway'), 9.859562744382089)
(('took', 'forever'), 9.748269099955003)
(('valet', 'parking'), 9.733769530259886)
(('water', 'tower'), 9.73119558659606)
(('mini', 'bar'), 9.622738217871143)
(('internet', 'access'), 9.459955285672851)
(('upon', 'arrival'), 9.347648373445034)
(('wrong', 'order'), 9.

In [241]:
#applying frequency filter first to finder which has all stopwords and regex filtered
pmi_finder2 = BigramCollocationFinder.from_words(tokens2)
pmi_finder2.apply_word_filter(lambda w: w in stopwords)
pmi_finder2.apply_freq_filter(5)
pmi_finder2.apply_ngram_filter(lambda w1, w2: len(w1) < 3 and len(w2) < 3)
scored2 = pmi_finder2.score_ngrams(bigram_measures.pmi)
for bscore2 in scored2[:50]:
    print (bscore2)

(('kitty', 'osheas'), 12.860427459183029)
(('pleasantly', 'surprised'), 11.481915835929296)
(('gold', 'coast'), 11.445389959904183)
(('navy', 'pier'), 11.375000632012785)
(('flat', 'screen'), 11.240907736425502)
(('john', 'hancock'), 11.012430552628075)
(('cant', 'wait'), 10.649860473243368)
(('top', 'notch'), 10.538499364295664)
(('look', 'forward'), 10.33435864751544)
(('grant', 'park'), 10.186871035192883)
(('millenium', 'park'), 10.13172948100042)
(('fitness', 'center'), 10.053072537125423)
(('walking', 'distance'), 9.99167199246128)
(('magnificent', 'mile'), 9.950502803484877)
(('screen', 'tv'), 9.896953335208142)
(('wireless', 'internet'), 9.749396146794282)
(('internet', 'access'), 9.597393053349233)
(('ambassador', 'east'), 9.506077886603286)
(('hard', 'rock'), 9.309631561613063)
(('king', 'size'), 9.237497108262852)
(('water', 'tower'), 9.200312684544512)
(('michigan', 'ave'), 9.133445953589444)
(('highly', 'recommended'), 9.12346186501682)
(('windy', 'city'), 9.11626636361261

In [245]:
#extra credit trigrams

In [266]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [281]:
finder3 = TrigramCollocationFinder.from_words(stopped_file_words1)
scored_trigram3 = finder3.score_ngrams(trigram_measures.raw_freq)
for tscore in scored_trigram3[:50]:
    print (tscore)

(('hard', 'rock', 'hotel'), 0.0009780836804926644)
(('rock', 'hotel', 'chicago'), 0.0006882811084948379)
(('ambassador', 'east', 'hotel'), 0.0005796051439956529)
(('front', 'desk', 'staff'), 0.0005796051439956529)
(('called', 'front', 'desk'), 0.0005433798224959246)
(('chicago', 'water', 'tower'), 0.000470929179496468)
(('sofitel', 'chicago', 'water'), 0.00039847853649701144)
(('stayed', 'hard', 'rock'), 0.0003260278934975548)
(('hotel', 'monaco', 'chicago'), 0.00028980257199782647)
(('ordered', 'room', 'service'), 0.00028980257199782647)
(('front', 'desk', 'clerk'), 0.0002535772504980982)
(('never', 'go', 'back'), 0.00021735192899836986)
(('call', 'front', 'desk'), 0.00018112660749864155)
(('looked', 'like', 'someone'), 0.00018112660749864155)
(('room', 'wasnt', 'ready'), 0.00018112660749864155)
(('stayed', 'hilton', 'chicago'), 0.00018112660749864155)
(('stayed', 'hotel', 'monaco'), 0.00018112660749864155)
(('affinia', 'hotel', 'chicago'), 0.00014490128599891323)
(('called', 'room', 

In [280]:
finder4 = TrigramCollocationFinder.from_words(stopped_file_words2)
scored_trigram4 = finder4.score_ngrams(trigram_measures.raw_freq)
for tscore in scored_trigram4[:50]:
    print (tscore)

(('hard', 'rock', 'hotel'), 0.0017601877533603583)
(('ambassador', 'east', 'hotel'), 0.0014934926392148496)
(('rock', 'hotel', 'chicago'), 0.0010667804565820354)
(('chicago', 'water', 'tower'), 0.000693407296778323)
(('sofitel', 'chicago', 'water'), 0.000693407296778323)
(('stay', 'ambassador', 'east'), 0.00048005120546191593)
(('within', 'walking', 'distance'), 0.00048005120546191593)
(('flat', 'screen', 'tv'), 0.0003733731598037124)
(('hotel', 'monaco', 'chicago'), 0.0003733731598037124)
(('recommend', 'hotel', 'anyone'), 0.0003733731598037124)
(('front', 'desk', 'staff'), 0.0003200341369746106)
(('got', 'great', 'deal'), 0.0003200341369746106)
(('hotel', 'located', 'right'), 0.0003200341369746106)
(('one', 'best', 'hotels'), 0.0003200341369746106)
(('staff', 'friendly', 'helpful'), 0.0003200341369746106)
(('stayed', 'hard', 'rock'), 0.0003200341369746106)
(('got', 'great', 'rate'), 0.00026669511414550886)
(('highly', 'recommend', 'hotel'), 0.00026669511414550886)
(('made', 'feel', '