In [1]:
from collections import Counter,OrderedDict
import re
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

# Part A

A. Using the **McDonalds Yelp Review CSV file**, **process the reviews**.
This means you should think briefly about:
* what stopwords to remove (should you add any custom stopwords to the set? Remove any stopwords?)
* what regex cleaning you may need to perform (for example, are there different ways of saying `hamburger` that you need to account for?)
* stemming/lemmatization (explain in your notebook why you used stemming versus lemmatization). 

Next, **count-vectorize the dataset**. Use the **`sklearn.feature_extraction.text.CountVectorizer`** examples from `Linear Algebra, Distance and Similarity (Completed).ipynb` and `Text Preprocessing Techniques (Completed).ipynb` (read the last section, `Vectorization Techniques`).

I do not want redundant features - for instance, I do not want `hamburgers` and `hamburger` to be two distinct columns in your document-term matrix. Therefore, I'll be taking a look to make sure you've properly performed your cleaning, stopword removal, etc. to reduce the number of dimensions in your dataset. 

In [2]:
reviews=pd.read_csv('mcdonalds-yelp-negative-reviews.csv',encoding='latin1',delimiter=',')

In [3]:
reviews.head()

Unnamed: 0,_unit_id,city,review
0,679455653,Atlanta,"I'm not a huge mcds lover, but I've been to be..."
1,679455654,Atlanta,Terrible customer service. I came in at 9:30pm...
2,679455655,Atlanta,"First they ""lost"" my order, actually they gave..."
3,679455656,Atlanta,I see I'm not the only one giving 1 star. Only...
4,679455657,Atlanta,"Well, it's McDonald's, so you know what the fo..."


In [4]:
words = Counter()
for line in reviews['review']:
    for word in line.split(" "):
        words[word] += 1

In [5]:
y=OrderedDict(words.most_common())
y

OrderedDict([('the', 6208),
             ('I', 4330),
             ('and', 4070),
             ('to', 3953),
             ('a', 3426),
             ('of', 1990),
             ('is', 1865),
             ('was', 1771),
             ('in', 1708),
             ('for', 1617),
             ('my', 1412),
             ('this', 1375),
             ('it', 1177),
             ('that', 1160),
             ('they', 1137),
             ('you', 1048),
             ('at', 1011),
             ('have', 937),
             ('on', 873),
             ('not', 860),
             ('but', 830),
             ('with', 795),
             ('The', 743),
             ('me', 705),
             ('are', 700),
             ('get', 649),
             ('be', 628),
             ('so', 607),
             ('order', 602),
             ('food', 589),
             ('one', 588),
             ("McDonald's", 585),
             ('had', 551),
             ('just', 532),
             ('up', 499),
             ('or', 486),
            

In [6]:
reviews['review']=reviews['review'].str.replace(r'\'','')
reviews['review']=reviews['review'].str.replace(r'[^\w\s]',' ')
reviews['review']=reviews['review'].str.replace(r'\d+','')
reviews['review']=reviews['review'].str.replace(r'_','') 

Since an aspect of this assignment is to eliminate redundant features, I chosen stemming instead of lemmatization. Stemming generally provides us with higher recall & coverage. With our end goal of count-vectorizing the dataset and calculating the most similar pair of reviews. 

In [7]:
stemmer=nltk.stem.porter.PorterStemmer()
stopwords=list(stopwords.words('english'))

In [8]:
reviews['review_stem']=reviews['review'].apply(lambda x: " ".join([stemmer.stem(i) for i in x.split(" ")]))

In [9]:
stopwords.append('mcd')
stopwords.append('mcds')
stopwords.append('mcdonald')
stopwords.append('mcdonalds')
stopwords.append('restaurant')
stopwords.append('order')
stopwords.remove('don\'t')
stopwords.remove('shouldn\'t')
stopwords.remove('didn\'t')
stopwords.remove('no')
stopwords.remove('not')

In [10]:
reviews.head()

Unnamed: 0,_unit_id,city,review,review_stem
0,679455653,Atlanta,Im not a huge mcds lover but Ive been to bett...,Im not a huge mcd lover but ive been to bette...
1,679455654,Atlanta,Terrible customer service I came in at pm an...,terribl custom servic I came in at pm and st...
2,679455655,Atlanta,First they lost my order actually they gave...,first they lost my order actual they gave i...
3,679455656,Atlanta,I see Im not the only one giving star Only b...,I see Im not the onli one give star onli bec...
4,679455657,Atlanta,Well its McDonalds so you know what the food...,well it mcdonald so you know what the food i...


In [11]:
vectorizer=CountVectorizer(stop_words=stopwords,binary=True)
X=vectorizer.fit_transform(reviews['review_stem'])

In [12]:
df=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
df.head()

Unnamed: 0,aaaaaaaahhhhhhhhhhh,abbrevi,abc,abil,abl,abod,abour,abov,abram,abras,...,yuppi,zak,zax,zee,zeke,zero,zesti,zip,zombi,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df_sim=pd.DataFrame(cosine_similarity(df))
df_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1515,1516,1517,1518,1519,1520,1521,1522,1523,1524
0,1.000000,0.123443,0.051215,0.189737,0.121268,0.125109,0.131306,0.164833,0.144463,0.144463,...,0.096077,0.023408,0.175038,0.000000,0.073030,0.066667,0.166410,0.141421,0.104641,0.084017
1,0.123443,1.000000,0.158052,0.097590,0.149696,0.064349,0.202610,0.084781,0.167183,0.130032,...,0.049417,0.144479,0.112537,0.000000,0.000000,0.000000,0.085592,0.136386,0.057666,0.129641
2,0.051215,0.158052,1.000000,0.121466,0.093161,0.080093,0.050436,0.175872,0.123311,0.077069,...,0.020502,0.119885,0.168085,0.058747,0.023376,0.085358,0.071022,0.022634,0.047850,0.161359
3,0.189737,0.097590,0.121466,1.000000,0.076696,0.065938,0.207614,0.173749,0.038069,0.114208,...,0.101274,0.000000,0.184506,0.072548,0.173205,0.000000,0.087706,0.111803,0.070908,0.177123
4,0.121268,0.149696,0.093161,0.076696,1.000000,0.075858,0.175156,0.149917,0.175187,0.175187,...,0.116510,0.056773,0.159199,0.000000,0.088561,0.080845,0.100901,0.150061,0.145024,0.101885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,0.066667,0.000000,0.085358,0.000000,0.080845,0.000000,0.000000,0.137361,0.120386,0.080257,...,0.000000,0.039014,0.145865,0.000000,0.000000,1.000000,0.092450,0.000000,0.049829,0.046676
1521,0.166410,0.085592,0.071022,0.087706,0.100901,0.057831,0.109254,0.114291,0.200334,0.100167,...,0.000000,0.032461,0.080911,0.000000,0.101274,0.092450,1.000000,0.049029,0.103651,0.038837
1522,0.141421,0.136386,0.022634,0.111803,0.150061,0.036860,0.185695,0.072846,0.170251,0.127688,...,0.056614,0.082761,0.103142,0.000000,0.032275,0.000000,0.049029,1.000000,0.105703,0.099015
1523,0.104641,0.057666,0.047850,0.070908,0.145024,0.046755,0.137400,0.092401,0.116975,0.143969,...,0.047874,0.096229,0.087220,0.017147,0.054585,0.049829,0.103651,0.105703,1.000000,0.115128


In [14]:
maxvalue=0
for i in df_sim.columns:
    for j in df_sim.index:
        if i==j:
            continue
        else:
            if df_sim.iloc[i,j]>maxvalue:
                maxvalue=df_sim.iloc[i,j]
                loc_i=i
                loc_j=j
print(reviews.loc[loc_i,'review'])
print('')
print(reviews.loc[loc_j,'review'])

A bit of history   this McDonalds is located on the first McDonalds I ever visited  back in the s  Back then  it was a special treat  now  not so much  The menu is basic  but the service here could be better  Sometimes OK  sometimes downright dreadful  They need some additional training 

A bit of history   this McDonalds is located on the first McDonalds I ever visited  back in the s  Back then  it was a special treat  now  not so much  The menu is basic  but the service here could be better  Sometimes OK  sometimes downright dreadful  They need some additional training 


The reviews above are the most similar (they are the same actually).

# Part B

B. **Stopwords, Stemming, Lemmatization Practice**

Using the `tale-of-two-cities.txt` file from Week 1:
* Count-vectorize the corpus. Treat each sentence as a document.

How many features (dimensions) do you get when you:
* Perform **stemming** and then **count-vectorization**.
* Perform **lemmatization** and then **count-vectorization**.
* Perform **lemmatization**, remove **stopwords**, **remove punctuation**, and then perform **count-vectorization**?

In [15]:
tales=open('tale-of-two-cities.txt','r')
prep=tales.readlines()
prep1=''.join(prep)
prep2=prep1.split('  ')
tale1=pd.DataFrame(prep2)
pos=range(3493,3507)
tale1.drop(tale1.index[pos],inplace=True)
tale1.drop(tale1.index[0],inplace=True)
tale1.rename(columns={0:'sentence'}, inplace=True)
tale=tale1.reset_index(drop=True)
tale['sentence']=tale['sentence'].str.replace('\n',' ')

In [16]:
t1=open('tale-of-two-cities.txt','r')
prep1=t1.readlines()
prep2=''.join(prep1)

In [17]:
prep2



In [18]:
data=sent_tokenize(prep2)
tale=pd.DataFrame(data,columns=['sentence'])
tale['sentence']=tale['sentence'].str.replace('\n',' ')

In [19]:
tale.head()

Unnamed: 0,sentence
0,"IT WAS the best of times, it was the worst o..."
1,There were a king with a large jaw and a queen...
2,In both countries it was clearer than crystal ...
3,It was the year of Our Lord one thousand seven...
4,Spiritual revelations were conceded to England...


## Stemming Only

In [20]:
tale['sent_stem']=tale['sentence'].apply(lambda x: " ".join([stemmer.stem(i) for i in x.split(" ")]))

In [21]:
vectorizer=CountVectorizer(token_pattern=r"(?u)\b\w\w+\b|!|\?|\"|\'")
X=vectorizer.fit_transform(tale['sent_stem'])

In [22]:
df=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
df

Unnamed: 0,!,"""",',1757,1767,1792,21,?,aback,abandon,...,your,yourn,yours,yourself,yourselv,yourselves,youth,youthfulness,youths,zealou
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7759,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7760,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7761,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7762,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
print(f'Number of features:',len(df.columns))

Number of features: 8754


## Lemmatization Only

In [24]:
lemmatizer=WordNetLemmatizer()
tale['sent_lem']=tale['sentence'].apply(lambda x: " ".join([lemmatizer.lemmatize(i) for i in x.split(" ")]))

In [25]:
X=vectorizer.fit_transform(tale['sent_lem'])

In [26]:
df=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
df

Unnamed: 0,!,"""",',1757,1767,1792,21,?,aback,abandon,...,your,yourn,yours,yourself,yourselves,youth,youthful,youthfulness,youths,zealous
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7759,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7760,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7761,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7762,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
print(f'Number of features:',len(df.columns))

Number of features: 9380


## Lemmatization (No Stopwords or Punctuation)

In [28]:
tale['clean_sent']=tale['sentence'].str.replace(r'[^\w\s]',' ')
tale['clean_sent_lem']=tale['clean_sent'].apply(lambda x: " ".join([lemmatizer.lemmatize(i) for i in x.split(" ")]))

In [29]:
vectorizer=CountVectorizer(stop_words='english',binary=True)
X=vectorizer.fit_transform(tale['clean_sent_lem'])

In [30]:
df=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
df

Unnamed: 0,1757,1767,1792,21,aback,abandon,abandoned,abandoning,abandonment,abashed,...,yonder,yore,young,younger,youngest,yourn,youth,youthful,youthfulness,zealous
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7759,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7760,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7761,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7762,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
print(f'Number of features:',len(df.columns))

Number of features: 8558
