# NATURAL LANGUAGE PROCESSING

## Data Preparation

In [58]:
import numpy as np
import pandas as pd
import nltk 

df = pd.read_csv('yelp.csv')
df = df[['stars','text', 'cool','useful','funny']]
df.head()

Unnamed: 0,stars,text,cool,useful,funny
0,5,My wife took me here on my birthday for breakf...,2,5,0
1,5,I have no idea why some people give bad review...,0,0,0
2,4,love the gyro plate. Rice is so good and I als...,0,1,0
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",1,2,0
4,5,General Manager Scott Petello is a good egg!!!...,0,0,0


### Punctuation, Text Length, Tokenization

In [59]:
# remove punctuation, add a clolumn with text length, , make lower cases

df['text'] = df['text'].str.lower() 
df['text lenght'] = (df['text'].str.split('[\W_]+'))
df['text lenght'] = df['text lenght'].str.len()

df.head()

# NOTE TO MYSELF
# '\w' is a special character that will match any alphanumeric A-z, a-z, 0-9, along with underscores;
# '+' means that the previous character in the regex can appear as many times as we want
# This means that '\w+'' will match arbitrary sequences of alphanumeric characters and underscores.

Unnamed: 0,stars,text,cool,useful,funny,text lenght
0,5,my wife took me here on my birthday for breakf...,2,5,0,161
1,5,i have no idea why some people give bad review...,0,0,0,266
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79
4,5,general manager scott petello is a good egg!!!...,0,0,0,89


In [60]:
# prepare a column with the text splited (without puntuation), 
df['text_split'] = (df['text'].str.split('[\W_]+'))
df['text_split'] = df['text_split'].astype(str)
df.head()

Unnamed: 0,stars,text,cool,useful,funny,text lenght,text_split
0,5,my wife took me here on my birthday for breakf...,2,5,0,161,"['my', 'wife', 'took', 'me', 'here', 'on', 'my..."
1,5,i have no idea why some people give bad review...,0,0,0,266,"['i', 'have', 'no', 'idea', 'why', 'some', 'pe..."
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16,"['love', 'the', 'gyro', 'plate', 'rice', 'is',..."
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79,"['rosie', 'dakota', 'and', 'i', 'love', 'chapa..."
4,5,general manager scott petello is a good egg!!!...,0,0,0,89,"['general', 'manager', 'scott', 'petello', 'is..."


In [61]:
# this does exactly the same as above - but it keeps the punctuation and doe snot have the ''
from nltk.tokenize import word_tokenize

df['tokenized'] = df['text'].apply(nltk.word_tokenize)
df.head()

Unnamed: 0,stars,text,cool,useful,funny,text lenght,text_split,tokenized
0,5,my wife took me here on my birthday for breakf...,2,5,0,161,"['my', 'wife', 'took', 'me', 'here', 'on', 'my...","[my, wife, took, me, here, on, my, birthday, f..."
1,5,i have no idea why some people give bad review...,0,0,0,266,"['i', 'have', 'no', 'idea', 'why', 'some', 'pe...","[i, have, no, idea, why, some, people, give, b..."
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16,"['love', 'the', 'gyro', 'plate', 'rice', 'is',...","[love, the, gyro, plate, ., rice, is, so, good..."
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79,"['rosie', 'dakota', 'and', 'i', 'love', 'chapa...","[rosie, ,, dakota, ,, and, i, love, chaparral,..."
4,5,general manager scott petello is a good egg!!!...,0,0,0,89,"['general', 'manager', 'scott', 'petello', 'is...","[general, manager, scott, petello, is, a, good..."


In [62]:
# QUESTION

# Does the format difference means something?
    # 'text_split ' ['my', 'wife', 'took', 'me', 'here']
    # 'tokenized' [my, wife, took, me, here]

# WHy 1/ works and not 2/ (TypeError: expected string or bytes-like object) ?
    # 1/ df['tokenized'] = df['text'].apply(nltk.word_tokenize)
    # 2/ tk = (nltk.word_tokenize(df['text']))   >> https://pythonhealthcare.org/2018/12/14/101-pre-processing-data-tokenization-stemming-and-removal-of-stop-words/

## Remove Stop Words

### Simple Example with: NLTK

In [63]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."

stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

filtered_sentence = [w for w in word_tokens if not w in stop_words]

filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


### Remove Stop Words on df with NLTK

In [67]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in stop_words]))
    return removed_stop_words

df['cleantext'] = df['text'].str.lower()
df['cleantext'] = df['cleantext'].str.replace('[\W_]+',' ')
df['cleantext'] = remove_stop_words(df['cleantext'])
df.head()

Unnamed: 0,stars,text,cool,useful,funny,text lenght,text_split,tokenized,cleantext
0,5,my wife took me here on my birthday for breakf...,2,5,0,161,"['my', 'wife', 'took', 'me', 'here', 'on', 'my...","[my, wife, took, me, here, on, my, birthday, f...",wife took birthday breakfast excellent weather...
1,5,i have no idea why some people give bad review...,0,0,0,266,"['i', 'have', 'no', 'idea', 'why', 'some', 'pe...","[i, have, no, idea, why, some, people, give, b...",idea people give bad reviews place goes show p...
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16,"['love', 'the', 'gyro', 'plate', 'rice', 'is',...","[love, the, gyro, plate, ., rice, is, so, good...",love gyro plate rice good also dig candy selec...
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79,"['rosie', 'dakota', 'and', 'i', 'love', 'chapa...","[rosie, ,, dakota, ,, and, i, love, chaparral,...",rosie dakota love chaparral dog park convenien...
4,5,general manager scott petello is a good egg!!!...,0,0,0,89,"['general', 'manager', 'scott', 'petello', 'is...","[general, manager, scott, petello, is, a, good...",general manager scott petello good egg go deta...


In [34]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

word_tokens = df['tokenized'].astype(str).tolist()

filtered_sentence = [w for w in word_tokens if not w in stop_words]

filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

filtered_sentence[:1]

# QUESTION
# Why do I get with weird format? [\'my\', \'wife\', \'took\', \'me\', \'here\', \'on\', \'my\', \'birthday\']?
# Why the stopwords have not been removed?

['[\'my\', \'wife\', \'took\', \'me\', \'here\', \'on\', \'my\', \'birthday\', \'for\', \'breakfast\', \'and\', \'it\', \'was\', \'excellent\', \'.\', \'the\', \'weather\', \'was\', \'perfect\', \'which\', \'made\', \'sitting\', \'outside\', \'overlooking\', \'their\', \'grounds\', \'an\', \'absolute\', \'pleasure\', \'.\', \'our\', \'waitress\', \'was\', \'excellent\', \'and\', \'our\', \'food\', \'arrived\', \'quickly\', \'on\', \'the\', \'semi-busy\', \'saturday\', \'morning\', \'.\', \'it\', \'looked\', \'like\', \'the\', \'place\', \'fills\', \'up\', \'pretty\', \'quickly\', \'so\', \'the\', \'earlier\', \'you\', \'get\', \'here\', \'the\', \'better\', \'.\', \'do\', \'yourself\', \'a\', \'favor\', \'and\', \'get\', \'their\', \'bloody\', \'mary\', \'.\', \'it\', \'was\', \'phenomenal\', \'and\', \'simply\', \'the\', \'best\', \'i\', "\'ve", \'ever\', \'had\', \'.\', \'i\', "\'m", \'pretty\', \'sure\', \'they\', \'only\', \'use\', \'ingredients\', \'from\', \'their\', \'garden\', 

In [25]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

word_tokens = df['text_split'].tolist()

filtered_sentence = [w for w in word_tokens if not w in stop_words]

filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

filtered_sentence[:1]

# QUESTION
# Why the stopwords have not been removed?

["['my', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'was', 'excellent', 'the', 'weather', 'was', 'perfect', 'which', 'made', 'sitting', 'outside', 'overlooking', 'their', 'grounds', 'an', 'absolute', 'pleasure', 'our', 'waitress', 'was', 'excellent', 'and', 'our', 'food', 'arrived', 'quickly', 'on', 'the', 'semi', 'busy', 'saturday', 'morning', 'it', 'looked', 'like', 'the', 'place', 'fills', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'bloody', 'mary', 'it', 'was', 'phenomenal', 'and', 'simply', 'the', 'best', 'i', 've', 'ever', 'had', 'i', 'm', 'pretty', 'sure', 'they', 'only', 'use', 'ingredients', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'it', 'was', 'amazing', 'while', 'everything', 'on', 'the', 'menu', 'looks', 'excellent', 'i', 'had', 'the', 'white', 'truffle', 'scrambled', 'eggs', 'veget

## Stemming and Lemmanization

Stemming reduces related words to a common stem.\
It is an optional process step, and it is useful to test accuracy with and without stemming. 

In [43]:
from nltk.stem import PorterStemmer
stemming = PorterStemmer()
my_list = ['frightening', 'frightened', 'frightens']

# Using a Python list comprehension method to apply to all words in my_list
print ([stemming.stem(word) for word in my_list])


['frighten', 'frighten', 'frighten']


In [68]:
import nltk
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

word_data = "player learning a play was playing very well"
# First Word tokenization
nltk_tokens = nltk.word_tokenize(word_data)
#Next find the roots of the word
for w in nltk_tokens:
       print("Actual: %s  Stem: %s"  % (w,porter_stemmer.stem(w)))

Actual: player  Stem: player
Actual: learning  Stem: learn
Actual: a  Stem: a
Actual: play  Stem: play
Actual: was  Stem: wa
Actual: playing  Stem: play
Actual: very  Stem: veri
Actual: well  Stem: well


In [69]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

word_data = "player learning a play was playing very well"
nltk_tokens = nltk.word_tokenize(word_data)
for w in nltk_tokens:
       print("Actual: %s  Lemma: %s"  % (w,wordnet_lemmatizer.lemmatize(w)))

Actual: player  Lemma: player
Actual: learning  Lemma: learning
Actual: a  Lemma: a
Actual: play  Lemma: play
Actual: was  Lemma: wa
Actual: playing  Lemma: playing
Actual: very  Lemma: very
Actual: well  Lemma: well


In [70]:
# Stemming
from nltk.stem.porter import PorterStemmer

def get_stemmed_text(corpus):
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

df['stemmedtext'] = get_stemmed_text(df['cleantext'])
df.head()

Unnamed: 0,stars,text,cool,useful,funny,text lenght,text_split,tokenized,cleantext,stemmedtext
0,5,my wife took me here on my birthday for breakf...,2,5,0,161,"['my', 'wife', 'took', 'me', 'here', 'on', 'my...","[my, wife, took, me, here, on, my, birthday, f...",wife took birthday breakfast excellent weather...,wife took birthday breakfast excel weather per...
1,5,i have no idea why some people give bad review...,0,0,0,266,"['i', 'have', 'no', 'idea', 'why', 'some', 'pe...","[i, have, no, idea, why, some, people, give, b...",idea people give bad reviews place goes show p...,idea peopl give bad review place goe show plea...
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16,"['love', 'the', 'gyro', 'plate', 'rice', 'is',...","[love, the, gyro, plate, ., rice, is, so, good...",love gyro plate rice good also dig candy selec...,love gyro plate rice good also dig candi select
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79,"['rosie', 'dakota', 'and', 'i', 'love', 'chapa...","[rosie, ,, dakota, ,, and, i, love, chaparral,...",rosie dakota love chaparral dog park convenien...,rosi dakota love chaparr dog park conveni surr...
4,5,general manager scott petello is a good egg!!!...,0,0,0,89,"['general', 'manager', 'scott', 'petello', 'is...","[general, manager, scott, petello, is, a, good...",general manager scott petello good egg go deta...,gener manag scott petello good egg go detail l...


In [72]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
def get_lemmatized_text(corpus):
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

df['lemmatext'] = get_lemmatized_text(df['cleantext'])
df.head()

Unnamed: 0,stars,text,cool,useful,funny,text lenght,text_split,tokenized,cleantext,stemmedtext,lemmatext
0,5,my wife took me here on my birthday for breakf...,2,5,0,161,"['my', 'wife', 'took', 'me', 'here', 'on', 'my...","[my, wife, took, me, here, on, my, birthday, f...",wife took birthday breakfast excellent weather...,wife took birthday breakfast excel weather per...,wife took birthday breakfast excellent weather...
1,5,i have no idea why some people give bad review...,0,0,0,266,"['i', 'have', 'no', 'idea', 'why', 'some', 'pe...","[i, have, no, idea, why, some, people, give, b...",idea people give bad reviews place goes show p...,idea peopl give bad review place goe show plea...,idea people give bad review place go show plea...
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16,"['love', 'the', 'gyro', 'plate', 'rice', 'is',...","[love, the, gyro, plate, ., rice, is, so, good...",love gyro plate rice good also dig candy selec...,love gyro plate rice good also dig candi select,love gyro plate rice good also dig candy selec...
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79,"['rosie', 'dakota', 'and', 'i', 'love', 'chapa...","[rosie, ,, dakota, ,, and, i, love, chaparral,...",rosie dakota love chaparral dog park convenien...,rosi dakota love chaparr dog park conveni surr...,rosie dakota love chaparral dog park convenien...
4,5,general manager scott petello is a good egg!!!...,0,0,0,89,"['general', 'manager', 'scott', 'petello', 'is...","[general, manager, scott, petello, is, a, good...",general manager scott petello good egg go deta...,gener manag scott petello good egg go detail l...,general manager scott petello good egg go deta...


### Word Count: Python style

In [74]:
# word count, make lower cases at the same time
word_count = pd.Series(' '.join(df['lemmatext']).lower().split()).value_counts()
word_count = pd.DataFrame(word_count, columns =['freq'])
wordcount = word_count.reset_index(inplace=True)
word_count.head()

Unnamed: 0,index,freq
0,place,7397
1,good,6857
2,food,6357
3,great,5128
4,like,5109
