# Natural Language Processing

# Text processing 

In [2]:
import pandas as pd
import numpy as np
import scipy as sp

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn import metrics

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Harry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Goal: build a model that will predict whether some review is 5 stars or 1 star

In [3]:
'yelp.csv'

'yelp.csv'

In [4]:
#Read yelp reviews into a DataFrame
yelp = pd.read_csv('data/yelp.csv')

#Create a dataframe that only contains 5 star and 1 star reviews
best_worst = yelp[(yelp.stars == 5)| (yelp.stars ==1)]

#Define X and y
X = best_worst.text
y = best_worst.stars

#Split in training and testing sets
X_train, X_test, y_train, y_test  = train_test_split(X, y , random_state = 1)

# Creating Features with Count Vectorizer

In [5]:
#Use Countvectorizer to create a document-term matrix from X_train and X_test
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()

X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [6]:
#Check out the names of our features
vect.get_feature_names()

['00',
 '000',
 '00a',
 '00am',
 '00pm',
 '01',
 '02',
 '03',
 '03342',
 '04',
 '05',
 '06',
 '07',
 '09',
 '0buxoc0crqjpvkezo3bqog',
 '0l',
 '10',
 '100',
 '1000',
 '1000x',
 '1001',
 '100th',
 '101',
 '102',
 '105',
 '1070',
 '108',
 '10am',
 '10ish',
 '10min',
 '10mins',
 '10minutes',
 '10pm',
 '10th',
 '10x',
 '11',
 '110',
 '1100',
 '111',
 '111th',
 '112',
 '115th',
 '118',
 '11a',
 '11am',
 '11p',
 '11pm',
 '12',
 '120',
 '128i',
 '129',
 '12am',
 '12oz',
 '12pm',
 '12th',
 '13',
 '14',
 '140',
 '147',
 '14lbs',
 '15',
 '150',
 '1500',
 '150mm',
 '15am',
 '15mins',
 '15pm',
 '15th',
 '16',
 '160',
 '165',
 '169',
 '16th',
 '17',
 '17p',
 '18',
 '180',
 '18th',
 '19',
 '1900',
 '1913',
 '1928',
 '1929',
 '1930s',
 '1940',
 '1952',
 '1955',
 '1956',
 '1960',
 '1961',
 '1969',
 '1970',
 '1980',
 '1980s',
 '1987',
 '1990s',
 '1992',
 '1995',
 '1996',
 '1998',
 '1999',
 '19th',
 '1cent',
 '1k',
 '1p',
 '1pm',
 '1st',
 '20',
 '200',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007'

In [7]:
# Intatntiate a model with sag solver
lr = LogisticRegression(solver='sag')

#Fit the model on the document-term matrix data
lr.fit(X_train_dtm, y_train)

#Create a list with predictions on testing data
y_pred = lr.predict(X_test_dtm)

#Print the accuracy score 
print(metrics.accuracy_score(y_test, y_pred))

#Print baseline accuracy
print(best_worst.stars.value_counts(normalize=True))


0.9393346379647749
5    0.816691
1    0.183309
Name: stars, dtype: float64




# N-grams

N-grams are feauter which consist of N consecutive words
```
- Unigram :1-gram - 'my', 'cat','is','awesome'
- Bigram : 2-gram -> 'my cat', 'cat is', 'is awesome'
- Trigram: 3-gram -> 'my cat is', 'cat is awesome'

```


In [8]:
#Intantiate a vectorizer that include bigrams
vect = CountVectorizer(ngram_range=(1,2))

In [9]:
#Create  document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [10]:
#Use Logistic Regression to predict star rating
lr = LogisticRegression(solver='sag', max_iter = 10_000)

#Fit the model on the document-term matrix data
lr.fit(X_train_dtm, y_train)

#Create a list with predictions on testing data
y_pred = lr.predict(X_test_dtm)

#Print the accuracy score 
print(metrics.accuracy_score(y_test, y_pred))

0.9344422700587084


# Stop words 

What are Stop words :
Why ? 

In [12]:
vect = CountVectorizer(stop_words ='english',ngram_range=(1,2), max_features=100_000, min_df = 2)

In [13]:
#Create  document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [14]:
#Use Logistic Regression to predict star rating
lr = LogisticRegression(solver='sag', max_iter = 1000)

#Fit the model on the document-term matrix data
lr.fit(X_train_dtm, y_train)

#Create a list with predictions on testing data
y_pred = lr.predict(X_test_dtm)

#Print the accuracy score 
print(metrics.accuracy_score(y_test, y_pred))

0.9315068493150684


# Stemming and Lemmatization

In [None]:
conda install textblob

In [16]:
from textblob import TextBlob

In [None]:
Learn, Learner, Learning

In [None]:
Vectorizer 
Vectorize
vector
Vectoriser
Vectorise

In [17]:
best_worst.text[0]

'My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\r\n\r\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\r\n\r\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.\r\n\r\nAnyway, I can\'t wait to go back!'

In [27]:
# Parse a review through Textblob
review = TextBlob(best_worst.text[0])

In [28]:
review

TextBlob("My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.

Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I've ever had.  I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.

While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I've ever had.

Anyway, I can't wait to go back!")

In [31]:
from nltk.stem.snowball import SnowballStemmer

In [29]:
review.words

WordList(['My', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'was', 'excellent', 'The', 'weather', 'was', 'perfect', 'which', 'made', 'sitting', 'outside', 'overlooking', 'their', 'grounds', 'an', 'absolute', 'pleasure', 'Our', 'waitress', 'was', 'excellent', 'and', 'our', 'food', 'arrived', 'quickly', 'on', 'the', 'semi-busy', 'Saturday', 'morning', 'It', 'looked', 'like', 'the', 'place', 'fills', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'Do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'Bloody', 'Mary', 'It', 'was', 'phenomenal', 'and', 'simply', 'the', 'best', 'I', "'ve", 'ever', 'had', 'I', "'m", 'pretty', 'sure', 'they', 'only', 'use', 'ingredients', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'It', 'was', 'amazing', 'While', 'EVERYTHING', 'on', 'the', 'menu', 'looks', 'excellent', 'I', 'had', 'the', 'white', 'truffle', 'scrambled', 'eggs', 

In [32]:
stemmer = SnowballStemmer('english')

In [35]:
print([stemmer.stem(word) for word in review.words])

['my', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'was', 'excel', 'the', 'weather', 'was', 'perfect', 'which', 'made', 'sit', 'outsid', 'overlook', 'their', 'ground', 'an', 'absolut', 'pleasur', 'our', 'waitress', 'was', 'excel', 'and', 'our', 'food', 'arriv', 'quick', 'on', 'the', 'semi-busi', 'saturday', 'morn', 'it', 'look', 'like', 'the', 'place', 'fill', 'up', 'pretti', 'quick', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'bloodi', 'mari', 'it', 'was', 'phenomen', 'and', 'simpli', 'the', 'best', 'i', 've', 'ever', 'had', 'i', "'m", 'pretti', 'sure', 'they', 'onli', 'use', 'ingredi', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'it', 'was', 'amaz', 'while', 'everyth', 'on', 'the', 'menu', 'look', 'excel', 'i', 'had', 'the', 'white', 'truffl', 'scrambl', 'egg', 'veget', 'skillet', 'and', 'it', 'was', 'tasti', 'and', 'delic

### Lemmatisation

In [39]:
print([word.lemmatize(pos = 'v') for word in review.words])

['My', 'wife', 'take', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'be', 'excellent', 'The', 'weather', 'be', 'perfect', 'which', 'make', 'sit', 'outside', 'overlook', 'their', 'ground', 'an', 'absolute', 'pleasure', 'Our', 'waitress', 'be', 'excellent', 'and', 'our', 'food', 'arrive', 'quickly', 'on', 'the', 'semi-busy', 'Saturday', 'morning', 'It', 'look', 'like', 'the', 'place', 'fill', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'Do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'Bloody', 'Mary', 'It', 'be', 'phenomenal', 'and', 'simply', 'the', 'best', 'I', "'ve", 'ever', 'have', 'I', "'m", 'pretty', 'sure', 'they', 'only', 'use', 'ingredients', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'It', 'be', 'amaze', 'While', 'EVERYTHING', 'on', 'the', 'menu', 'look', 'excellent', 'I', 'have', 'the', 'white', 'truffle', 'scramble', 'egg', 'vegetable', 'skillet', 'and'

In [40]:
def split_into_lemmas(text):
    
    # pass every review/text through textblob
    words = TextBlob(text).words
    #return a list comprehension that lemmatizes every word 
    return [word.lemmatize(pos='v') for word in words]
    

In [41]:
vect = CountVectorizer(analyzer = split_into_lemmas)

In [53]:
X_train_dtm


<3064x19368 sparse matrix of type '<class 'numpy.int64'>'
	with 243793 stored elements in Compressed Sparse Row format>

In [57]:
#Create document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)


#Intantiate a logistic regression model 
lr = LogisticRegression(max_iter = 10_000, solver='sag')

#Fit our model with training data
lr.fit(X_train_dtm, y_train)

#Create a prediction
y_pred = lr.predict(X_test_dtm)

#measure accuracy score
print(metrics.accuracy_score(y_test, y_pred))

0.9354207436399217
