# Sentiment Analysis with NLTK

pip install nltk

In [1]:
import re  # regular expressions
import nltk  # Natural Language Toolkit
# Stop words: common words that don't tell us anything about the polarity of a review.
nltk.download('stopwords')
# WordNet is a lexical database of English. Through its synsets it helps find
# conceptual relationships between words, such as hypernyms, hyponyms,
# synonyms and antonyms.
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer  # stemming alternative, tried further down
# Stemmers use an algorithmic approach of removing prefixes and suffixes,
# and the result may not be an actual dictionary word.
# In most cases lemmatizers will transform the words to actual dictionary words.

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/derrick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/derrick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import numpy as np
import pandas as pd
# The file is read as tab-separated text, hence the explicit delimiter.
df = pd.read_csv('yelp_labelled.csv', delimiter = '\t')
# Other read_csv options worth knowing (neither is passed above):
# engine  - parser engine to use: 'c' or 'python'
# quoting - quoting behaviour, one of QUOTE_MINIMAL (0), QUOTE_ALL (1),
#           QUOTE_NONNUMERIC (2) or QUOTE_NONE (3)

In [3]:
df.head()

Unnamed: 0,review,status
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
review    1000 non-null object
status    1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


# Cleaning the Dataset

In [5]:
def clean_data(review):
    """Return ``review`` lower-cased, with every non-letter replaced by a space.

    Every character outside ``a-z`` / ``A-Z`` (digits, punctuation, whitespace)
    is replaced one-for-one by a space, so runs of such characters produce
    runs of spaces; nothing is collapsed or stripped.
    """
    non_letters = re.compile('[^a-zA-Z]')
    return non_letters.sub(' ', review).lower()

In [6]:
df['review'] = df['review'].apply(clean_data)

In [7]:
df.head()

Unnamed: 0,review,status
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1


# Remove Stop Words

In [8]:
# stopwords.words('english')

In [9]:
def remove_stop_words(review):
    """Drop English stop words from a review (explicit-loop version).

    Args:
        review: Text whose words are separated by whitespace. Words are matched
            against the stop-word list exactly as written (case-sensitive).

    Returns:
        The remaining words, in their original order, joined by single spaces.
        An empty or all-stop-word review yields an empty string.

    NOTE(review): judging by this notebook's own output ("crust is not good"
    becomes "crust good"), the list includes negations such as "not". Removing
    them discards polarity cues; consider keeping them for sentiment work.
    """
    # A set gives constant-time membership checks; the list returned by NLTK
    # would otherwise be scanned for every single word.
    stop_words = set(stopwords.words('english'))
    review_minus_sw = []
    for word in review.split():
        if word not in stop_words:
            review_minus_sw.append(word)
    return ' '.join(review_minus_sw)

In [10]:
def remove_stop_words(review):
    """Drop English stop words from a review (list-comprehension version).

    Args:
        review: Text whose words are separated by whitespace. Words are matched
            against the stop-word list exactly as written (case-sensitive).

    Returns:
        The remaining words, in their original order, joined by single spaces.
        An empty or all-stop-word review yields an empty string.
    """
    # A set gives constant-time membership checks instead of scanning a list.
    stop_words = set(stopwords.words('english'))
    # Build the filtered list directly: list.append() returns None, so a
    # comprehension must not be used merely for its append side effect.
    kept_words = [word for word in review.split() if word not in stop_words]
    return ' '.join(kept_words)

In [11]:
df['review'].apply(remove_stop_words)

0                                        wow loved place
1                                             crust good
2                                    tasty texture nasty
3      stopped late may bank holiday rick steve recom...
4                            selection menu great prices
                             ...                        
995                    think food flavor texture lacking
996                              appetite instantly gone
997                      overall impressed would go back
998    whole experience underwhelming think go ninja ...
999    wasted enough life poured salt wound drawing t...
Name: review, Length: 1000, dtype: object

In [12]:
df['review'] = df['review'].apply(remove_stop_words)

# Lemmatizer

In [13]:
lemmatizer = WordNetLemmatizer()

In [14]:
lemmatizer.lemmatize('dogs')

'dog'

# Stemmer

In [15]:
stem = PorterStemmer()

In [16]:
stem.stem('movie')

'movi'

# Lemmatize the Review

In [17]:
def lematize(review):
    """Lemmatize each whitespace-separated word of ``review`` (explicit-loop version).

    Every word is passed through the notebook-level ``lemmatizer`` and the
    results are joined back together with single spaces.
    """
    lemmas = []
    for token in review.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [18]:
def lematize(review):
    """Lemmatize each whitespace-separated word of ``review`` (one-expression version).

    Every word is passed through the notebook-level ``lemmatizer`` and the
    results are joined back together with single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in review.split())

In [19]:
df['review'].apply(lematize)

0                                        wow loved place
1                                             crust good
2                                    tasty texture nasty
3      stopped late may bank holiday rick steve recom...
4                             selection menu great price
                             ...                        
995                    think food flavor texture lacking
996                              appetite instantly gone
997                      overall impressed would go back
998    whole experience underwhelming think go ninja ...
999    wasted enough life poured salt wound drawing t...
Name: review, Length: 1000, dtype: object

In [20]:
df['review'] = df['review'].apply(lematize)

In [21]:
df.head()

Unnamed: 0,review,status
0,wow loved place,1
1,crust good,0
2,tasty texture nasty,0
3,stopped late may bank holiday rick steve recom...,1
4,selection menu great price,1


# A Bag of Words Model

In [22]:
corpus = list(df['review'])

A bag of words model is a numerical representation of text data that makes it consumable by machine learning models. 

Take all the words in our corpus and create a column with each word.

The rows represent the reviews. If a certain word exists in the review, it’s represented by a 1, and if the word doesn’t exist in the review, it’s represented by a 0. Each word (column) represents a single feature.

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer converts a collection of text documents to a matrix of token counts.
# max_features = maximum number of words we'd like to have in our bag of words model.
cv = CountVectorizer(max_features = 1000)
# No stop-word list is passed to the vectorizer: stop words (common English
# words such as "the", "that" and "a", which tell us nothing about the polarity
# of a review) were already removed from the reviews earlier in the notebook.
X = cv.fit_transform(corpus).toarray()  # dense array: one row per review
y = df['status'].values  # labels taken from the 'status' column

In [24]:
# cv.vocabulary_
# shows each word and its position in the sparse vector

In [25]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() whenever the installed version has it.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
len(feature_names)

1000

In [26]:
# cv.get_feature_names()
# A list of feature names.

In [27]:
X.shape

(1000, 1000)

In [28]:
import pandas as pd
# Export the bag-of-words matrix with one named column per vocabulary word.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() whenever the installed version provides it.
word_columns = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                else cv.get_feature_names())
pd.DataFrame(X, columns=word_columns).to_csv('words.csv', index=False)

In [29]:
corpus[219:220]

['food rich order accordingly']

In [30]:
X[4:6]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [31]:
# cv.stop_words_
# Terms that were ignored because they either:

# occurred in too many documents (max_df)

# occurred in too few documents (min_df)

# were cut off by feature selection (max_features).

## Occurrences to frequencies

Occurrence count has an issue in that longer documents will have a higher average count for a particular word than shorter documents. To avoid these discrepancies, we divide the number of occurrences of each word in a document by the total number of words as a way of normalization. These new features are called **tf**, short for **Term Frequencies**.

Very common words usually tend to have a higher **tf**. However, some of these words might not be so important in determining whether a review is positive or negative. The way we deal with this issue is by downscaling the weights for common words that are less informative than words that occur less in the corpus.
This downscaling is called **tf–idf** for **“Term Frequency times Inverse Document Frequency”**.

Consider a document containing 200 words, wherein the word love appears 5 times. The **tf** for love is then (5 / 200) = 0.025. Assuming we had one million documents and the word love occurs in one thousand of these, the **inverse document frequency (i.e., idf)** is calculated, using a base-10 logarithm, as log10(1000000 / 1000) = 3. The **tf-idf** weight is the product of these quantities: 0.025 * 3 = 0.075.

In [32]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer()
# Keep the raw counts in X and store the re-weighted matrix under its own name:
# assigning the result back to X would apply tf-idf on top of tf-idf whenever
# this cell is re-run. X itself is rebuilt from the corpus by the
# TfidfVectorizer cell that follows.
X_tfidf = tf_transformer.fit_transform(X).toarray()

# TfidfVectorizer: CountVectorizer followed by TfidfTransformer in one step

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer goes from raw text straight to tf-idf features, i.e. it does
# the counting and the tf-idf re-weighting in a single estimator.
tfidfVectorizer = TfidfVectorizer(max_features =1000)
X = tfidfVectorizer.fit_transform(corpus).toarray()  # replaces the earlier X with tf-idf features

In [34]:
# X[0:1]

In [35]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing; the fixed random_state keeps the split repeatable.
X_train, X_test , y_train, y_test = train_test_split(X, y , test_size = 0.20, random_state=101)

In [36]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
# Naive Bayes is a statistical classification technique based on Bayes' theorem
# and a common classifier in sentiment analysis.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier # this is experimental
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# Candidate models, all created with default settings (CatBoost with verbose=0).
# NOTE(review): no random_state is passed to any model, so the accuracies
# reported further down may vary from run to run.
classifiers = [GradientBoostingClassifier(),GaussianNB(),HistGradientBoostingClassifier(),
               RandomForestClassifier(),LogisticRegression(),XGBClassifier(),LGBMClassifier(),
               CatBoostClassifier(verbose=0)]


In [37]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score


In [38]:
# Fit every candidate on the training split and report its accuracy on the held-out split.
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, classifier.predict(X_test))
    # Print only the class name: interpolating the estimator itself dumps its
    # whole parameter list (or a bare object address) into the output.
    print(f'The {type(classifier).__name__} accuracy is {test_accuracy:.3f}')

The GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)  Accuracy  is 0.735
The GaussianNB(priors=None, var_smoothing=1e-09)  Accuracy  is 0.69
The HistGradientBoostingClassifier(l2_regularization=0.0, learning_rate=0.1,
                               loss='auto', max_bins=255, max_depth=None,
                               max_iter=100, max_leaf_nodes=

In [39]:
# Refit logistic regression on its own so it can be inspected and saved below.
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
predictions = classifier.predict(X_test)
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_test, predictions)

array([[81, 21],
       [20, 78]])

In [43]:
classifier.predict(X_test[0:1])

array([1])

In [44]:
classifier.predict(X_test[0:1])[0]

1

In [40]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.80      0.79      0.80       102
           1       0.79      0.80      0.79        98

    accuracy                           0.80       200
   macro avg       0.79      0.80      0.79       200
weighted avg       0.80      0.80      0.80       200



In [41]:
import joblib # pip install joblib
# Joblib is a set of tools to provide lightweight pipelining in Python.
# joblib.dump persists a Python object into one file.
# The fitted vectorizer is saved alongside the model: the classifier was
# trained on this vectorizer's features, so new text must go through it too.
joblib.dump(tfidfVectorizer, 'tfidfVectorizer.pkl')
joblib.dump(classifier, 'classifier.pkl')

['classifier.pkl']

Happy Learning!