In [1]:
import nltk

# Preprocessing Text
Text preprocessing is a crucial step in performing sentiment analysis, as it helps to clean and normalize the text data, making it easier to analyze. The preprocessing step involves a series of techniques that help transform raw text data into a form you can use for analysis. Some common text preprocessing techniques include tokenization, stop word removal, stemming, and lemmatization.

### Tokenization
Tokenization is a text preprocessing step in sentiment analysis that involves breaking down the text into individual words or tokens. This is an essential step in analyzing text data as it helps to separate individual words from the raw text, making it easier to analyze and understand. 

### Stop words
Stop word removal is a crucial text preprocessing step in sentiment analysis that involves removing common and irrelevant words that are unlikely to convey much sentiment. Stop words are words that are very common in a language and do not carry much meaning, such as "and," "the," "of," and "it." These words can cause noise and skew the analysis if they are not removed.

### Stemming and Lemmatization
Stemming and lemmatization are techniques used to reduce words to their root forms. Stemming involves removing the suffixes from words, such as "ing" or "ed," to reduce them to their base form. For example, the word "jumping" would be stemmed to "jump." 

Lemmatization, however, involves reducing words to their base form based on their part of speech. For example, the word "jumped" would be lemmatized to "jump," but the word "jumping" would be lemmatized to "jumping" since it is a present participle.

# Bag of Words (BoW) Model
The bag of words model is a technique used in natural language processing (NLP) to represent text data as a set of numerical features. In this model, each document or piece of text is represented as a "bag" of words, with each word in the text represented by a separate feature or dimension in the resulting vector. The value of each feature is determined by the number of times the corresponding word appears in the text.

In [24]:
# import libraries
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [17]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...


True

In [20]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...


True

In [34]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...


True

In [6]:
df = pd.read_csv("https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv")

In [7]:
df

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1
...,...,...
19995,this app is fricken stupid.it froze on the kin...,0
19996,Please add me!!!!! I need neighbors! Ginger101...,1
19997,love it! this game. is awesome. wish it had m...,1
19998,I love love love this app on my side of fashio...,1


In [27]:
# # preprocess text
def pre_process(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # remove punctuations
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [30]:
pre_process("Johar. as I know that Maths???!!")

'johar know math'

In [31]:
df['reviewText'] = df['reviewText'].apply(pre_process)

In [32]:
df

Unnamed: 0,reviewText,Positive
0,one best apps acording bunch people agree bomb...,1
1,pretty good version game free lot different le...,1
2,really cool game bunch level find golden egg s...,1
3,silly game frustrating lot fun definitely reco...,1
4,terrific game pad hr fun grandkids love great ...,1
...,...,...
19995,app fricken stupid.it froze kindle wont allow ...,0
19996,please add need neighbor ginger1016 thanks bun...,1
19997,love game awesome wish free stuff house n't co...,1
19998,love love love app side fashion story fight wo...,1


In [43]:
# NLTK SentimentAnalyzer
"""
The function calls the polarity_scores method of the analyzer object to obtain a 
dictionary of sentiment scores for the text, which includes a score for positive, 
negative, and neutral sentiment. 
"""
analyzer = SentimentIntensityAnalyzer()
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    sentiment = 1 if scores['pos'] > 0 else 0
    return sentiment

In [44]:
df['sentiment'] = df['reviewText'].apply(get_sentiment)

In [45]:
df

Unnamed: 0,reviewText,Positive,sentiment
0,one best apps acording bunch people agree bomb...,1,1
1,pretty good version game free lot different le...,1,1
2,really cool game bunch level find golden egg s...,1,1
3,silly game frustrating lot fun definitely reco...,1,1
4,terrific game pad hr fun grandkids love great ...,1,1
...,...,...,...
19995,app fricken stupid.it froze kindle wont allow ...,0,0
19996,please add need neighbor ginger1016 thanks bun...,1,1
19997,love game awesome wish free stuff house n't co...,1,1
19998,love love love app side fashion story fight wo...,1,1


In [46]:
analyzer.polarity_scores("This app is not good for beginners. Its not user friendly")

{'neg': 0.359, 'neu': 0.641, 'pos': 0.0, 'compound': -0.6167}

In [48]:
# evaluation
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(df['Positive'], df['sentiment']))
print(classification_report(df['Positive'], df['sentiment']))

[[ 1131  3636]
 [  576 14657]]
              precision    recall  f1-score   support

           0       0.66      0.24      0.35      4767
           1       0.80      0.96      0.87     15233

    accuracy                           0.79     20000
   macro avg       0.73      0.60      0.61     20000
weighted avg       0.77      0.79      0.75     20000

