# Sentiment Analysis

Title: LA2 - Sentiment Analysis
Name: Kuan-Hung Liu\
ASU ID: 1230540209\
File creation date: 1/24/2024


## Library and data import
Using the first 10000 rows.

In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
# Import packages
import json
import spacy
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import metrics
from google.colab import drive
from sklearn.pipeline import Pipeline
from nltk.tokenize import RegexpTokenizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold


In [None]:
# Connect to Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import data: Iutput 1
data = pd.read_csv('/content/drive/MyDrive/CIS_509/restaurant_reviews_az.csv', nrows = 10000 )

In [None]:
data

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,Sentiment
0,IVS7do_HBzroiCiymNdxDg,fdFgZQQYQJeEAshH4lxSfQ,sGy67CpJctjeCWClWqonjA,3,1,1,0,"OK, the hype about having Hatch chili in your ...",1/27/2020 22:59,1
1,QP2pSzSqpJTMWOCuUuyXkQ,JBLWSXBTKFvJYYiM-FnCOQ,3w7NRntdQ9h0KwDsksIt5Q,5,1,1,1,Pandemic pit stop to have an ice cream.... onl...,4/19/2020 5:33,1
2,oK0cGYStgDOusZKz9B1qug,2_9fKnXChUjC5xArfF8BLg,OMnPtRGmbY8qH_wIILfYKA,5,1,0,0,I was lucky enough to go to the soft opening a...,2/29/2020 19:43,1
3,E_ABvFCNVLbfOgRg3Pv1KQ,9MExTQ76GSKhxSWnTS901g,V9XlikTxq0My4gE8LULsjw,5,0,0,0,I've gone to claim Jumpers all over the US and...,3/14/2020 21:47,1
4,Rd222CrrnXkXukR2iWj69g,LPxuausjvDN88uPr-Q4cQA,CA5BOxKRDPGJgdUQ8OUOpw,4,1,0,0,"If you haven't been to Maynard's kitchen, it'...",1/17/2020 20:32,1
...,...,...,...,...,...,...,...,...,...,...
9995,_W0OFIxgwbwRhpEPTKHqHg,e-x6Jaeqts5UU5631UpgXQ,dhj16roKb6Z-TyJ0w6iqjQ,3,2,0,1,Found this tepanyaki restaurant a few years ag...,10/12/2021 2:08,0
9996,Fppk7c4xmBJsUGRCB4ho7A,ZNeup5N06v8liYL3BolGsA,#NAME?,5,0,0,0,This was our first time in Tucson. We decided ...,1/14/2022 23:09,1
9997,fI3v-kTUtE7alT2RtOGzvA,P98oZ7bYaerBmDqdMHU0mQ,dhj16roKb6Z-TyJ0w6iqjQ,1,1,1,0,We had a birthday dinner reservation at 8:00 w...,2/21/2021 18:40,0
9998,CjwMkf9nFa7ZNmbwMezcaQ,Js1FsZ6oP_9tO5tbJN-d2g,vI-2mPYOgfix5LcaqDOy_g,4,8,4,7,Took my parents to dinner here for my Dads bir...,5/19/2021 3:06,1


## Predict sentiment with VaderSentiment
Apply lexicon-based approach with VaderSentiment to predict
sentiment on the above offered Input 1 data.

In [None]:
# Calculate the count in label column
data['Sentiment'].value_counts()

1    7202
0    2798
Name: Sentiment, dtype: int64

In [None]:
!python -m spacy download en_core_web_lg

2024-01-25 21:43:27.677873: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-25 21:43:27.677958: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-25 21:43:27.680001: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m915.4 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully insta

In [None]:
# Text preprocessing
nlp = spacy.load("en_core_web_lg")
def normalize(review, lowercase, remove_stopwords):
    if lowercase:
        review = review.lower() # Turn every words into lowercase
    doc = nlp(review)
    lemmatized = list()
    for token in doc:
        if not remove_stopwords or (remove_stopwords and not token.is_stop):
            lemmatized.append(token.lemma_)
    return " ".join(lemmatized)
data['processed'] = data['text'].apply(normalize, lowercase=True, remove_stopwords=True)

In [None]:
data['processed']

0       ok , hype have hatch chili burger overrate . o...
1       pandemic pit stop ice cream .... plain sundae ...
2       lucky soft opening let tell ... good . beer wi...
3       go claim jumper disappoint location different ...
4         maynard kitchen , time ! hope dinner , sure ...
                              ...                        
9995    find tepanyaki restaurant year ago come fix . ...
9996    time tucson . decide stop real authentic mexic...
9997    birthday dinner reservation 8:00 seat 8:40 par...
9998    take parent dinner dad birthday . walk place f...
9999    sad place sbarro outlet mall . advertise produ...
Name: processed, Length: 10000, dtype: object

In [None]:
# Initialize the SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Define a function to analyze sentiment using Vader
def analyze_vader_sentiment(text):
    return analyzer.polarity_scores(text)['compound']

# Apply the function to each review text
data['vader_sentiment'] = data['text'].apply(analyze_vader_sentiment)

# Display the first few rows of the dataframe to verify
data.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,Sentiment,processed,vader_sentiment
0,IVS7do_HBzroiCiymNdxDg,fdFgZQQYQJeEAshH4lxSfQ,sGy67CpJctjeCWClWqonjA,3,1,1,0,"OK, the hype about having Hatch chili in your ...",1/27/2020 22:59,1,"ok , hype have hatch chili burger overrate . o...",0.986
1,QP2pSzSqpJTMWOCuUuyXkQ,JBLWSXBTKFvJYYiM-FnCOQ,3w7NRntdQ9h0KwDsksIt5Q,5,1,1,1,Pandemic pit stop to have an ice cream.... onl...,4/19/2020 5:33,1,pandemic pit stop ice cream .... plain sundae ...,0.5877
2,oK0cGYStgDOusZKz9B1qug,2_9fKnXChUjC5xArfF8BLg,OMnPtRGmbY8qH_wIILfYKA,5,1,0,0,I was lucky enough to go to the soft opening a...,2/29/2020 19:43,1,lucky soft opening let tell ... good . beer wi...,0.9781
3,E_ABvFCNVLbfOgRg3Pv1KQ,9MExTQ76GSKhxSWnTS901g,V9XlikTxq0My4gE8LULsjw,5,0,0,0,I've gone to claim Jumpers all over the US and...,3/14/2020 21:47,1,go claim jumper disappoint location different ...,0.9327
4,Rd222CrrnXkXukR2iWj69g,LPxuausjvDN88uPr-Q4cQA,CA5BOxKRDPGJgdUQ8OUOpw,4,1,0,0,"If you haven't been to Maynard's kitchen, it'...",1/17/2020 20:32,1,"maynard kitchen , time ! hope dinner , sure ...",0.9823


## Describe the processed data
- Number of tokens
- Unique tokens
- Number of unique customers

In [None]:
# Load the Spacy language model
nlp = spacy.load("en_core_web_lg")

In [None]:
# A default list of stop words set by the Spacy language model
stopwords = nlp.Defaults.stop_words
print(stopwords)

{'whereby', 'while', 'third', '’re', 'because', 'a', 'well', 'anywhere', 'however', 'ours', 'them', 'name', 'from', 'sometimes', 'beyond', 'himself', 'else', 'us', 'themselves', 'last', 'upon', "n't", 'never', 'hereupon', 'as', 'i', 'elsewhere', 'everything', 'under', 'why', 'must', 'about', 'since', 'see', 'already', 'the', 'in', 'eleven', 'to', 'your', 'is', 'everyone', 'my', 'whereas', 'become', "'s", '‘m', 'regarding', 'herein', 'take', 'thence', 'besides', 'otherwise', 'therefore', 'doing', 'for', 'into', 'one', 'nine', 'out', 'sometime', 'noone', 'and', 'each', 'amount', 'yourselves', 'became', 'such', 'moreover', 'nobody', 'may', 'so', '’m', 'it', 'back', 'though', 'used', 'or', 'neither', 'wherein', 'move', 'whose', 'its', 'no', "'re", 'any', 'indeed', 'get', 'which', 'has', 'did', 'thus', 'at', '‘ve', 'full', 'on', 'thereafter', 'being', 'too', 'yet', 'have', 'both', 'these', 'me', 'mostly', 'same', 'rather', 'anyway', 'between', 'here', 'whither', 'around', 'below', 'only', '

In [None]:
# variables to store term statistics
unique_word = set() # using the set-type variable since it does not allow duplicates > able to count the number of unique words
total_number_of_tokens = 0 # in a corpus
unique_user = set() # using the set-type variable since it does not allow duplicates > able to count the number of unique users

In [None]:
for index, row in data.iterrows():
    text = row["processed"]
    doc = nlp(text)

    # statistics regarding words
    num_of_tokens = len(doc)
    total_number_of_tokens += num_of_tokens

    for token in doc:
        if token.is_stop is True:
            pass
        else:
            unique_word.add(str(token).lower())

    # statistics regarding users
    user_id = row["user_id"]
    unique_user.add(user_id.lower())


In [None]:
# statistics
print("Number of unique tokens:", len(unique_word))
print("Total number of tokens in the corpus:", total_number_of_tokens)
print("Number of unique users:", len(unique_user))

Number of unique tokens: 15421
Total number of tokens in the corpus: 516694
Number of unique users: 6830


## Split the data


In [None]:
# Split the data
X = data['processed']  # Review text
y = data['Sentiment']  # Label

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Pre-Prcoessing and Bag of Word Vectorization using Count Vectorizer
token = RegexpTokenizer(r'[a-zA-Z]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize, max_features = 1000)
X_train_vect = cv.fit_transform(X_train)
X_test_vect = cv.fit_transform(X_test)

# Output
print(X_train_vect.shape)
print(X_test_vect.shape)



(8000, 1000)
(2000, 1000)


## Machine Learning Classification
- Naive Bayes
- Support Vector Machines(SVM)


- TF-IDF


- Logistic Regression

### Naive Bayes Classification

In [None]:
#Training the model
MNB = MultinomialNB()
MNB.fit(X_train_vect, y_train)

In [None]:
#Evaluate the performance of the model
predicted = MNB.predict(X_test_vect)
performance_MNB = metrics.classification_report(y_test,predicted, target_names= ['0', '1'])
print(performance_MNB)

              precision    recall  f1-score   support

           0       0.37      0.35      0.36       552
           1       0.76      0.77      0.76      1448

    accuracy                           0.65      2000
   macro avg       0.56      0.56      0.56      2000
weighted avg       0.65      0.65      0.65      2000



### Support Vector Machines (SVM) classification

In [None]:
#Training the model
clf = svm.SVC()
clf.fit(X_train_vect, y_train)

In [None]:
predicted = clf.predict(X_test_vect)
performance_SVM = metrics.classification_report(y_test,predicted, target_names= ['0', '1'])
print(performance_SVM)

              precision    recall  f1-score   support

           0       0.35      0.45      0.39       552
           1       0.77      0.69      0.72      1448

    accuracy                           0.62      2000
   macro avg       0.56      0.57      0.56      2000
weighted avg       0.65      0.62      0.63      2000



### Logistic Regression Classification

In [None]:
#Training the model
logreg_model_1 = LogisticRegression()
logreg_model_1.fit(X_train_vect,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
predicted = logreg_model_1.predict(X_test_vect)
performance_LR = metrics.classification_report(y_test,predicted, target_names= ['0', '1'])
print(performance_LR)

              precision    recall  f1-score   support

           0       0.39      0.37      0.38       552
           1       0.76      0.77      0.77      1448

    accuracy                           0.66      2000
   macro avg       0.58      0.57      0.57      2000
weighted avg       0.66      0.66      0.66      2000



## TF-IDF

In [None]:
token = RegexpTokenizer(r'[a-zA-Z]+')
vectorizer = TfidfVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize, max_features = 1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf= vectorizer.transform(X_test)

print(X_train_tfidf.shape)
print(X_test_tfidf.shape)



(8000, 1000)
(2000, 1000)


### Logistic Regression Classification

In [None]:
#Training the model
logreg_model_2 = LogisticRegression()
logreg_model_2.fit(X_train_tfidf, y_train)

In [None]:
predicted = logreg_model_2.predict(X_test_tfidf)
performance_LR = metrics.classification_report(y_test,predicted, target_names= ['0', '1'])
print(performance_LR)

              precision    recall  f1-score   support

           0       0.86      0.78      0.82       552
           1       0.92      0.95      0.94      1448

    accuracy                           0.90      2000
   macro avg       0.89      0.87      0.88      2000
weighted avg       0.90      0.90      0.90      2000



## Comparison with VaderSentiment

In [None]:
sentiment = SentimentIntensityAnalyzer()
v_predicted = []
for text in X_test:
  sent= sentiment.polarity_scores(text)
  if sent['compound']>0:
    v_predicted.append(1)
  else:
    v_predicted.append(0)
v_performance = metrics.classification_report(y_test,v_predicted, target_names= ['0', '1'])
print(v_performance)

              precision    recall  f1-score   support

           0       0.88      0.39      0.54       552
           1       0.81      0.98      0.89      1448

    accuracy                           0.82      2000
   macro avg       0.84      0.68      0.71      2000
weighted avg       0.83      0.82      0.79      2000



## Import input 2 data

In [None]:
# Define the new reviews
new_reviews = [
    "The service is good, but location is hard to find. Sanitation is not very good with old facilities. Food served tasted extremely fishy, making us difficult to even finish it.",
    "The restaurant is definitely one of my favorites and of my family as well. I was especially impressed with my visit a few days ago. The place is clean, and you just need to wait for fewer than 10 minutes to get food served. And of course, the food is absolutely delicious!",
    "I appreciated the friendly staff. The food was good, not amazing. The service was not prompt but almost acceptable. A reliable spot for a regular meal, but nothing extraordinary."
]

### Logistic Regression on Input 2
Apply the trained logistic regression model to predict sentiment on the three customer reviews listed in the above Input 2.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import RegexpTokenizer

# Assuming you have new_reviews as your new data
new_reviews = [
    "The service is good, but location is hard to find. Sanitation is not very good with old facilities. Food served tasted extremely fishy, making us difficult to even finish it.",
    "The restaurant is definitely one of my favorites and of my family as well. I was especially impressed with my visit a few days ago. The place is clean, and you just need to wait for fewer than 10 minutes to get food served. And of course, the food is absolutely delicious!",
    "I appreciated the friendly staff. The food was good, not amazing. The service was not prompt but almost acceptable. A reliable spot for a regular meal, but nothing extraordinary."
]

# Create a tokenizer
token = RegexpTokenizer(r'[a-zA-Z]+')

# Create and fit the TF-IDF vectorizer on the training data
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), tokenizer=token.tokenize)
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform new_reviews using the fitted TF-IDF vectorizer
X_new_tfidf = vectorizer.transform(new_reviews)

# Predict sentiment using the trained logistic regression model
predicted_new = logreg_model_2.predict(X_new_tfidf)

print(predicted_new)




[1 1 1]


## Comment on the classification
Classify these reviews in Input 2 into positive, neutral, or negative sentiments.

In my opinion, the sentiments of the reviews are negative, positive and neutral, respectively. However, the results from the models are all positive.

## Acknowledgment

I asked the most of my code to ChatGPT and I also discussed with my classmates when I do not what to do.

## Gemerate HTML file

In [None]:
!pip install -q jupyter
!pip install -q nbconvert

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.4/123.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.5/93.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [106]:
!jupyter nbconvert "/content/drive/MyDrive/Colab Notebooks/LA2_Liu_KuanHung.ipynb" --to html

[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/LA2_Liu_KuanHung.ipynb to html
[NbConvertApp] Writing 696888 bytes to /content/drive/MyDrive/Colab Notebooks/LA2_Liu_KuanHung.html
