## Sentimental Analysis on Ultra SkinCare Reviews 
##### Link to the dataset: [Link](https://https://www.kaggle.com/datasets/nenamalikah/nlp-ulta-skincare-reviews)

---



In [None]:
#importing the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data = pd.read_csv('/content/drive/MyDrive/Datasets/UltraSkincareReview/Ulta Skincare Reviews.csv')
data = pd.DataFrame(data)
data.head()

Unnamed: 0,Review_Title,Review_Text,Verified_Buyer,Review_Date,Review_Location,Review_Upvotes,Review_Downvotes,Product,Brand,Scrape_Date
0,Perfect,Love using this on my face while in the shower...,No,15 days ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
1,You need this,Even better than the daily microfoliant. I'm o...,No,27 days ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
2,Clean skin,Enjoy this product so much ! I look forward to...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
3,Love This Stuff!,I've never tried anything like this before and...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
4,This exfoliates very nicely and,This exfoliates very nicely and gives a very s...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23


In [None]:
#checking the null values in the dataset
data.isna().sum()

Review_Title        0
Review_Text         2
Verified_Buyer      0
Review_Date         0
Review_Location     1
Review_Upvotes      0
Review_Downvotes    0
Product             0
Brand               0
Scrape_Date         0
dtype: int64

In [None]:
data.shape

(4150, 10)

In [None]:
#dropping the rows with null values to get the purest data
data = data.dropna()
data = pd.DataFrame(data)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import string
import spacy
import re
#from nltk.tokenize import contractions

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
#now we define a function that performs standard NLP preprocessing tasks on the 'Review_text'
#we define a function to expand the contractions using the spacy library
nlp = spacy.load('en_core_web_sm')
def expand(text):
  doc = nlp(text)
  exp_text = ' '.join([token.text for token in doc])
  return exp_text

def processed_text(text):

  #expand contraction
  text = expand(text)
  
  #remove the HTML tags since the data was scraped from the web
  soup = BeautifulSoup(text, 'html.parser')
  text = soup.get_text()

  #remove special characters
  pattern = r'[^a-zA-Z0-9\s]'
  text = re.sub(pattern, '', text)

  #convert the text into lowercase for standardizing
  text = text.lower()
  
  #tokenize the text into words
  tokens = word_tokenize(text)
  
  
  #removing the punctuations in the text
  tokens = [token for token in tokens if token not in string.punctuation]
  

  #remove the stop words such as 'a', 'and', 'the' etc.
  stop_words = set(stopwords.words('english'))
  tokens = [token for token in tokens if token not in stop_words]

  #performing the lemmatization to reduce the words to their proper base form
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(token) for token in tokens]

  #joining the clean tokens back into a single string
  processed_text = ' '.join(tokens)

  return processed_text


In [None]:
data['Review_Text'] = [processed_text(sentence) for sentence in data['Review_Text']]

  soup = BeautifulSoup(text, 'html.parser')


In [None]:
data.head(10)

Unnamed: 0,Review_Title,Review_Text,Verified_Buyer,Review_Date,Review_Location,Review_Upvotes,Review_Downvotes,Product,Brand,Scrape_Date
0,Perfect,love using face shower heat give light scrub n...,No,15 days ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
1,You need this,even better daily microfoliant obsessed skin m...,No,27 days ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
2,Clean skin,enjoy product much look forward using really f...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
3,Love This Stuff!,never tried anything like love apply face get ...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
4,This exfoliates very nicely and,exfoliates nicely give smooth skin irritation ...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
5,Seriously nice scrub!,love use wet dry control abrasive leaf face so...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
6,Absolutely love,absolutely love,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
7,I truly like the multivitamin,truly like multivitamin thermafoliant product ...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
8,Favorite,definitely love skin face feel better using,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
9,Excelent,excelent,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
processed_text = data['Review_Text']
vader_analyzer = SentimentIntensityAnalyzer()
vader_scores = processed_text.apply(lambda text: vader_analyzer.polarity_scores(text))
compound_scores = vader_scores.apply(lambda scores: scores['compound'])
compound_scores

0       0.7964
1       0.2960
2       0.8214
3       0.8810
4       0.1761
         ...  
4145    0.4404
4146    0.1078
4147    0.4404
4148    0.7003
4149    0.8481
Name: Review_Text, Length: 4147, dtype: float64

In [None]:
new_data = pd.concat([processed_text, compound_scores], keys=['Review_text', 'compound_scores'], axis = 1, ignore_index= False)
new_data


Unnamed: 0,Review_text,compound_scores
0,love using face shower heat give light scrub n...,0.7964
1,even better daily microfoliant obsessed skin m...,0.2960
2,enjoy product much look forward using really f...,0.8214
3,never tried anything like love apply face get ...,0.8810
4,exfoliates nicely give smooth skin irritation ...,0.1761
...,...,...
4145,much better product scrub,0.4404
4146,using exfoliant month depending condition skin...,0.1078
4147,skin look better 10 year ago,0.4404
4148,product useless known exfoliant use daily woul...,0.7003


In [None]:
def classify_sentiment(score):
    if score >= 0:
        return "Positive"
    else:
        return "Negative"
new_data['sentiment'] = new_data['compound_scores'].apply(lambda x : classify_sentiment(x))
new_data.head()

Unnamed: 0,Review_text,compound_scores,sentiment
0,love using face shower heat give light scrub n...,0.7964,Positive
1,even better daily microfoliant obsessed skin m...,0.296,Positive
2,enjoy product much look forward using really f...,0.8214,Positive
3,never tried anything like love apply face get ...,0.881,Positive
4,exfoliates nicely give smooth skin irritation ...,0.1761,Positive


In [None]:
new_data['sentiment'].value_counts()

Positive    3951
Negative     196
Name: sentiment, dtype: int64

In [None]:
new_data.drop('compound_scores', axis = 1)
new_data = new_data.reset_index(drop = True)

In [None]:
X = new_data['Review_text']
y = new_data['sentiment']

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
sentences = [text.split() for text in new_data['Review_text']]

# Train the Word2Vec model on the sentences
word2vec_model = Word2Vec(sentences, min_count=1, vector_size=100)

# Function to transform a sentence into a feature vector
def sentence_to_vector(sentence):
    vectors = []
    for word in sentence:
        if word in word2vec_model.wv.key_to_index:
            vectors.append(word2vec_model.wv[word])
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

In [None]:
X= [sentence_to_vector(sentence.split()) for sentence in X]
X = np.asarray(X)
#X = pd.DataFrame(X)
# Print the shape of X

In [None]:
X

array([[ 0.29128993,  0.27497378, -0.08085808, ..., -0.38301811,
        -0.07881702, -0.08849724],
       [ 0.31651929,  0.37767172, -0.16871485, ..., -0.41064122,
         0.04208481, -0.10176304],
       [ 0.31434938,  0.39940941, -0.16123693, ..., -0.35161829,
        -0.00703217, -0.11294307],
       ...,
       [ 0.23857719,  0.33736885, -0.20305188, ..., -0.48998711,
        -0.09603012, -0.07163415],
       [ 0.15536945,  0.20085803, -0.2711921 , ..., -0.31301001,
        -0.0602124 , -0.04874836],
       [ 0.40381342,  0.53962123, -0.10464861, ..., -0.39293379,
        -0.07335109, -0.14002457]])

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y)
y = pd.DataFrame(le.transform(y))

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 1)

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=1)
X_trainSM, y_trainsm = sm.fit_resample(X_train,y_train)

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
#y_train = sc.fit_transform(y_train)
#y_test = sc.transform(y_test)
X_trainSM = sc.fit_transform(X_trainSM)
X_test = sc.transform(X_test)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout

#train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
#test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
#X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
#X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

max_length = max([len(x) for x in X_train])

model = Sequential()
model.add(Embedding(input_dim=len(word2vec_model.wv.key_to_index) + 1, output_dim=100, input_length=max_length))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_trainSM, y_trainsm, validation_data=(X_test, y_test), epochs=20, batch_size=32)




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f7da83f6470>

In [None]:
from sklearn.metrics import classification_report

# Assuming you have the predicted labels and true labels
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)
y_true = y_test

# Generate the classification report
report = classification_report(y_true, y_pred)

# Print the classification report
print(report)


              precision    recall  f1-score   support

           0       0.07      0.81      0.14        42
           1       0.98      0.46      0.63       788

    accuracy                           0.48       830
   macro avg       0.53      0.64      0.38       830
weighted avg       0.93      0.48      0.60       830

