## Sentiment Analysis

In [254]:
################## Codes For Testing ##################
# Printing first 5 rows
# hotel_reviews_df = hotel_reviews_df[["review", "cleaned_review"]]
# print(hotel_reviews_df.head()) 

# Uncomment the two lines below to write a smaller CSV file (list the columns that you wish to check)
# hotel_reviews_df = hotel_reviews_df[["review", "cleaned_review"]]
# hotel_reviews_df.to_csv('hotel_reviews_df_cleaned.csv', index=False)

In [None]:
#1. This step will add a new column called sentiments to classify the reviews based on four scores:
# neutrality, positivity, negativity and an overall score that describes the previous three scores.
!pip install nltk
!pip install wordcloud
!pip install gensim
!pip install pandas
import nltk 
nltk.download('popular')
nltk.download('vader_lexicon')
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the cleaned reviews from the CSV file in the working directory.
hotel_reviews_df_cleaned = pd.read_csv("hotel_reviews_df_cleaned.csv")

# IMPORTANT *** Keep only these 3 columns because they are all we need for ML
hotel_reviews_df_cleaned = hotel_reviews_df_cleaned[["review", "is_bad_review", "cleaned_review"]]

# Score every raw review with VADER. Each call returns a dict of scores; the later
# cells rely on its "pos", "neg" and "compound" entries.
sid = SentimentIntensityAnalyzer()
sentiment_scores = [sid.polarity_scores(str(review)) for review in hotel_reviews_df_cleaned["review"]]

# Build the score columns in a single pass from the list of dicts. This yields the same
# columns as `Series.apply(pd.Series)` without constructing one Series per row, which is
# very slow on large frames. Re-using the original index keeps the rows aligned in concat.
sentiments_df = pd.DataFrame(sentiment_scores, index=hotel_reviews_df_cleaned.index)
hotel_reviews_df_cleaned = pd.concat([hotel_reviews_df_cleaned, sentiments_df], axis=1)

hotel_reviews_df_cleaned.head()



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
#2. This adds two more columns: the number of characters and the number of words in each review

# Character count of each review; values are stringified first so missing entries do not raise.
hotel_reviews_df_cleaned["num_chars"] = hotel_reviews_df_cleaned["review"].apply(lambda x: len(str(x)))
# Word count: split() with no argument splits on any run of whitespace and drops empty
# tokens, so leading/trailing or repeated spaces are not counted as extra "words"
# (split(" ") would count each of them).
hotel_reviews_df_cleaned["num_words"] = hotel_reviews_df_cleaned["review"].apply(lambda x: len(str(x).split()))

hotel_reviews_df_cleaned.head()



NameError: name 'hotel_reviews_df_cleaned' is not defined

In [5]:
#3. Create doc2vec vector columns
# It is using the gensim library to create a Doc2Vec model and apply it to the cleaned review texts, 
# then concatenating the resulting vectors with the original DataFrame to create new columns.
# Doc2Vec is an unsupervised machine learning algorithm that learns fixed-length vector representations 
# (embeddings) from variable-length pieces of texts, such as documents, paragraphs, or sentences. 
# These embeddings can be used for tasks like text classification, clustering, and similarity matching. 
# Doc2Vec is an extension of Word2Vec, which learns embeddings for individual words. 
# Unlike Word2Vec, Doc2Vec learns a separate embedding for each document, while still taking into account 
# the words in the document.

from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tokenise every cleaned review once (split on single spaces); the same token lists
# are used both for training and for inference below.
review_tokens = [str(text).split(" ") for text in hotel_reviews_df_cleaned["cleaned_review"]]

# Tag each token list with its positional number so Doc2Vec can tell documents apart
tagged_documents = [TaggedDocument(tokens, [position]) for position, tokens in enumerate(review_tokens)]

# Train the Doc2Vec model (5-dimensional vectors)
model = Doc2Vec(tagged_documents, vector_size=5, window=2, min_count=1, workers=4)

# Infer one vector per review and name the columns doc2vec_vector_0, doc2vec_vector_1, ...
inferred_vectors = [model.infer_vector(tokens) for tokens in review_tokens]
doc2vec_df = pd.DataFrame(inferred_vectors)
doc2vec_df.columns = ["doc2vec_vector_" + str(position) for position in range(len(doc2vec_df.columns))]

# Append the vector columns to the right of the existing columns
hotel_reviews_df_cleaned = pd.concat([hotel_reviews_df_cleaned, doc2vec_df], axis=1)

hotel_reviews_df_cleaned.head()


NameError: name 'hotel_reviews_df_cleaned' is not defined

In [None]:
#4. Create TF-IDFS columns
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer that ignores terms appearing in fewer than 10 reviews
tfidf = TfidfVectorizer(min_df=10)

# Missing cleaned reviews become empty strings so the vectorizer receives text only
hotel_reviews_df_cleaned["cleaned_review"] = hotel_reviews_df_cleaned["cleaned_review"].fillna("")

# Learn the vocabulary and compute the TF-IDF matrix in one step
tfidf_result = tfidf.fit_transform(hotel_reviews_df_cleaned["cleaned_review"])

# Dense DataFrame with one column per term; every column name gets a 'word_' prefix
# for identification purposes. Note that toarray() materialises the full dense matrix.
tfidf_df = pd.DataFrame(
    tfidf_result.toarray(),
    columns=["word_" + term for term in tfidf.get_feature_names_out()],
)

# Append the TF-IDF columns to the right of the existing columns
hotel_reviews_df_cleaned = pd.concat([hotel_reviews_df_cleaned, tfidf_df], axis=1)

In [None]:
#5. Find the percentage of the dataset that is considered a bad review versus a good review
# This shows whether the dataset is balanced or imbalanced, and helps us understand its skew
# Count how many reviews carry each is_bad_review label (0 = good, 1 = bad).
total_Bad_Reviews = "is_bad_review"
results = hotel_reviews_df_cleaned[total_Bad_Reviews].value_counts()

# .get() returns 0 when a class is absent instead of raising KeyError
goodReview = results.get(0, 0)
badReview = results.get(1, 0)

# Total reviews in the data set (good + bad)
totalReviews = goodReview + badReview
if totalReviews == 0:
    raise ValueError("No reviews labelled 0 or 1 found in column 'is_bad_review'")

# bad reviews / total reviews * 100 gives the percentage of bad reviews; same for good reviews.
# Round AFTER scaling to a percentage: rounding the fraction first and then multiplying
# by 100 can re-introduce floating-point noise (e.g. 56.99999999999999).
numOfBadReviews = badReview / totalReviews
print(round(numOfBadReviews * 100, 1))

numOfGoodReviews = goodReview / totalReviews
print(round(numOfGoodReviews * 100, 1))

print(results)


#From this we can see that only 4.3% of the reviews given are bad and 95.7% are good.
#The dataset is not balanced, but this can also be used as an indicator for the client that they are doing a good job

In [None]:
#6. Find the most used words in the reviews, regardless of whether they are good or bad
# This helps the client see the sentiment about the hotel among previous guests
# Examples are "expensive", which could indicate that the per-night prices are too high, and/or
# "small", which could indicate that the rooms are too small.
# Further investigation would be needed

def generateWordCloud(data, title = None):
    """Render a word cloud built from the given text(s).

    Parameters
    ----------
    data : str or iterable of str
        A single string is used as-is. Any other iterable (for example a pandas
        Series of reviews) has its items stringified and joined with spaces;
        ``None`` and NaN items are skipped. A non-iterable value falls back to
        ``str(data)``.
    title : str, optional
        Figure title; no title is drawn when this is falsy.
    """
    # Calling str() on a long pandas Series returns its truncated display repr (a few
    # rows plus the "Name: ..., Length: ..., dtype: ..." footer), so the cloud would be
    # built from a handful of reviews and repr noise. Join the actual items instead.
    if isinstance(data, str):
        interestedData = data
    else:
        try:
            items = iter(data)
        except TypeError:
            interestedData = str(data)
        else:
            interestedData = " ".join(
                str(item)
                for item in items
                # `item != item` is only true for NaN
                if item is not None and not (isinstance(item, float) and item != item)
            )

    wordCloud = WordCloud(
        background_color = 'white',
        max_words = 400,
        max_font_size = 40,
        scale = 3,
        random_state = 42
    ).generate(interestedData)

    fig = plt.figure(1, figsize = (20, 20))
    plt.axis('off')

    if title:
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordCloud)
    plt.show()


# Build a single word cloud from the raw text of every review
criteria = "review"
allHotelData = hotel_reviews_df_cleaned.loc[:, criteria]
generateWordCloud(data = allHotelData)


In [None]:
# hotel_reviews_df_cleaned = hotel_reviews_df_cleaned[["review", "pos"]]
# hotel_reviews_df_cleaned.head()

#7. Get the first 10 highest reviews with a positive Sentiment
# Only reviews with at least five words are ranked
totalNumOfWords = hotel_reviews_df_cleaned["num_words"]
totalNumOfWordsAboveFive = totalNumOfWords.ge(5)

getTotalNumOfWordsAboveFive = hotel_reviews_df_cleaned.loc[totalNumOfWordsAboveFive]

# Order by the "pos" score, highest first, then keep the ten top rows
rankedByPositivity = getTotalNumOfWordsAboveFive.sort_values(by = "pos", ascending = False)
getSortedPositiveValue = rankedByPositivity[["review", "pos"]].head(10)
print(getSortedPositiveValue)

In [None]:
#8. Get the first 10 highest reviews with a Negative Sentiment
# Only reviews with at least five words are ranked
totalNumOfWords = hotel_reviews_df_cleaned["num_words"]
totalNumOfWordsAboveFive = totalNumOfWords >= 5

getTotalNumOfWordsAboveFive = hotel_reviews_df_cleaned[totalNumOfWordsAboveFive]
# Ten rows with the highest "neg" score. The result is stored under its own name so that
# getSortedPositiveValue (the positive top-10 from step 7) is not silently overwritten
# with negative-sentiment rows.
getSortedNegativeValue = getTotalNumOfWordsAboveFive.sort_values("neg", ascending = False)[["review", "neg"]].head(10)
print(getSortedNegativeValue)

In [None]:
#9. Plot sentiment distribution for positive and negative reviews
# Draw both classes on explicit, shared axes. The figure-level sns.displot creates a new
# figure on every call, so mixing it with the axes-level sns.histplot inside the loop made
# the second histogram land on the first displot's axes. The axes-level sns.kdeplot is
# used for the density curve instead, so each class is drawn exactly once per panel.
fig, (histAx, kdeAx) = plt.subplots(1, 2, figsize = (14, 5))

for label, is_bad_review in [("Good reviews", 0), ("Bad reviews", 1)]:
    # Rows belonging to the current class (kept in a local mask so the badReview
    # count computed in step 5 is not overwritten)
    classMask = hotel_reviews_df_cleaned['is_bad_review'] == is_bad_review
    group = hotel_reviews_df_cleaned[classMask]

    sns.histplot(group['compound'], kde = True, label = label, ax = histAx)
    sns.kdeplot(group['compound'], label = label, ax = kdeAx)

histAx.set_title("Compound sentiment score: histogram")
kdeAx.set_title("Compound sentiment score: density")
# Without a legend the label= arguments above are never shown
histAx.legend()
kdeAx.legend()
plt.show()


In [None]:
#10. Modeling Reviewer Score

# Feature selection: every column except the target and the two raw-text columns
label = "is_bad_review"
ignore_cols = [label, "review", "cleaned_review"]

features = [c for c in hotel_reviews_df_cleaned.columns if c not in ignore_cols]

# split the data into train and test
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed. The target is heavily imbalanced (see step 5), so the
# split is stratified on the label to keep the same good/bad proportion in both the
# training and the test set.
X_train, X_test, y_train, y_test = train_test_split(
    hotel_reviews_df_cleaned[features],
    hotel_reviews_df_cleaned[label],
    test_size = 0.20,
    random_state = 42,
    stratify = hotel_reviews_df_cleaned[label],
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ea4e772d-4daa-4712-9df2-11ccbdb38015' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>

In [3]:
#11. Fit a 100-tree random forest on the training split (fixed seed)
rf = RandomForestClassifier(random_state = 42, n_estimators = 100)
rf.fit(X_train, y_train)

#12. Pair each feature name with its importance and list the 20 most important first
importance_table = pd.DataFrame({"feature": features, "importance": rf.feature_importances_})
feature_importances_df = importance_table.sort_values(by = "importance", ascending = False)
feature_importances_df.head(20)

NameError: name 'X_train' is not defined