In [42]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import gensim.downloader as api
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## Task 3 - Feature Extraction
First we will load the ground truth reviews dataset.

In [43]:
reviews = pd.read_csv("ground_truth_reviews.csv")
reviews.head()

Unnamed: 0,review_id,location_id,hotel_name,city,review,rating,ground_truth_sentiment
0,1016464488,11953119,Nh Collection Colombo,Colombo,good stay found lighters toilet paper rolls no...,1,1
1,1016435128,11953119,Nh Collection Colombo,Colombo,definitely recommend hotel excellent food good...,5,1
2,1016307864,11953119,Nh Collection Colombo,Colombo,wonderful stay comfortable staycooperative sta...,5,1
3,1016165618,11953119,Nh Collection Colombo,Colombo,favorite 4 star hotel colombo live new york ar...,5,1
4,1015472232,11953119,Nh Collection Colombo,Colombo,excellent food stay excellent food especially ...,5,1


### 3.1. Bag of Words (BoW)
Here, we will create a Bag of Words (BoW) representation of the reviews. 
This involves tokenizing the text and creating a matrix where each row corresponds to a review and each column corresponds to a word in the vocabulary.

In [44]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews["review"])

In [45]:
vocabulary = vectorizer.get_feature_names_out()
print(f"Size of BoW vocabulary: {len(vocabulary)}")

Size of BoW vocabulary: 17968


We can see here that there are 17,968 unique words in the vocabulary extracted from the reviews.

In [46]:
bow_matrix = pd.DataFrame(X.toarray(), columns=vocabulary)
print(f"Shape of the BoW matrix: {bow_matrix.shape}")

Shape of the BoW matrix: (5186, 17968)


The shape of the BoW matrix is (5186, 17968), meaning there are 5186 reviews, and each vector has 17968 features corresponding to the unique words in the vocabulary.

We can even print the first row of the BoW matrix to see how it looks.

In [47]:
print(bow_matrix.iloc[0])

000                   0
01                    0
0111and               0
0120                  0
0130                  0
                     ..
顶楼还有个游泳池不过没来得及享受一下    0
𝐄𝐚𝐬𝐡𝐚𝐧𝐢               0
𝐆𝐮𝐞𝐬𝐭                 0
𝐑𝐞𝐥𝐚𝐭𝐢𝐨𝐧𝐬             0
𝐢𝐧                    0
Name: 0, Length: 17968, dtype: int64


Let's check for some words from the first review that are present.

In [48]:
print(bow_matrix.iloc[0][bow_matrix.iloc[0] > 0])

beds        1
booked      1
even        1
found       1
give        1
good        1
lighters    1
non         1
paper       1
rolls       1
room        1
smoking     1
stay        1
though      1
toilet      1
twin        1
us          1
Name: 0, dtype: int64


### 3.2. Term Frequency-Inverse Document Frequency (TF-IDF)
Here, we will create a Term Frequency-Inverse Document Frequency (TF-IDF) representation of the reviews.

In [49]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(reviews['review'])

In [50]:
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Size of TF-IDF vocabulary: {len(feature_names)}")

Size of TF-IDF vocabulary: 17968


Once again, we can see that there are 17,968 unique words in the vocabulary extracted from the reviews.

In [51]:
tfidf_matrix_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
print(f"Shape of the TF-IDF matrix: {tfidf_matrix_df.shape}")

Shape of the TF-IDF matrix: (5186, 17968)


The shape of the TF-IDF matrix is also (5186, 17968), meaning there are 5186 reviews, and each vector has 17968 features corresponding to the unique words in the vocabulary.

Once again, we can print the first row of the TF-IDF matrix to see how it looks.

In [52]:
print(tfidf_matrix_df.iloc[0])

000                   0.0
01                    0.0
0111and               0.0
0120                  0.0
0130                  0.0
                     ... 
顶楼还有个游泳池不过没来得及享受一下    0.0
𝐄𝐚𝐬𝐡𝐚𝐧𝐢               0.0
𝐆𝐮𝐞𝐬𝐭                 0.0
𝐑𝐞𝐥𝐚𝐭𝐢𝐨𝐧𝐬             0.0
𝐢𝐧                    0.0
Name: 0, Length: 17968, dtype: float64


Let's check for some words from the first review that are present in the TF-IDF matrix.

In [53]:
print(tfidf_matrix_df.iloc[0][tfidf_matrix_df.iloc[0] > 0])

beds        0.203157
booked      0.181399
even        0.144703
found       0.203431
give        0.205972
good        0.092402
lighters    0.406354
non         0.280659
paper       0.296387
rolls       0.342779
room        0.095741
smoking     0.374567
stay        0.084388
though      0.192924
toilet      0.232811
twin        0.320513
us          0.112083
Name: 0, dtype: float64


We can also check for the top 10 words with the highest TF-IDF scores in the first review.

In [54]:
top_tfidf_words = tfidf_matrix_df.iloc[0].nlargest(10)
print("Top 10 words with highest TF-IDF scores in the first review:")
print(top_tfidf_words)

Top 10 words with highest TF-IDF scores in the first review:
lighters    0.406354
smoking     0.374567
rolls       0.342779
twin        0.320513
paper       0.296387
non         0.280659
toilet      0.232811
give        0.205972
found       0.203431
beds        0.203157
Name: 0, dtype: float64


### 3.3. Word2Vec
Here, we will create a Word2Vec model using the reviews.

First, we need to tokenize the reviews into words.

In [55]:
tokenized_reviews = [word_tokenize(review.lower()) for review in reviews['review']]

Now we can train a Word2Vec model on the tokenized reviews. We will use a vector size of 500, a window size of 100, and set the minimum count to 0 to include all words.

In [56]:
w2v_model = Word2Vec(sentences=tokenized_reviews, vector_size=500, window=100, min_count=0, workers=8, sg=1)
w2v_model.save("word2vec.model")

In [57]:
print(f"Shape of the Word2Vec matrix: {w2v_model.wv.vectors.shape}")

Shape of the Word2Vec matrix: (17998, 500)


We can see that the Word2Vec model has a shape of (17998, 500), meaning there are 17,998 unique words in the vocabulary, and each word is represented by a 500-dimensional vector.

We can also check the vector value for a specific word, such as "bed".

In [58]:
bed_vector = w2v_model.wv['bed']
print(f"Vector for 'bed': {bed_vector}")

Vector for 'bed': [-2.21983306e-02 -7.76067749e-02 -4.54497337e-02 -4.25716415e-02
  1.13963552e-01 -1.33877397e-01  1.47425700e-02  4.74809147e-02
 -3.23983654e-02 -6.42543286e-02 -6.53947815e-02  1.22837864e-01
  3.47613357e-02 -1.72991261e-01 -5.58905378e-02 -3.55762780e-01
  1.04938745e-01 -1.52713237e-02 -1.23570785e-01 -2.60757748e-02
 -1.14616631e-02  1.36960866e-02 -7.70388171e-02 -6.68081716e-02
  1.57053173e-01  1.16890050e-01  7.44389221e-02  3.51294689e-03
 -2.01975718e-01 -9.80436057e-02  8.40772614e-02 -1.02011755e-01
 -9.42890197e-02  3.48597020e-02  1.57136843e-01 -1.68452058e-02
  1.53935075e-01 -1.93374649e-01  1.17707932e-02 -5.64404204e-02
 -2.40185838e-02 -1.50668293e-01 -1.39312059e-01  1.04409210e-01
 -1.71908900e-01  1.10343425e-02 -8.66222680e-02  1.38353616e-01
 -1.47100121e-01 -4.48582843e-02 -1.12209171e-01 -1.14339506e-02
 -1.58992171e-01  1.88115668e-02  2.10796133e-01 -2.49049038e-01
  9.52634588e-02  2.07482398e-01  1.14556856e-01 -1.16254307e-01
  1.707

We can also find the most similar words to "bed" using the Word2Vec model.

In [59]:
similar_words = w2v_model.wv.most_similar('bed')
print(f"Most similar words to 'bed': {similar_words}")

Most similar words to 'bed': [('bathroom', 0.6732473373413086), ('partially', 0.6687913537025452), ('classed', 0.6533008813858032), ('linen', 0.6511664390563965), ('mezzanine', 0.63996821641922), ('pumped', 0.6394228935241699), ('hacking', 0.6368481516838074), ('airconditioner', 0.6334236860275269), ('doubles', 0.6332963705062866), ('drenching', 0.6326010823249817)]


We can also perform analogy tasks using the Word2Vec model. For example, we can find a word that is to "colombo" as "galle" is to "city".

In [60]:
analogy_result = w2v_model.wv.most_similar(positive=['colombo', 'galle'], negative=['city'], topn=1)
print(f"Analogy result for 'colombo' - 'city' + 'galle': {analogy_result}")

Analogy result for 'colombo' - 'city' + 'galle': [('fort', 0.47828251123428345)]


Here, we can see that the model determines that Colombo - City + Galle = Fort. Which makes intuitive sense. 

We can also perform other analogy tasks, such as finding a word that is to "bed" as "internet" is to "sleep".

In [61]:
analogy_result = w2v_model.wv.most_similar(positive=['bed', 'internet'], negative=['sleep'], topn=1)
print(f"Analogy result for 'bed' - 'sleep' + 'internet': {analogy_result}")

Analogy result for 'bed' - 'sleep' + 'internet': [('connection', 0.5435218214988708)]


Here, it determined that Bed - Sleep + Internet = Connection. Which also makes sense.

Let's check another analogy task, such as finding a word that is to "bed" as "water" is to "pillow".

In [62]:
analogy_result = w2v_model.wv.most_similar(positive=['bed', 'water'], negative=['pillow'], topn=1)
print(f"Analogy result for 'bed' - 'pillow' + 'water': {analogy_result}")

Analogy result for 'bed' - 'pillow' + 'water': [('hot', 0.47072774171829224)]


Here, it determined that Bed - Pillow + Water = Hot. This is unexpected, and highlights the limitations of the model in understanding certain relationships.

We can also check for some common relationships, but those which might not be present in the dataset.

In [63]:
result = w2v_model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)
print(f"Analogy result for 'king' - 'man' + 'woman': {result}")

Analogy result for 'king' - 'man' + 'woman': [('sorts', 0.6136476993560791)]


Here, we see that the model struggles to come up with a meaningful analogy for this relationship, which highlights the limitations of the dataset to generalize.

Finally, we can vectorize the reviews using the Word2Vec model by averaging the word vectors for each review.

In [64]:
def get_review_vector(review, model):
    tokens = word_tokenize(review.lower())
    vector = sum(model.wv[token] for token in tokens if token in model.wv) / len(tokens)
    return vector

w2v_review_vectors = reviews['review'].apply(lambda x: get_review_vector(x, w2v_model))
w2v_review_vectors = np.vstack(w2v_review_vectors.values)
w2v_review_vectors = pd.DataFrame(w2v_review_vectors)
print(f"Shape of the Word2Vec review vectors: {w2v_review_vectors.shape}")

Shape of the Word2Vec review vectors: (5186, 500)


We can see that the dataset now consists of 5186 reviews, and each review is represented by a 500-dimensional vector. We can even print the first review vector to see how it looks.

In [65]:
print(w2v_review_vectors[0])

0       0.026881
1      -0.020825
2      -0.038099
3       0.018888
4      -0.022236
          ...   
5181    0.004298
5182   -0.010009
5183   -0.030506
5184   -0.009015
5185   -0.014567
Name: 0, Length: 5186, dtype: float32


### 3.4. Doc2Vec

In [66]:
tokenized_reviews = [word_tokenize(review.lower()) for review in reviews['review']]
tagged_data = [TaggedDocument(words=words, tags=[str(i)]) for i, words in enumerate(tokenized_reviews)]

doc2vec_model = Doc2Vec(vector_size=500, window=50, min_count=1, workers=8, epochs=20)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

doc2vec_review_vectors = np.vstack([doc2vec_model.infer_vector(words) for words in tokenized_reviews])
doc2vec_review_vectors = pd.DataFrame(doc2vec_review_vectors)

In [67]:
print(f"Shape of the Doc2Vec review vectors: {doc2vec_review_vectors.shape}")

Shape of the Doc2Vec review vectors: (5186, 500)


In [68]:
inferred_vector = doc2vec_model.infer_vector("the bed was uncomfortable".lower().split())
print(inferred_vector.shape)

(500,)


In [69]:
print(inferred_vector)

[ 1.79446060e-02  1.53058581e-02  1.76981650e-02  3.17104086e-02
 -1.91894658e-02 -1.23261111e-02  9.94187407e-03  8.87842290e-03
 -2.44432967e-03 -2.31661438e-03 -3.08689009e-03 -9.77275986e-03
  1.59678087e-02 -1.42546585e-02  1.61243435e-02 -3.59454527e-02
 -1.64232217e-02 -2.06510425e-02  1.09232475e-04 -5.10325236e-03
  7.50653213e-03 -1.71949789e-02  1.91980097e-02  7.11631682e-03
 -1.83746638e-03  1.64147317e-02  7.01201055e-03  7.17629306e-03
 -1.20069468e-02 -2.06664037e-02  1.10710729e-02 -8.01882986e-03
  2.12258548e-02 -1.08086960e-02  3.19587998e-02 -2.14461330e-03
  2.16315556e-02 -5.20936884e-02 -1.40117221e-02 -2.57253386e-02
 -1.61490645e-02 -1.59287341e-02 -2.42028479e-02  2.28739269e-02
 -3.45986336e-03 -2.36748178e-02  3.26268841e-03  2.18118988e-02
 -1.05360877e-02 -4.12311358e-03  2.46824021e-03  2.48298189e-03
 -4.27452521e-03 -2.31818315e-02 -1.88607606e-03 -1.39220040e-02
  4.06431081e-03 -1.97593239e-03  1.08816708e-02 -4.74145729e-03
 -5.17733023e-03 -4.14246

In [70]:
similar_docs = doc2vec_model.dv.most_similar([inferred_vector], topn=5)

for doc_id, similarity in similar_docs:
    print(f"Document {doc_id}: Similarity={similarity:.4f}")
    print("Review:", reviews['review'].iloc[int(doc_id)])
    print("---")

Document 3904: Similarity=0.7817
Review: room view lovely view room hotel also restaurants staff helpful rooms well appointed
---
Document 4958: Similarity=0.7610
Review: 134 good room service isvery good food really good come back friends family would like recommend hotel thank tharindi girl help stay nimesh fo team thank
---
Document 4335: Similarity=0.7561
Review: nice gesthouse nice gesthouse quiet comfortable place clean rooms wifi rooms friendly helpful ovners delicious food fresh fruits breakfast tasty lunch dinner come enjoy holidays
---
Document 2614: Similarity=0.7536
Review: beautiful hotel really enyoyed staying beautifull hotel room veary clean air con host frendly arranged us tour really nice tuctuc driver tharindu took us see waterfalls came back got us fresh juice dinner breakfast delicius
---
Document 1335: Similarity=0.7509
Review: far away city rooms simple enough stay 12 nights nice view unfortunately 2km walking uphill city centre roads withought sidewalk tyring no

Finally, we can save all the feature matrices to CSV files for further use.

In [71]:
feature_matrices = {
    'bow': bow_matrix,
    'tfidf': tfidf_matrix_df,
    'word2vec': w2v_review_vectors,
    'doc2vec': doc2vec_review_vectors
}

In [72]:
for name, matrix in feature_matrices.items():
    print(f"Saving {name} feature matrix...")
    matrix.to_csv(f"feature_matrix_{name}.csv", index=False)

Saving bow feature matrix...
Saving tfidf feature matrix...
Saving word2vec feature matrix...
Saving doc2vec feature matrix...
