# 1. Importing all the packages required

In [1]:
import nltk
import re
import sklearn
nltk.download('punkt_tab') # for tokenization
nltk.download('wordnet') # for lemmatization
nltk.download('stopwords') # for stop word removal

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# 2. Putting all the documents of the dataset in a list.

In [4]:
import pandas as pd
df = pd.read_csv('IMDB Dataset.csv')
reviews = df['review'].tolist()
reviews = reviews[:10000]

print("The number of reviews in our dataset is:", len(reviews))
print(reviews[:10]) # First 10 reviews

The number of reviews in our dataset is: 10000
["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would 

# 3. Case-folding or lowercasing all the reviews

In [5]:
lowercase_reviews = []
for review in reviews:
  lowercase_reviews.append(review.strip().lower())

print(lowercase_reviews[:10])

["one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the f

# 4. Removing URL Links from the text using Regex

In [6]:
for lowercase_review in lowercase_reviews:
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
  url_pattern.sub(r'', lowercase_review)

print(lowercase_reviews[:10])

["one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the f

# 5. Removing HTML Tags and Numbers from the text

In [7]:
html_removed = []
for lowercase_review in lowercase_reviews:
  html_tags_pattern = r'<.*?>'
  numbers_pattern = r'[0-9]+'
  lowercase_review = re.sub(numbers_pattern, '', lowercase_review)
  html_removed.append(re.sub(html_tags_pattern, '', lowercase_review))

lowercase_reviews = html_removed
print(lowercase_reviews[:10])

["one of the other reviewers has mentioned that after watching just  oz episode you'll be hooked. they are right, as this is exactly what happened with me.the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.i would say the main appeal of the show is due to the fact that it goes where other shows wo

# 6. Removing punctuation marks ('!', ',', '.', etc.) from the text

In [8]:
cleaned_reviews = []
for lowercase_review in lowercase_reviews:
  punctuation_pattern = r'[^\w\s]'
  text_cleaned = re.sub(punctuation_pattern, '', lowercase_review)
  cleaned_reviews.append(text_cleaned)

print(cleaned_reviews[:10])

['one of the other reviewers has mentioned that after watching just  oz episode youll be hooked they are right as this is exactly what happened with methe first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the wordit is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayi would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget pretty pictur

# 7. Tokenization of the documents of the corpus

In [9]:
from nltk.tokenize import word_tokenize
tokenized_reviews = []
for cleaned_review in cleaned_reviews:
  tokenized_review = word_tokenize(cleaned_review)
  tokenized_reviews.append(tokenized_review)

print(tokenized_reviews[:10])

[['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', 'oz', 'episode', 'youll', 'be', 'hooked', 'they', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'methe', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', 'trust', 'me', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid', 'this', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs', 'sex', 'or', 'violence', 'its', 'is', 'hardcore', 'in', 'the', 'classic', 'use', 'of', 'the', 'wordit', 'is', 'called', 'oz', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'it', 'focuses', 'mainly', 'on', 'emerald', 'city', 'an', 'experimental', 'section', 'of', 'the', 'prison', 'where', 'all', 'the', 'cells', 'have', 'glas

# 8. Removing stop-words from the tokenized reviews

In [10]:
from nltk.corpus import stopwords
stop_words_en = set(stopwords.words("english"))
stop_words_en_temp = set()
for stop_word in stop_words_en:
  punctuation_pattern = r'[^\w\s]'
  stop_word_2 = re.sub(punctuation_pattern, '', stop_word)
  stop_words_en_temp.add(stop_word_2)

# Stop words still have the punctuations, so we remove punctuations from them to remove the stop words effectively
stop_words_en = stop_words_en_temp
print(stop_words_en)

{'through', 'mightn', 'isnt', 'the', 'she', 'themselves', 'couldnt', 'into', 'their', 'will', 'a', 'where', 'or', 'y', 'didnt', 'doesnt', 'well', 're', 'wouldn', 'll', 'they', 'his', 'of', 'isn', 'shouldve', 'wed', 'wasnt', 'ill', 'so', 'theyd', 'himself', 'both', 'all', 'to', 'have', 'o', 'd', 'while', 'ive', 'neednt', 'won', 'theyre', 'than', 'we', 'did', 'hadnt', 'there', 'were', 'being', 'shan', 'above', 'doing', 'here', 'then', 'weren', 'hasnt', 'up', 'youre', 'off', 'mightnt', 'hers', 'shed', 'arent', 'about', 'that', 'how', 'between', 'now', 'am', 'ours', 'my', 'thatll', 'until', 'each', 'ma', 'over', 'ain', 'wouldnt', 'under', 'you', 'is', 'youd', 'theyve', 'its', 'too', 'youve', 'herself', 'in', 'our', 'haven', 'for', 't', 'mustnt', 'and', 'itll', 'yourself', 'other', 'own', 'don', 'been', 'down', 'those', 'dont', 'only', 'hasn', 'had', 'weve', 'are', 'wasn', 'during', 'didn', 'do', 'after', 'further', 'but', 'shouldnt', 'with', 'before', 'why', 'theyll', 'does', 'it', 'oursel

In [11]:
stop_words_removed_list = []
for tokenized_review in tokenized_reviews:
  stop_words_removed_review = []
  for token in tokenized_review:
    if token not in stop_words_en:
      stop_words_removed_review.append(token)

  stop_words_removed_list.append(stop_words_removed_review)

print(stop_words_removed_list[:10])

[['one', 'reviewers', 'mentioned', 'watching', 'oz', 'episode', 'hooked', 'right', 'exactly', 'happened', 'methe', 'first', 'thing', 'struck', 'oz', 'brutality', 'unflinching', 'scenes', 'violence', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'hearted', 'timid', 'show', 'pulls', 'punches', 'regards', 'drugs', 'sex', 'violence', 'hardcore', 'classic', 'use', 'wordit', 'called', 'oz', 'nickname', 'given', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'focuses', 'mainly', 'emerald', 'city', 'experimental', 'section', 'prison', 'cells', 'glass', 'fronts', 'face', 'inwards', 'privacy', 'high', 'agenda', 'em', 'city', 'home', 'manyaryans', 'muslims', 'gangstas', 'latinos', 'christians', 'italians', 'irish', 'moreso', 'scuffles', 'death', 'stares', 'dodgy', 'dealings', 'shady', 'agreements', 'never', 'far', 'awayi', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goes', 'shows', 'dare', 'forget', 'pretty', 'pictures', 'painted', 'mainstream', 'audiences', 'for

# 9. Stemming the tokens in the reviews

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

stemmed_reviews_list = []
for review in stop_words_removed_list:
  stemmed_review = []
  for token in review:
    stemmed_token = stemmer.stem(token)
    stemmed_review.append(stemmed_token)

  stemmed_reviews_list.append(stemmed_review)

print(stemmed_reviews_list[:10])

[['one', 'review', 'mention', 'watch', 'oz', 'episod', 'youll', 'hook', 'right', 'exactli', 'happen', 'meth', 'first', 'thing', 'struck', 'oz', 'brutal', 'unflinch', 'scene', 'violenc', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'heart', 'timid', 'show', 'pull', 'punch', 'regard', 'drug', 'sex', 'violenc', 'hardcor', 'classic', 'use', 'wordit', 'call', 'oz', 'nicknam', 'given', 'oswald', 'maximum', 'secur', 'state', 'penitentari', 'focus', 'mainli', 'emerald', 'citi', 'experiment', 'section', 'prison', 'cell', 'glass', 'front', 'face', 'inward', 'privaci', 'high', 'agenda', 'em', 'citi', 'home', 'manyaryan', 'muslim', 'gangsta', 'latino', 'christian', 'italian', 'irish', 'moreso', 'scuffl', 'death', 'stare', 'dodgi', 'deal', 'shadi', 'agreement', 'never', 'far', 'awayi', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goe', 'show', 'wouldnt', 'dare', 'forget', 'pretti', 'pictur', 'paint', 'mainstream', 'audienc', 'forget', 'charm', 'forget', 'romanceoz', 'doesnt'

# 10. Lemmatization of the words in all the documents

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

lemmatized_reviews_list = []
for review in stop_words_removed_list:
  lemmatized_review = []
  for token in review:
    lemmatized_token = lemmatizer.lemmatize(token, "v")
    lemmatized_review.append(lemmatized_token)

  lemmatized_reviews_list.append(lemmatized_review)

print(lemmatized_reviews_list[:10])

[['one', 'reviewers', 'mention', 'watch', 'oz', 'episode', 'youll', 'hook', 'right', 'exactly', 'happen', 'methe', 'first', 'thing', 'strike', 'oz', 'brutality', 'unflinching', 'scenes', 'violence', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'hearted', 'timid', 'show', 'pull', 'punch', 'regard', 'drug', 'sex', 'violence', 'hardcore', 'classic', 'use', 'wordit', 'call', 'oz', 'nickname', 'give', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'focus', 'mainly', 'emerald', 'city', 'experimental', 'section', 'prison', 'cells', 'glass', 'front', 'face', 'inwards', 'privacy', 'high', 'agenda', 'em', 'city', 'home', 'manyaryans', 'muslims', 'gangstas', 'latinos', 'christians', 'italians', 'irish', 'moreso', 'scuffle', 'death', 'star', 'dodgy', 'dealings', 'shady', 'agreements', 'never', 'far', 'awayi', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'go', 'show', 'wouldnt', 'dare', 'forget', 'pretty', 'picture', 'paint', 'mainstream', 'audiences', 'forget', 'ch

# 11. Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = []
for lemmatized_review in lemmatized_reviews_list:
  corpus.append(' '.join(lemmatized_review))

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()
X_array = X.toarray()

print("Unique Word List: \n", feature_names[10:15])
print("Bag of Words Matrix: \n", X_array)

Unique Word List: 
 ['_angel_' '_anyone_' '_anything_' '_both_' '_brooklyn_']
Bag of Words Matrix: 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
df = pd.DataFrame(data=X_array, columns=feature_names, index=corpus)
(df)

Unnamed: 0,___,____,_____,______,________________________________________________________________,____the,___is,_acting_,_all_,_and_,...,êtrepeutêtre,ís,ísnt,ïn,óli,õs,önsjön,über,überwoman,ünfaithful
one reviewers mention watch oz episode youll hook right exactly happen methe first thing strike oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use wordit call oz nickname give oswald maximum security state penitentary focus mainly emerald city experimental section prison cells glass front face inwards privacy high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffle death star dodgy dealings shady agreements never far awayi would say main appeal show due fact go show wouldnt dare forget pretty picture paint mainstream audiences forget charm forget romanceoz doesnt mess around first episode ever saw strike nasty surreal couldnt say ready watch develop taste oz get accustom high level graphic violence violence injustice crook guard wholl sell nickel inmates wholl kill order get away well mannered middle class inmates turn prison bitch due lack street skills prison experience watch oz may become comfortable uncomfortable viewingthats get touch darker side,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
wonderful little production film technique unassuming oldtimebbc fashion give comfort sometimes discomforting sense realism entire piece actors extremely well choose michael sheen get polari voice pat truly see seamless edit guide reference williams diary entries well worth watch terrificly write perform piece masterful production one great master comedy life realism really come home little things fantasy guard rather use traditional dream techniques remain solid disappear play knowledge sense particularly scenes concern orton halliwell set particularly flat halliwells murals decorate every surface terribly well do,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
think wonderful way spend time hot summer weekend sit air condition theater watch lighthearted comedy plot simplistic dialogue witty character likable even well bread suspect serial killer may disappoint realize match point risk addiction think proof woody allen still fully control style many us grow lovethis id laugh one woodys comedies years dare say decade ive never impress scarlet johanson manage tone sexy image jump right average spirit young womanthis may crown jewel career wittier devil wear prada interest superman great comedy go see friends,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
basically theres family little boy jake think theres zombie closet parent fight timethis movie slower soap opera suddenly jake decide become rambo kill zombieok first youre go make film must decide thriller drama drama movie watchable parent divorce argue like real life jake closet totally ruin film expect see boogeyman similar movie instead watch drama meaningless thriller spot well play parent descent dialogs shots jake ignore,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
petter matteis love time money visually stun film watch mr mattei offer us vivid portrait human relations movie seem tell us money power success people different situations encounter variation arthur schnitzlers play theme director transfer action present time new york different character meet connect one connect one way another next person one seem know previous point contact stylishly film sophisticate luxurious look take see people live world live habitatthe thing one get souls picture different stag loneliness one inhabit big city exactly best place human relations find sincere fulfillment one discern case people encounterthe act good mr matteis direction steve buscemi rosario dawson carol kane michael imperioli adrian grenier rest talented cast make character come alivewe wish mr mattei good luck await anxiously next work,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fun entertain movie wwii german spy julie andrews fall love american pilot rock hudson try get secrets reason attack critics shun public cant see beautifully shoot wonderful costume interiors excite aerial dogfight also andrews striptease strictly pg material sing beautiful songwhistling dark movie problems andrews hudson get along shoot thisand show love scenes lack spark zero sexual chemistry still turn ok performances film little long even min directors cut saw get way dark serious end still worth catch try see directors cutthe one run half hour longer,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
give break anyone say good hockey movie know movies tend pretty ppoor job portray hockey general public yes make back us hadnt embrace sport extent today really play hockey life watch even friends sheer lunacy scenes ice stupefyingly bizzare particular instance refer sword fight er mean stick fight end film everyone stand around watch fascination actually happen wonder win duel youngblood nemesis rakkie yes story ice little better stress littlei dont know maybe point go mean let face film right hockey one big battle ice oh yeah little piece vulcanize rubber bounce around occasionally loosely term goal youngblood either appal hysterical cant figure maybe someone else luck,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
movie bad movie watch endless series bad horror movies say little different many see plot fairly regular slasher story way scenes cut murder weapon give us lot inane dialogue scenes go lot longer movies type victims seem slightly less like cardboard cutouts slightly difficult time figure exactly happen begin keep wonder certain events dream sequence favorite scene two guy run killer take refuge car glove compartment find handgun thank god one happily exclaim guy head suddenly look like mannequins head go enough time wonder mannequin relation two guy car boom head explode figure suppose one guy car get head shoot shotgun love scene movie bad movie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
movie probably make entertain middle school early high school age kid maybe funny may possibly even see something scary act poor plot poor theres much value adult viewer saw film weak bore time possibility movie could become interest never really materialize creatures look pretty good see second dont seem substance look time wasnt sure movie try make another attempt comedy another attempt horror fail movie wasnt good,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 12. TF-IDF Calculation


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
terms = tfidf_vectorizer.get_feature_names_out()
df = pd.DataFrame(tfidf_matrix.toarray(), columns=terms)

df

Unnamed: 0,___,____,_____,______,________________________________________________________________,____the,___is,_acting_,_all_,_and_,...,êtrepeutêtre,ís,ísnt,ïn,óli,õs,önsjön,über,überwoman,ünfaithful
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 13. Encodings

In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

all_words = [word for sentence in lemmatized_reviews_list for word in sentence]
vocab = sorted(set(all_words))

print("Vocabulary:", vocab)

word_array = np.array(all_words).reshape(-1, 1)
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = one_hot_encoder.fit_transform(word_array)
print("One-hot encoded matrix:\n", one_hot_encoded)



# 14. Word Embeddings and Word2Vec

In [None]:
!pip install gensim
import gensim
from gensim.models import Word2Vec

cbow_model = Word2Vec(lemmatized_reviews_list, vector_size=100, window=5, min_count=1, sg=0, alpha=0.03, min_alpha=0.0007, epochs=100)
skipgram_model = Word2Vec(lemmatized_reviews_list, vector_size=100, window=5, min_count=1, sg=1, alpha=0.03, min_alpha=0.0007, epochs=100)

cbow_model.train(lemmatized_reviews_list, total_examples=len(lemmatized_reviews_list), epochs=100)
skipgram_model.train(lemmatized_reviews_list, total_examples=len(lemmatized_reviews_list), epochs=100)

word_vectors_cbow = cbow_model.wv
similarity_cbow = word_vectors_cbow.similarity('word2vec', 'gensim')
print(f"Similarity between 'word2vec' and 'gensim': {similarity_cbow} with CBOW")

word_vectors_skipgram= skipgram_model.wv
similarity_skip = word_vectors_skipgram.similarity('word2vec', 'gensim')
print(f"Similarity between 'word2vec' and 'gensim': {similarity_skip} with Skip-Gram")

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


KeyboardInterrupt: 