#Import Libraries

In [4]:
import gensim
import gensim.downloader as api
import pandas as pd
from bs4 import BeautifulSoup
!pip install contractions
import contractions as ct
import re
import warnings


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.5/287.5 KB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.5/104.5 KB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


#Load Word2vec Model

In [5]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader API; `wv` is queried by the king/woman/man analogy cell below.
# NOTE(review): the first call presumably downloads the model into a local
# cache — confirm before running without network access.
wv = api.load('word2vec-google-news-300')



#Define Functions

In [6]:
def init_data(data_frame):
    """Prune incomplete and repeated rows, then make ``star_rating`` integral.

    The frame is modified in place: rows containing any missing value are
    removed, exact duplicate rows are removed (first occurrence kept), and the
    ``star_rating`` column is cast to ``int``.

    Args:
        data_frame: DataFrame with at least a ``star_rating`` column.

    Returns:
        The same DataFrame object that was passed in, after mutation. The
        index is not reset here; callers do that themselves if they need to.
    """
    data_frame.dropna(axis=0, how='any', inplace=True)
    data_frame.drop_duplicates(keep='first', inplace=True)
    integral_ratings = data_frame['star_rating'].astype('int')
    data_frame['star_rating'] = integral_ratings
    return data_frame

In [7]:
# Star values (as strings) that collapse into each sentiment class.
_STARS_BY_CLASS = (
    ('Class 1', ['1', '2']),
    ('Class 2', ['3']),
    ('Class 3', ['4', '5']),
)
# Every character that is not an ASCII letter is blanked out.
_NON_ALPHA_RE = re.compile('[^a-zA-Z]')


def _clean_review_text(review_text):
    """Return one review with HTML, contractions, non-letters and case removed."""
    # Parse once and reuse the soup; only texts that actually contain a tag
    # are replaced by their extracted text.
    soup = BeautifulSoup(review_text, "html.parser")
    if soup.find():
        # The separator is U+3000 (ideographic space), kept exactly as before;
        # the non-letter regex below turns it into a plain space anyway.
        review_text = soup.get_text("　")
    # Expand contractions before punctuation (the apostrophes) is stripped.
    review_text = ct.fix(review_text)
    review_text = _NON_ALPHA_RE.sub(' ', review_text)
    # split()/join collapses whitespace runs and trims both ends in one go.
    return " ".join(review_text.lower().split())


def data_cleaning(data_frame):
    """Relabel star ratings into three classes and normalise the review text.

    The frame is modified in place and also returned.

    * ``star_rating``: the strings '1'/'2' become 'Class 1', '3' becomes
      'Class 2', '4'/'5' become 'Class 3'. The comparison is against strings,
      so any other value (including integer ratings) is left untouched.
    * ``review_body``: HTML tags are stripped, contractions expanded,
      non-alphabetical characters replaced by spaces, text lower-cased and
      whitespace collapsed.

    Rows are selected with boolean masks rather than ``range(len(...))``
    labels, so the frame does not need a 0..n-1 index, and the work is done
    column-wise instead of with one ``.loc`` write per row.

    Args:
        data_frame: DataFrame with ``star_rating`` and ``review_body`` columns.

    Returns:
        The same DataFrame object, after mutation.
    """
    ratings = data_frame['star_rating']
    # Build every mask from the untouched ratings before writing any label.
    class_masks = [(label, ratings.isin(stars)) for label, stars in _STARS_BY_CLASS]
    for label, mask in class_masks:
        # Skip empty selections so a column with no matching values keeps its dtype.
        if mask.any():
            data_frame.loc[mask, 'star_rating'] = label

    data_frame['review_body'] = data_frame['review_body'].map(_clean_review_text)
    return data_frame

#Initialization

In [8]:
# Notebook-wide settings: per-class sample size and warning filters.
# Ignore UserWarning messages whose originating module matches 'bs4'.
warnings.filterwarnings(action="ignore", category=UserWarning, module='bs4')
# Number of rows drawn for each of the three rating classes.
RANDOM_SAMPLE_SIZE = 20_000



#Prepare Balanced Dataset

In [9]:
# Fixed seed for the per-class draws, so a fresh Restart & Run All rebuilds
# the same balanced sample instead of a different one on every run.
RANDOM_SEED = 42

# reading data
# NOTE(review): read_pickle can execute arbitrary code while loading — only
# point this at a trusted file.
df = pd.read_pickle("/content/drive/MyDrive/Dataset/data.pkl")
df = init_data(df).reset_index(drop=True)

# 3-classes dataset: ratings 1-2, rating 3, and ratings 4-5, each sampled to
# RANDOM_SAMPLE_SIZE rows (sample() raises if a class has fewer rows than that).
class1_df = df[df['star_rating'] <= 2].sample(RANDOM_SAMPLE_SIZE, random_state=RANDOM_SEED)
class2_df = df[df['star_rating'] == 3].sample(RANDOM_SAMPLE_SIZE, random_state=RANDOM_SEED)
class3_df = df[df['star_rating'] >= 4].sample(RANDOM_SAMPLE_SIZE, random_state=RANDOM_SEED)

balanced_df = pd.concat([class1_df, class2_df, class3_df]).reset_index(drop=True)
# data_cleaning compares ratings against the strings '1'..'5', hence the cast.
balanced_df['star_rating'] = balanced_df['star_rating'].astype('string')
# data_cleaning mutates and returns its argument, so both names refer to the
# same cleaned frame from here on.
cleaned_balanced_df = data_cleaning(balanced_df)
print(cleaned_balanced_df)

      star_rating                                        review_body
0         Class 1  these were almost impossible to apply to my na...
1         Class 1  i bought this only because it said there was n...
2         Class 1  this dryer is super noisy even for a hair blow...
3         Class 1  purchased this product several weeks ago looke...
4         Class 1  i have not used this product i received it tod...
...           ...                                                ...
59995     Class 3  i have only used this once but was veery happy...
59996     Class 3  broguht this for my mom she loves liz claiborn...
59997     Class 3  i had my doubts about these press on nail poli...
59998     Class 3  i am a newbie to the manly art of wet shaving ...
59999     Class 3             have not worn yet however very pleased

[60000 rows x 2 columns]


#Task 2a

In [10]:
# Analogy check on the pretrained vectors: king - man + woman.
# most_similar returns (word, similarity) pairs, as the printed output shows.
sims = wv.most_similar(positive=['king', 'woman'], negative=['man'])
print(sims)

[('queen', 0.7118192911148071), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321243286133), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.518113374710083), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411999702454)]


#Task 2b

In [86]:
# Each cleaned review becomes one training "sentence": a list of
# whitespace-separated tokens.
sentences = cleaned_balanced_df["review_body"].tolist()
sentences_training = [review.split() for review in sentences]
# Train Word2vec model with Amazon review data
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm the installed version before changing it.
my_word2vec_model = gensim.models.Word2Vec(
    sentences=sentences_training,
    size=300,
    window=13,
    min_count=9,
)


In [96]:
# Nearest neighbours of 'worst' + 'bad' in the review-trained embedding.
# Query through `.wv` (the model's keyed vectors): calling most_similar on the
# model object itself is deprecated in gensim 3.x and removed in gensim 4,
# whereas `.wv.most_similar` works on both.
sims = my_word2vec_model.wv.most_similar(positive=['worst', 'bad'])
print(sims)


[('terrible', 0.6618835926055908), ('horrible', 0.5968073606491089), ('best', 0.536505937576294), ('awful', 0.528404712677002), ('greatest', 0.49852627515792847), ('amazing', 0.4964771568775177), ('real', 0.44130241870880127), ('good', 0.436043381690979), ('seriously', 0.42813795804977417), ('funny', 0.4183085858821869)]


  sims = my_word2vec_model.most_similar(positive=['worst','bad'])


#Check GPU and Memory

In [12]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Not connected to a GPU
Your runtime has 37.8 gigabytes of available RAM

You are using a high-RAM runtime!
