#Import Libraries

In [4]:
import gensim
import gensim.downloader as api
import pandas as pd
from bs4 import BeautifulSoup
!pip install contractions
import contractions as ct
import re
import warnings


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 287.5/287.5 KB 6.8 MB/s eta 0:00:00
Collecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (104 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 104.5/104.5 KB 10.4 MB/s eta 0:00:00
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


#Load Word2vec Model

In [5]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader API (imported above as `api`); `wv` is queried in the Task 2a cell.
wv = api.load('word2vec-google-news-300')



#Define Functions

In [6]:
def init_data(data_frame):
    """Return a copy of ``data_frame`` without incomplete or duplicate rows.

    Rows containing any missing value are dropped, then duplicate rows are
    dropped, and ``star_rating`` is cast to int. The input frame is left
    untouched, so re-running the calling cell always starts from the same
    raw data instead of from a frame an earlier run already modified.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Raw reviews; must contain a ``star_rating`` column whose values can
        be converted to int.

    Returns
    -------
    pandas.DataFrame
        A new frame. Surviving rows keep their original index labels; the
        index is not reset here (the caller does that).

    Raises
    ------
    KeyError
        If ``data_frame`` has no ``star_rating`` column.
    """
    deduplicated = data_frame.dropna().drop_duplicates()
    # assign() builds a new frame, which avoids writing into an object that
    # pandas may regard as a view of the caller's data.
    return deduplicated.assign(
        star_rating=deduplicated['star_rating'].astype('int')
    )

In [7]:
# Star rating (as text) -> sentiment class label.
_RATING_TO_CLASS = {
    '1': 'Class 1', '2': 'Class 1',
    '3': 'Class 2',
    '4': 'Class 3', '5': 'Class 3',
}

# Anything that is not an ASCII letter; compiled once instead of once per row.
_NON_ALPHA_RE = re.compile('[^a-zA-Z]')


def _rating_to_class(rating):
    """Map a 1-5 star rating (int or str) to its class label; pass other values through."""
    return _RATING_TO_CLASS.get(str(rating), rating)


def _clean_review_text(review_text):
    """Strip HTML, expand contractions, keep letters only, lower-case, collapse whitespace."""
    # Parse once and reuse the tree for both the "has tags?" check and the
    # text extraction.
    soup = BeautifulSoup(review_text, "html.parser")
    if soup.find():
        # Plain ASCII space between text fragments. The previous separator was
        # a full-width U+3000 space; the letters-only filter below turns
        # either one into an ordinary space.
        review_text = soup.get_text(" ")
    # Expand contractions before punctuation is removed, while the
    # apostrophes are still present.
    review_text = ct.fix(review_text)
    review_text = _NON_ALPHA_RE.sub(' ', review_text)
    # split()/join drops leading/trailing blanks and collapses inner runs.
    return " ".join(review_text.lower().split())


def _map_keep_text_dtype(column, func):
    """Apply ``func`` element-wise; keep a pandas string dtype if the column had one."""
    mapped = column.map(func)
    if isinstance(column.dtype, pd.StringDtype):
        mapped = mapped.astype(column.dtype)
    return mapped


def data_cleaning(data_frame):
    """Convert star ratings to class labels and normalise the review text.

    ``star_rating`` values 1-2 become 'Class 1', 3 becomes 'Class 2' and 4-5
    become 'Class 3'; ratings may be ints or their string form, and any other
    value is left unchanged. Each ``review_body`` has HTML tags removed,
    contractions expanded, every non a-z/A-Z character replaced by a space,
    and is lower-cased with whitespace collapsed to single spaces.

    Both columns are processed as whole columns, so the frame may carry any
    index (the earlier row-by-row version required labels 0..n-1).

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain ``star_rating`` and ``review_body`` columns;
        ``review_body`` values must be strings.

    Returns
    -------
    pandas.DataFrame
        The same ``data_frame`` object, with both columns replaced.
    """
    data_frame['star_rating'] = _map_keep_text_dtype(
        data_frame['star_rating'], _rating_to_class
    )
    data_frame['review_body'] = _map_keep_text_dtype(
        data_frame['review_body'], _clean_review_text
    )
    return data_frame

#Initialization

In [8]:
# Notebook-wide settings: per-class sample size and warning filters.
# Ignore UserWarnings whose originating module name matches 'bs4'.
warnings.filterwarnings(action="ignore", category=UserWarning, module='bs4')

# Number of reviews drawn from each class when the balanced dataset is built.
RANDOM_SAMPLE_SIZE = 20000



#Prepare Balanced Dataset

In [9]:
# Seed for the per-class sampling below, so that Restart & Run All rebuilds
# the same balanced dataset instead of a different random draw each time.
RANDOM_SEED = 42

# reading data
# NOTE(review): loading a pickle can execute arbitrary code -- only read this
# file if its source is trusted.
df = pd.read_pickle("/content/drive/MyDrive/Dataset/data.pkl")
# init_data drops rows, so renumber to 0..n-1 afterwards.
df = init_data(df).reset_index(drop=True)

# 3-classes dataset: the same number of reviews is drawn from each rating band
# (1-2 stars, 3 stars, 4-5 stars). sample() raises ValueError if a band holds
# fewer than RANDOM_SAMPLE_SIZE rows.
class1_df = df[df['star_rating'] <= 2].sample(RANDOM_SAMPLE_SIZE, random_state=RANDOM_SEED)
class2_df = df[df['star_rating'] == 3].sample(RANDOM_SAMPLE_SIZE, random_state=RANDOM_SEED)
class3_df = df[df['star_rating'] >= 4].sample(RANDOM_SAMPLE_SIZE, random_state=RANDOM_SEED)

# Stack the three samples and renumber rows 0..n-1 for data_cleaning.
balanced_df = pd.concat([class1_df, class2_df, class3_df]).reset_index(drop=True)
# data_cleaning compares ratings against the strings '1'..'5', so cast first.
balanced_df['star_rating'] = balanced_df['star_rating'].astype('string')
cleaned_balanced_df = data_cleaning(balanced_df)
print(cleaned_balanced_df)

      star_rating                                        review_body
0         Class 1  these were almost impossible to apply to my na...
1         Class 1  i bought this only because it said there was n...
2         Class 1  this dryer is super noisy even for a hair blow...
3         Class 1  purchased this product several weeks ago looke...
4         Class 1  i have not used this product i received it tod...
...           ...                                                ...
59995     Class 3  i have only used this once but was veery happy...
59996     Class 3  broguht this for my mom she loves liz claiborn...
59997     Class 3  i had my doubts about these press on nail poli...
59998     Class 3  i am a newbie to the manly art of wet shaving ...
59999     Class 3             have not worn yet however very pleased

[60000 rows x 2 columns]


#Task 2a

In [213]:
# Three analogy-style queries against the pretrained Google News vectors:
# each asks for the words closest to (sum of positives - negatives).
example_1, example_2, example_3 = [
    wv.most_similar(positive=positive_words, negative=negative_words)
    for positive_words, negative_words in (
        (['ice', 'sport'], ['walk']),
        (['gas', 'dangerous'], ['stable']),
        (['cold', 'rain'], ['sun']),
    )
]
# One result list per line, in query order.
print(example_1, example_2, example_3, sep='\n')

[('hockey', 0.5072677135467529), ('Melting_polar', 0.49655914306640625), ('Synchronized_skating', 0.4755711555480957), ('sports', 0.4728817343711853), ('dancer_Pasha_Grishuk', 0.440501868724823), ('Ice', 0.4341908097267151), ('inliners', 0.4329093098640442), ('icemaker_Hans_Wuthrich', 0.43183428049087524), ('inline_hockey', 0.4251582622528076), ('floorball', 0.42099007964134216)]
[('natural_gas', 0.4578143358230591), ('gasoline', 0.4454296827316284), ('poisonous_chlorine', 0.4389461278915405), ('hazardous', 0.4383755922317505), ('Lighter_fluid', 0.4313579499721527), ('petroleum', 0.42816662788391113), ('oil', 0.4179289937019348), ('diesel_fuel', 0.41541749238967896), ('carbon_monoxide_colorless_odorless', 0.4021282196044922), ('emit_carbon_monoxide', 0.3903768062591553)]
[('wet_weather', 0.5952470302581787), ('heavy_rain', 0.5799657106399536), ('wet', 0.5452868938446045), ('torrential_rain', 0.5374876856803894), ('rains', 0.5214087963104248), ('downpour', 0.5208485126495361), ('freezin

#Task 2b

In [147]:
# Train a Word2vec model on the cleaned Amazon reviews.
# Each review becomes one training "sentence": a list of whitespace-separated tokens.
sentences = cleaned_balanced_df["review_body"].tolist()
sentences_training = [review.split() for review in sentences]
# NOTE(review): `size` is the gensim 3.x parameter name; newer gensim releases
# call it `vector_size` -- confirm against the installed gensim version.
my_word2vec = gensim.models.Word2Vec(
    sentences=sentences_training,
    size=300,
    window=13,
    min_count=9,
)


In [214]:
# The same three analogy-style queries as in Task 2a, this time answered by
# the model trained on the Amazon review text.
example_1, example_2, example_3 = [
    my_word2vec.wv.most_similar(positive=positive_words, negative=negative_words)
    for positive_words, negative_words in (
        (['ice', 'sport'], ['walk']),
        (['gas', 'dangerous'], ['stable']),
        (['cold', 'rain'], ['sun']),
    )
]
# One result list per line, in query order.
print(example_1, example_2, example_3, sep='\n')


[('aqua', 0.6976382732391357), ('grape', 0.6682608127593994), ('officinalis', 0.6666213274002075), ('defense', 0.6651606559753418), ('cacia', 0.6620233058929443), ('urea', 0.6598890423774719), ('alpha', 0.6543275117874146), ('seed', 0.6529845595359802), ('apricot', 0.6528604030609131), ('neocutis', 0.6516833901405334)]
[('food', 0.5912566184997559), ('voice', 0.5844886302947998), ('corporate', 0.5197523832321167), ('toxic', 0.5187370181083679), ('hygiene', 0.5043749809265137), ('solvent', 0.5040744543075562), ('unknown', 0.49566560983657837), ('animal', 0.4929892420768738), ('nobody', 0.48684537410736084), ('xylitol', 0.48270875215530396)]
[('diffusing', 0.5702681541442871), ('air', 0.5637214183807373), ('smoke', 0.5575622916221619), ('blows', 0.5489580631256104), ('humid', 0.5352522134780884), ('soapy', 0.5325103998184204), ('bathtub', 0.5315570831298828), ('toilet', 0.5305705666542053), ('washer', 0.5228312611579895), ('slippery', 0.5219340920448303)]


#Check GPU and Memory

In [12]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Not connected to a GPU
Your runtime has 37.8 gigabytes of available RAM

You are using a high-RAM runtime!
