#Import Libraries

In [212]:
import gensim
import gensim.downloader as api
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from bs4 import BeautifulSoup
!pip install contractions
import contractions as ct
import re
import warnings


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#Load Word2vec Model

In [198]:
wv = api.load('word2vec-google-news-300')

#Define Functions

In [199]:
def init_data(data_frame):
    data_frame.dropna(inplace=True)
    data_frame.drop_duplicates(inplace=True)
    data_frame['star_rating'] = data_frame['star_rating'].astype('int')
    return data_frame

In [200]:
def data_cleaning(data_frame):
    for i in range(0, len(data_frame)):
        if data_frame['star_rating'][i] == '1' or data_frame['star_rating'][i] == '2':
            data_frame.loc[i, ['star_rating']] = 'Class 1'
        elif data_frame['star_rating'][i] == '3':
            data_frame.loc[i, ['star_rating']] = 'Class 2'
        elif data_frame['star_rating'][i] == '4' or data_frame['star_rating'][i] == '5':
            data_frame.loc[i, ['star_rating']] = 'Class 3'

        review_text = data_frame['review_body'][i]
        # remove un-wanted html tags
        if BeautifulSoup(review_text, "html.parser").find():
            review_text = BeautifulSoup(review_text, "html.parser").get_text("　")
        # text extend contractions
        review_text = ct.fix(review_text)
        # remove non-alphabetical chars
        regex = re.compile('[^a-zA-Z]')
        review_text = regex.sub(' ', review_text)
        # convert to lower case
        review_text = review_text.lower().strip()
        review_text = " ".join(review_text.split())
        # end of data processing
        review_text = " ".join(review_text.split())
        # replace empty string with numpy's nan datatype
        if review_text != "":                  
            data_frame.loc[i, ['review_body']] = review_text
        else:
            data_frame.loc[i, ['review_body']] = np.nan
    return data_frame

In [201]:
def data_prep(data):
    prepared_data = []
    for i in range (0,len(data)):
        words_list = data[i].split()
        vector_sum = np.zeros((300,))
        total_word = len(words_list)
        for word in words_list:
            if word in wv.vocab:
                vector_sum = vector_sum + wv[word]         
        prepared_data.append(vector_sum/total_word)
        
    return np.array(prepared_data)

In [202]:
# Print the training result
def generate_report(y_test, y_pred):
    report = classification_report(y_test, y_pred, zero_division=1, output_dict=True)
    print("Class 1 Precision: " + str(report['Class 1']['precision']) + ", Class 1 Recall: " + str(
        report['Class 1']['recall']) + ", Class 1 f1-score: " + str(report['Class 1']['f1-score']))
    print("Class 2 Precision: " + str(report['Class 2']['precision']) + ", Class 2 Recall: " + str(
        report['Class 2']['recall']) + ", Class 2 f1-score: " + str(report['Class 2']['f1-score']))
    print("Class 3 Precision: " + str(report['Class 3']['precision']) + ", Class 3 Recall: " + str(
        report['Class 3']['recall']) + ", Class 3 f1-score: " + str(report['Class 3']['f1-score']))
    print("Average Precision: " + str(report['macro avg']['precision']) + ", Averagage Recall: " + str(
        report['macro avg']['recall']) + ", Averagage f1-score: " + str(
        report['macro avg']['f1-score']))
    print("\n")

#Initialization

In [203]:
RANDOM_SAMPLE_SIZE = 20000
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')



#Prepare Balanced Dataset

In [204]:
# reading data
df = pd.read_pickle("/content/drive/MyDrive/Dataset/data.pkl")
df = init_data(df).reset_index(drop=True)

# 3-classes dataset
class1_df = df[df['star_rating'] <= 2].sample(RANDOM_SAMPLE_SIZE)
class2_df = df[df['star_rating'] == 3].sample(RANDOM_SAMPLE_SIZE)
class3_df = df[df['star_rating'] >= 4].sample(RANDOM_SAMPLE_SIZE)

balanced_df = pd.concat([class1_df, class2_df, class3_df]).reset_index(drop=True)
balanced_df['star_rating'] = balanced_df['star_rating'].astype('string')
cleaned_balanced_df = data_cleaning(balanced_df)
cleaned_balanced_df.dropna(inplace=True)

#Task 2a

In [205]:
# 3 examples using word2vec-google-news-300
example_1 = wv.most_similar(positive=['ice','sport'], negative=['walk'])
example_2 = wv.most_similar(positive=['gas', 'dangerous'], negative=['stable'])
example_3 = wv.most_similar(positive=['cold', 'rain'], negative=['sun'])
print("ice + sport - walk ~= " + str(example_1[0]))
print("gas + dangerous - stable ~= " + str(example_2[0]))
print("cold + rain - sun ~= " + str(example_3[0]))

ice + sport - walk ~= ('hockey', 0.5072677135467529)
gas + dangerous - stable ~= ('natural_gas', 0.4578143358230591)
cold + rain - sun ~= ('wet_weather', 0.5952470302581787)


#Task 2b

In [206]:
sentences = cleaned_balanced_df["review_body"].tolist()
sentences_training = [index.split() for index in sentences ]
# Train Word2vec model with Amazon review data
my_word2vec = gensim.models.Word2Vec(sentences_training , size=300, window=13, min_count=9)


In [207]:
# 3 examples using provided Amazon review
example_1 = my_word2vec.wv.most_similar(positive=['ice','sport'], negative=['walk'])
example_2 = my_word2vec.wv.most_similar(positive=['gas', 'dangerous'], negative=['stable'])
example_3 = my_word2vec.wv.most_similar(positive=['cold', 'rain'], negative=['sun'])
print("ice + sport - walk ~= " + str(example_1[0]))
print("gas + dangerous - stable ~= " + str(example_2[0]))
print("cold + rain - sun ~= " + str(example_3[0]))


ice + sport - walk ~= ('benzyl', 0.7632110118865967)
gas + dangerous - stable ~= ('consumers', 0.6139011383056641)
cold + rain - sun ~= ('toilet', 0.665246307849884)


#Task 3

####Split dataset into Training and Testing Set

In [208]:
X_train, X_test, y_train, y_test = train_test_split(cleaned_balanced_df['review_body'], cleaned_balanced_df['star_rating'], test_size=0.2)

#### Convert Datasets to Correct Format

In [209]:
X_train_np = data_prep(X_train.to_numpy()) 
X_test_np = data_prep(X_test.to_numpy()) 
y_train_np = y_train.to_numpy()
y_test_np =  y_test.to_numpy()

#### Train Perceptron

In [218]:
clf_perceptron = Perceptron()
clf_perceptron = clf_perceptron.fit(X_train_np, y_train_np)
y_pred_perceptron = clf_perceptron.predict(X_test_np)
generate_report(y_test_np, y_pred_perceptron)

Class 1 Precision: 0.7178147268408551, Class 1 Recall: 0.3783174762143215, Class 1 f1-score: 0.49549106410887034
Class 2 Precision: 0.414054426455391, Class 2 Recall: 0.8945671049367403, Class 2 f1-score: 0.5660910518053376
Class 3 Precision: 0.8968723584108199, Class 3 Recall: 0.2671198388721047, Class 3 f1-score: 0.411639185257032
Average Precision: 0.6762471705690221, Averagage Recall: 0.5133348066743889, Averagage f1-score: 0.49107376705708




####Train Linear SVC

In [217]:
clf_linear_svc = LinearSVC()
clf_linear_svc = clf_linear_svc.fit(X_train_np, y_train_np)
y_pred_linear_svc = clf_linear_svc.predict(X_test_np)
generate_report(y_test_np, y_pred_linear_svc)

Class 1 Precision: 0.6594185194234058, Class 1 Recall: 0.6757636454682023, Class 1 f1-score: 0.6674910349944355
Class 2 Precision: 0.586355089981198, Class 2 Recall: 0.5415529645249317, Class 2 f1-score: 0.5630642249161723
Class 3 Precision: 0.7060511839272902, Class 3 Recall: 0.743202416918429, Class 3 f1-score: 0.7241506194039005
Average Precision: 0.6506082644439647, Averagage Recall: 0.6535063423038544, Averagage f1-score: 0.6515686264381694




#check GPU and Memory

In [211]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Not connected to a GPU
Your runtime has 37.8 gigabytes of available RAM

You are using a high-RAM runtime!
