In [1]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.2.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: gensim
Successfully installed gensim-4.2.0
[0m

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from modules.TextCleaner import Cleaner
from modules.TextPreparation import TextPreparation
import regex as re
import pickle
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score

In [2]:
# Pick the compute device: default to the CPU and switch to CUDA only when
# torch reports that a GPU is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Using {} device".format(device))

Using cuda device


In [3]:
# Load the ratings file once and carve it into two disjoint row ranges:
#   df          - rows 0..34999, used for the train/test split
#   df_end_test - rows 35000..36999, pickled unchanged at the end of the notebook
# The file was previously parsed twice; a single read gives the same frames.
ratings = pd.read_csv('data/yelp_ratings.csv')

# .copy() makes both slices independent frames, so adding columns to df later
# never writes into (or warns about) the frame it was sliced from.
df = ratings.iloc[0:35000, :].copy()
df_end_test = ratings.iloc[35000:37000, :].copy()

# The full frame is not needed once the two slices exist.
del ratings

print(df.shape)
print(df.head(3))

(35000, 3)
                                                text  stars  sentiment
0  Total bill for this horrible service? Over $8G...    1.0          0
1  I *adore* Travis at the Hard Rock's new Kelly ...    5.0          1
2  I have to say that this office really has it t...    5.0          1


In [4]:
# Clean every review text, then split the rows into a train and a test frame.

cleaner = Cleaner()
textPrepare = TextPreparation()

# Handed to split_data; presumably the fraction of rows kept for training
# (TODO confirm against TextPreparation.split_data).
split_ratio = 0.8

# The cleaned text is stored in a new column; the raw 'text' column is kept.
df['cleaned_text'] = df['text'].apply(cleaner.clean_text)
train_df, test_df = textPrepare.split_data(df, split_ratio)

#df['text_lens'] = df['cleaned_text'].apply(lambda x : len(x))
#sorted_lens = np.sort(df['text_lens'])
#plt.figure(figsize=(15, 8))
#plt.plot(np.arange(0, 100, 100/len(sorted_lens)), sorted_lens)
#plt.xlabel('Percent texts')
#plt.ylabel('Num tokens')
#plt.grid()
#plt.show()

In [5]:
# Build word embeddings from the training frame only.

emb_size = 90   # embedding width; later cells size their arrays with it
w_size = 2      # forwarded to create_word_embeddings (presumably the context window - confirm)
min_count = 1   # forwarded to create_word_embeddings (presumably the minimum token count - confirm)
save = True     # forwarded to create_word_embeddings (presumably persists the result - confirm)

embeddings = textPrepare.create_word_embeddings(
    train_df,
    emb_size,
    w_size,
    min_count,
    save,
)

num sentences: 218826


In [6]:
# Turn each cleaned text into its embedding representation, then rebalance the
# training frame.

# Passed to vectorise_texts; a later cell allocates (tokens_len, emb_size) per text.
tokens_len = 150

# Both frames get the same treatment, so apply it in one loop. Assigning through
# `frame` adds the column to the very same DataFrame objects.
for frame in (train_df, test_df):
    frame['vectorised_texts'] = frame['cleaned_text'].apply(
        lambda text: textPrepare.vectorise_texts(text, embeddings, tokens_len)
    )

# Only the training frame is rebalanced; the test frame is left as split.
train_df = textPrepare.rebalance(train_df)

In [7]:
# Compress each text's token matrix into a single vector of width emb_size.
train_df['compressed_texts'] = train_df['vectorised_texts'].apply(textPrepare.compress_texts)
test_df['compressed_texts'] = test_df['vectorised_texts'].apply(textPrepare.compress_texts)

# Drop every row that has a missing value in any column.
train_df = train_df.dropna(axis=0)
test_df = test_df.dropna(axis=0)

# Copy the per-text vectors into dense (n_rows, emb_size) arrays.
# Each frame gets its own loop and the column is selected by name: the earlier
# version filled the test matrix inside the train-sized loop (so test rows past
# the train length would silently stay zero) and picked the column by position.
X_train = np.zeros((len(train_df), emb_size))
for i, vector in enumerate(train_df['compressed_texts']):
    X_train[i, :] = vector

X_test = np.zeros((len(test_df), emb_size))
for i, vector in enumerate(test_df['compressed_texts']):
    X_test[i, :] = vector

# Labels as (n_rows, 1) column vectors so they can be attached as the last column.
y_train = train_df['sentiment'].to_numpy().reshape(len(train_df), 1)
y_test = test_df['sentiment'].to_numpy().reshape(len(test_df), 1)

# Layout of the saved arrays: emb_size feature columns followed by the label.
train = np.hstack([X_train, y_train])
test = np.hstack([X_test, y_test])

# Discard any row that still contains a NaN in a feature or in the label.
train = train[~np.isnan(train).any(axis=1), :]
test = test[~np.isnan(test).any(axis=1), :]

with open('data/train_fcnn', 'wb') as fp:
    pickle.dump(train, fp)
with open('data/test_fcnn', 'wb') as fp:
    pickle.dump(test, fp)

In [8]:
# Build the per-token arrays: one (tokens_len, emb_size) matrix per text, with
# the text's label appended as an extra last feature at every token position.

# Each frame gets its own loop and the column is selected by name: the earlier
# version filled the test array inside the train-sized loop (so test rows past
# the train length would silently stay zero) and picked the column by position.
X_train_trans = np.zeros((len(train_df), tokens_len, emb_size))
for i, token_matrix in enumerate(train_df['vectorised_texts']):
    X_train_trans[i, :, :] = token_matrix

X_test_trans = np.zeros((len(test_df), tokens_len, emb_size))
for i, token_matrix in enumerate(test_df['vectorised_texts']):
    X_test_trans[i, :, :] = token_matrix

# Repeat every label across all token positions -> (n_rows, tokens_len), float.
y_train_trans = np.tile(
    train_df['sentiment'].to_numpy(dtype=float).reshape(len(train_df), 1),
    (1, tokens_len),
)
y_test_trans = np.tile(
    test_df['sentiment'].to_numpy(dtype=float).reshape(len(test_df), 1),
    (1, tokens_len),
)

# Attach the labels as feature index emb_size -> (n_rows, tokens_len, emb_size + 1).
train_trans = np.concatenate([X_train_trans, y_train_trans[:, :, np.newaxis]], axis=2)
test_trans = np.concatenate([X_test_trans, y_test_trans[:, :, np.newaxis]], axis=2)

print(test_trans.shape)
print(train_trans.shape)

with open('data/train_trans', 'wb') as fp:
    pickle.dump(train_trans, fp)
with open('data/test_trans', 'wb') as fp:
    pickle.dump(test_trans, fp)

(7000, 150, 91)
(41806, 150, 91)


In [9]:
# Persist the untouched hold-out rows (df_end_test) exactly as they were loaded,
# so they can be used for a final evaluation elsewhere.
with open('data/test_dataset', 'wb') as holdout_file:
    pickle.dump(df_end_test, holdout_file)