In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame

In [None]:
import warnings
# Silence every warning category for the rest of the session (notebook-wide).
warnings.simplefilter("ignore")

In [None]:
from config import *

# Load the labelled reviews from the path configured in `config`.
data = pd.read_csv(DATA_FILE)
data['review'] = data['review'].astype('string')
# Encode the labels as integers. The explicit cast avoids depending on
# `replace` silently downcasting the object column to int (deprecated in
# recent pandas), and fails here rather than at tensor creation if an
# unexpected label is present.
data['sentiment'] = (
    data['sentiment']
    .replace({'positive': 1, 'negative': 0})
    .astype('int64')
)
# Keep only the first 75% of the rows; copy so the result is an independent frame.
data = data.iloc[:int(len(data) * 0.75)].copy()

In [None]:
from typing import List
from collections import Counter
from itertools import chain

def to_list(df: DataFrame) -> List[str]:
    """Return the entries of ``df.values`` as a plain Python list."""
    return [entry for entry in df.values]

def word_stat(texts: List[str]) -> Counter:
    """Count how often each whitespace-separated token occurs across ``texts``."""
    counts = Counter()
    for line in texts:
        counts.update(line.split())
    return counts

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' corpus; this call is issued every time the cell runs.
nltk.download('stopwords')

# English stopword list; filter_text drops every word found in it.
eng_stopwords = stopwords.words('english')

In [None]:
import re
from nltk.stem import PorterStemmer
from tqdm import tqdm

# Single Porter stemmer instance shared by every filter_text call.
stemmer = PorterStemmer()

def preprocess_text(text: str) -> str:
    """Lower-case ``text`` and replace each HTML-style tag with a space.

    Args:
        text: Raw review text, possibly containing tags such as ``<br />``.

    Returns:
        The lower-cased text with every ``<...>`` tag replaced by one space.
    """
    text = text.lower()
    # Match one tag at a time. The previous greedy `<.*>` removed everything
    # between the first '<' and the last '>' on a line, i.e. the review text
    # sitting between two tags. A space (not '') is substituted so the words on
    # either side of a tag are not glued together.
    text = re.sub(r'<[^>]*>', ' ', text)
    return text

def filter_text(text: str) -> str:
    """Strip non-letters, drop English stopwords and stem the remaining words.

    Args:
        text: Text to clean (callers pass the output of ``preprocess_text``).

    Returns:
        The stemmed, stopword-free words joined by single spaces.
    """
    # Set membership is O(1); `eng_stopwords` itself is a list.
    stopword_set = frozenset(eng_stopwords)

    # Turn every non-letter into a space instead of deleting it. Deleting
    # merged neighbouring words ("well-made" -> "wellmade", "a\nb" -> "ab") and
    # turned contractions into tokens absent from the stopword list
    # ("don't" -> "dont"); split() below collapses the extra spaces.
    letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in text)

    kept = (word for word in letters_only.split() if word not in stopword_set)
    return ' '.join(stemmer.stem(word) for word in kept)

# Clean every review (tag removal -> filtering -> stemming) with a progress bar.
texts = [
    filter_text(preprocess_text(review))
    for review in tqdm(data.review.tolist())
]

# Word frequencies over the cleaned corpus; show the 30 most frequent words.
stat = word_stat(texts)
stat.most_common(30)

In [None]:
# Token lists, one per cleaned review.
words = list(map(str.split, texts))

# Vocabulary in reverse alphabetical order; a word's position is its integer id.
sorted_words = sorted(stat, reverse=True)
word_indx = {word: position for position, word in enumerate(sorted_words)}

# Despite the name, these are sequences of integer word ids, not vectors.
word_embeddings = [[word_indx[token] for token in tokens] for tokens in words]

In [None]:
# Right-pad every id sequence with zeros up to the length of the longest one.
# NOTE(review): 0 is also the id assigned to the first entry of `sorted_words`,
# so padding is indistinguishable from that word — consider reserving an id.
m = max(len(sequence) for sequence in word_embeddings)
X = [sequence + [0] * (m - len(sequence)) for sequence in word_embeddings]
Y = data.sentiment.values

In [None]:
from sklearn.model_selection import train_test_split
import torch as tch
from torch import tensor

# 60/40 train/test split with a fixed seed; each part is converted to a tensor
# in the order train_test_split returns them.
X_train, X_test, Y_train, Y_test = map(
    tensor,
    train_test_split(X, Y, test_size=0.4, random_state=23),
)

In [None]:
class CustomModel(tch.nn.Module):
    """Embedding -> GRU -> linear classifier over sequences of word ids.

    The recurrent layer is a ``GRU`` even though the attribute is called
    ``lstm``; the attribute name is left unchanged so parameter names stay
    the same.
    """

    def __init__(self, h_size, out_size, count_words):
        """Create the layers.

        Args:
            h_size: Size of the embedding vectors and of the GRU hidden state.
            out_size: Number of output logits.
            count_words: Number of rows in the embedding table.
        """
        super().__init__()
        self.embedding = tch.nn.Embedding(count_words, h_size)
        # forward() feeds (batch, seq, h_size) tensors and reads the last step
        # with [:, -1, :], so the GRU must be batch-first. With the default
        # layout it ran its recurrence across the samples of the batch rather
        # than across the tokens of each sample.
        self.lstm = tch.nn.GRU(h_size, h_size, batch_first=True)
        self.lin = tch.nn.Linear(h_size, out_size)

    def forward(self, x):
        """Map word ids of shape (batch, seq) to logits of shape (batch, out_size)."""
        embedded = self.embedding(x)
        # GRU returns (outputs, final_hidden); only the per-step outputs are used.
        outputs, _ = self.lstm(embedded)
        # Classify from the output at the last sequence position.
        return self.lin(outputs[:, -1, :])

In [None]:
from sklearn.metrics import precision_score as prec_score

# Optimisation settings.
lr = 0.001          # learning rate handed to the optimiser
epochs = 20         # number of full-batch training steps

# Model dimensions.
hidden_size = 10                # embedding size and GRU hidden size
output_size = 2                 # number of output logits
vocab_size = len(word_indx)     # one embedding row per distinct word

In [None]:
# Per-epoch training loss, stored as plain floats.
result = []

# Seed torch so weight initialisation (and therefore the run) is repeatable;
# same value as the train/test split's random_state.
tch.manual_seed(23)

model = CustomModel(hidden_size, output_size, vocab_size)
crit = tch.nn.CrossEntropyLoss()
optim = tch.optim.RMSprop(model.parameters(), lr=lr)

# Full-batch training: one optimiser step per epoch over the whole training set.
model.train()
for epoch in range(epochs):
    optim.zero_grad()
    loss = crit(model(X_train), Y_train)
    loss.backward()
    optim.step()
    # .item() stores a Python float; appending the tensor itself kept every
    # epoch's autograd graph alive and left grad-tracking tensors to plot.
    result.append(loss.item())

# Evaluate: predicted class is the arg-max logit; report precision on train and test.
model.eval()
with tch.no_grad():
    test_res = tch.argmax(model(X_test), dim=1).numpy()
    train_res = tch.argmax(model(X_train), dim=1).numpy()

    print(prec_score(Y_train, train_res), prec_score(Y_test, test_res))

In [None]:
# Training-loss curve, one point per recorded epoch.
# float() accepts both plain numbers and 0-d tensors, so the values are
# converted explicitly; no_grad() does not detach tensors that were already
# recorded with a graph.
losses = [float(value) for value in result]

fig, ax = plt.subplots()
ax.plot(range(len(losses)), losses)
ax.set(xlabel="epoch", ylabel="training loss", title="Training loss per epoch");