In [1]:
import re
import string
import functools
import operator 
import json
import spacy
import numpy as np
import pandas as pd
import matplotlib as plt
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from dataclasses import dataclass

# Load the 'en_core_web_sm' spaCy pipeline with the 'ner' and 'parser'
# components disabled; RevDataset.tok_lemma runs review text through it.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
# Stop-word collection taken from the loaded pipeline's language defaults;
# referenced by RevDataset.tok_lemma when filtering tokens.
stopwords = nlp.Defaults.stop_words

In [2]:
@dataclass
class cfg:
    """Hyperparameter container.

    The attributes carry type annotations so that ``@dataclass`` registers
    them as real fields. Without annotations the decorator ignores them,
    which means the generated ``__init__``/``__repr__``/``__eq__`` know
    nothing about them and ``cfg(batch_size=64)`` raises ``TypeError``.

    Class-level access (``cfg.batch_size``, ``cfg.lr``) keeps working because
    dataclass defaults remain class attributes.
    """

    # presumably the mini-batch size handed to a DataLoader -- confirm at use site
    batch_size: int = 32
    # presumably the optimizer learning rate -- confirm at use site
    lr: float = 3e-4

In [3]:
class RevDataset(Dataset):
    """Dataset of cleaned, lemmatized review texts paired with their rating.

    Construction reads a JSON-lines file, keeps only rows whose ``verified``
    field is true, drops rows with missing values, cleans and lemmatizes
    ``reviewText`` and writes the resulting frame to a CSV file.

    Args:
        path: Location of the JSON-lines file. Each non-blank line must be a
            JSON object containing ``verified``, ``reviewText`` and
            ``overall``.
        output_path: Where the preprocessed frame is written as CSV.
            Defaults to ``'data/preprocessed.csv'``.

    Attributes:
        path: The ``path`` argument as given.
        data: ``pandas.DataFrame`` with columns ``reviewText`` and
            ``overall``.
    """

    def __init__(self, path, output_path='data/preprocessed.csv'):
        self.path = path

        # Read from the path the caller supplied (it used to be ignored in
        # favour of a hard-coded file name). The encoding is explicit so the
        # result does not depend on the platform default, and blank lines are
        # skipped instead of crashing json.loads.
        with open(self.path, 'r', encoding='utf-8') as f:
            lines = [json.loads(line) for line in f if line.strip()]

        frame = pd.DataFrame(lines)[['verified', 'reviewText', 'overall']]
        frame = frame[frame['verified']]
        frame = frame.dropna().reset_index(drop=True).drop('verified', axis=1)
        frame['reviewText'] = frame['reviewText'].apply(self.clean_data)

        self.data = frame
        self.tok_lemma()

        self.data.to_csv(output_path)

    def __len__(self):
        """Return the number of reviews kept after filtering."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(reviewText, overall)`` for the row at position ``index``.

        Raises:
            AssertionError: If ``index`` is not an integer. NumPy integer
                scalars are accepted in addition to built-in ``int``.
        """
        is_integer = isinstance(index, (int, np.integer)) and not isinstance(index, bool)
        assert is_integer, 'Index must be int'

        item = self.data.iloc[index]

        return item['reviewText'], item['overall']

    @staticmethod
    def clean_data(text):
        """Normalize one raw review string.

        Steps, in order: lowercase, collapse each run of line breaks to a
        single space, delete URLs, strip ASCII punctuation.

        Args:
            text: Raw review text.

        Returns:
            The cleaned string, or ``'   '`` when ``text`` is not a ``str``.
        """
        if not isinstance(text, str):
            return '   '

        # lowercase
        text = text.lower()

        # Collapse any run of line breaks (including \r) into one space; the
        # previous chained replaces only coped with runs of up to four "\n".
        text = re.sub(r'[\r\n]+', ' ', text)

        # Remove only the URL token itself. Line breaks are already gone at
        # this point, so a greedy ".*" would delete the rest of the review.
        text = re.sub(r'https?://\S+', '', text)

        # Strip punctuation last: doing it earlier would mangle "http://"
        # and stop the URL pattern from matching.
        text = text.translate(str.maketrans('', '', string.punctuation))

        return text

    def tok_lemma(self):
        """Replace each review with its space-joined lemmas.

        Tokens are kept only when their text is alphabetic, is not in the
        module-level ``stopwords`` collection and has a non-empty lemma.
        ``self.data['reviewText']`` is overwritten in place.
        """
        texts = self.data['reviewText'].values

        # NOTE(review): batch_size=1 together with n_process=3 is kept from
        # the original; a larger batch is likely much faster -- confirm.
        docs = nlp.pipe(texts, batch_size=1, n_process=3)

        # Compare the token's *text* with the stop-word strings: a spaCy
        # Token object is not a str, so testing the Token itself for
        # membership never matched and no stop word was ever removed.
        lemmatized = [
            ' '.join(
                tok.lemma_
                for tok in doc
                if tok.text not in stopwords and tok.lemma_ and tok.text.isalpha()
            )
            for doc in docs
        ]

        # Assign the list positionally rather than through a fresh Series,
        # so the result does not depend on the frame having a 0..n-1 index.
        self.data['reviewText'] = lemmatized

In [4]:
# Build the dataset. Construction does all the preprocessing eagerly: it loads
# the reviews, cleans and lemmatizes them, and writes data/preprocessed.csv.
data = RevDataset('data/Video_Games_5.json')