In [1]:
# essentials
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os 
import re
import string
from collections import Counter
import pickle

# text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import config

In [20]:
sample_size = 1500

In [4]:
reviews = []
positive = []

for root, dirs, files in os.walk('./data/reviews'):
    for filename in files:
        file = open(os.path.join(root, filename))
        try:
            text = file.read()
            reviews.append(text)

            if root == "./reviews\pos":
                positive.append(True)
            else:
                positive.append(False)
        except UnicodeDecodeError:
            print(f"File: {filename} wasn't loaded successfully.")

In [5]:
reviews_df = pd.DataFrame([reviews, positive])
reviews_df = reviews_df.T
reviews_df.columns = ["review", "positive"]

In [6]:
reviews_df

Unnamed: 0,review,positive
0,Story of a man who has unnatural feelings for ...,False
1,Airport '77 starts as a brand new luxury 747 p...,False
2,This film lacked something I couldn't put my f...,False
3,"Sorry everyone,,, I know this is supposed to b...",False
4,When I was little my parents took me along to ...,False
...,...,...
24994,"Seeing as the vote average was pretty low, and...",False
24995,"The plot had some wretched, unbelievable twist...",False
24996,I am amazed at how this movie(and most others ...,False
24997,A Christmas Together actually came before my t...,False


## Preprocessing

In [None]:
nltk.download('stopwords')

In [40]:
stop_words = stopwords.words('english')

In [7]:
# removing all punctuations except commas

custom_punctuation = string.punctuation.replace(',', '')
custom_punctuation = custom_punctuation.replace('.', '')

In [8]:
# TODO: add removing html parts

def general_preprocess(text):
    # Removing numbers
    text_number = re.sub(r'\d+', '', text)

    # Lowering text
    text_lower = text_number.lower()

    # removing punctuation
    text_no_punctuations = text_lower.translate(str.maketrans('', '', custom_punctuation))

    # making one type of commas
    text_unified_commas = text_no_punctuations.replace('.', ',')

    return text_unified_commas

In [9]:
reviews_df["clean_text"] = reviews_df.review.apply(general_preprocess)

In [10]:
reviews_df["tokenized"] = reviews_df.clean_text.apply(word_tokenize)

In [11]:
all_words = []

for tokenized_reviews in reviews_df.tokenized:
    all_words.extend(tokenized_reviews)

In [12]:
Counter(all_words)

Counter({'story': 11703,
         'of': 145395,
         'a': 162459,
         'man': 5291,
         'who': 20456,
         'has': 16735,
         'unnatural': 49,
         'feelings': 395,
         'for': 44082,
         'pig': 91,
         ',': 601750,
         'starts': 1221,
         'out': 16544,
         'with': 44004,
         'opening': 961,
         'scene': 5272,
         'that': 69644,
         'is': 107126,
         'terrific': 430,
         'example': 1363,
         'absurd': 303,
         'comedy': 3087,
         'formal': 27,
         'orchestra': 46,
         'audience': 2127,
         'turned': 888,
         'into': 9094,
         'an': 21505,
         'insane': 237,
         'violent': 508,
         'mob': 152,
         'by': 22406,
         'the': 335688,
         'crazy': 630,
         'chantings': 1,
         'its': 25267,
         'singers': 75,
         'unfortunately': 1352,
         'it': 78664,
         'stays': 182,
         'whole': 3067,
         'time': 12

In [14]:
vocabulary = [word for word, quantity in Counter(all_words).items() if quantity >= config.MIN_COUNT]

In [15]:
vocabulary_size = len(vocabulary)

In [16]:
reviews_df

Unnamed: 0,review,positive,preprocessed,tokenized
0,Story of a man who has unnatural feelings for ...,False,story of a man who has unnatural feelings for ...,"[story, of, a, man, who, has, unnatural, feeli..."
1,Airport '77 starts as a brand new luxury 747 p...,False,airport starts as a brand new luxury plane i...,"[airport, starts, as, a, brand, new, luxury, p..."
2,This film lacked something I couldn't put my f...,False,this film lacked something i couldnt put my fi...,"[this, film, lacked, something, i, couldnt, pu..."
3,"Sorry everyone,,, I know this is supposed to b...",False,"sorry everyone,,, i know this is supposed to b...","[sorry, everyone, ,, ,, ,, i, know, this, is, ..."
4,When I was little my parents took me along to ...,False,when i was little my parents took me along to ...,"[when, i, was, little, my, parents, took, me, ..."
...,...,...,...,...
24994,"Seeing as the vote average was pretty low, and...",False,"seeing as the vote average was pretty low, and...","[seeing, as, the, vote, average, was, pretty, ..."
24995,"The plot had some wretched, unbelievable twist...",False,"the plot had some wretched, unbelievable twist...","[the, plot, had, some, wretched, ,, unbelievab..."
24996,I am amazed at how this movie(and most others ...,False,i am amazed at how this movieand most others h...,"[i, am, amazed, at, how, this, movieand, most,..."
24997,A Christmas Together actually came before my t...,False,a christmas together actually came before my t...,"[a, christmas, together, actually, came, befor..."


In [19]:
reviews_df.to_csv("data/reviews_cleaned.csv", index=False)

In [21]:
small_reviews_df = reviews_df[:sample_size]
small_reviews_df.to_csv("data/reviews_cleaned_sample.csv", index=False)