In [1]:
# essentials
import pandas as pd
import os 
from collections import Counter

from utils import preprocess

In [2]:
# Number of rows drawn for the small sample export at the end of the notebook.
sample_size: int = 1_500

## Loading the data

In [3]:
# Walk the review folders, collecting each review's text and its sentiment label.

def load_reviews(data_dir):
    """Read every file under ``data_dir`` and label it by its containing folder.

    A file is labelled positive (``True``) when the directory that directly
    contains it is named ``pos``; every other file is labelled ``False``.
    Files that raise ``UnicodeDecodeError`` while being read are reported via
    ``print`` and skipped, so the two returned lists always stay aligned.

    Parameters
    ----------
    data_dir : str
        Root directory that is walked recursively.

    Returns
    -------
    tuple[list[str], list[bool]]
        The review texts and their labels, in the order ``os.walk`` yields them.
    """
    texts = []
    labels = []

    for root, _dirs, files in os.walk(data_dir):
        # Compare only the last path component. The previous check compared the
        # whole path against "./reviews\pos", which never matches a path rooted
        # at './data/reviews', so every review ended up labelled False.
        # Using the basename also keeps the check independent of the OS separator.
        is_positive = os.path.basename(os.path.normpath(root)) == "pos"

        for filename in files:
            try:
                # `with` guarantees the handle is closed, even if decoding fails.
                # NOTE(review): still relies on the platform default encoding;
                # consider passing encoding= once the files' encoding is confirmed.
                with open(os.path.join(root, filename)) as file:
                    text = file.read()
            except UnicodeDecodeError:
                print(f"File: {filename} wasn't loaded successfully.")
                continue

            # Append text and label together so both lists keep the same length.
            texts.append(text)
            labels.append(is_positive)

    return texts, labels


reviews, positive = load_reviews('./data/reviews')

In [4]:
# Create the DataFrame column-by-column.
# Building it from a dict gives each column its own dtype (`positive` becomes a
# real boolean column). The earlier list-of-lists + transpose left both columns
# as generic `object` dtype and needed a separate rename step.
reviews_df = pd.DataFrame({"review": reviews, "positive": positive})

In [5]:
# Display the frame to eyeball the loaded reviews and their labels.
reviews_df

Unnamed: 0,review,positive
0,Story of a man who has unnatural feelings for ...,False
1,Airport '77 starts as a brand new luxury 747 p...,False
2,This film lacked something I couldn't put my f...,False
3,"Sorry everyone,,, I know this is supposed to b...",False
4,When I was little my parents took me along to ...,False
...,...,...
24994,"Seeing as the vote average was pretty low, and...",False
24995,"The plot had some wretched, unbelievable twist...",False
24996,I am amazed at how this movie(and most others ...,False
24997,A Christmas Together actually came before my t...,False


## Preprocessing

In [6]:
# Add a `clean_text` column: each raw review passed through the project's
# general preprocessing helper.
raw_review_column = reviews_df["review"]
reviews_df["clean_text"] = raw_review_column.apply(preprocess.general_preprocess)

In [7]:
# Add a `tokenized` column: each cleaned review passed through the project's
# tokenizer.
clean_text_column = reviews_df["clean_text"]
reviews_df["tokenized"] = clean_text_column.apply(preprocess.tokenize)

In [10]:
# Display the frame again, now including the `clean_text` and `tokenized` columns.
reviews_df

Unnamed: 0,review,positive,clean_text,tokenized
0,Story of a man who has unnatural feelings for ...,False,story of a man who has unnatural feelings for ...,"[story, of, a, man, who, has, unnatural, feeli..."
1,Airport '77 starts as a brand new luxury 747 p...,False,airport starts as a brand new luxury plane i...,"[airport, starts, as, a, brand, new, luxury, p..."
2,This film lacked something I couldn't put my f...,False,this film lacked something i couldnt put my fi...,"[this, film, lacked, something, i, couldnt, pu..."
3,"Sorry everyone,,, I know this is supposed to b...",False,sorry everyone i know this is supposed to be a...,"[sorry, everyone, i, know, this, is, supposed,..."
4,When I was little my parents took me along to ...,False,when i was little my parents took me along to ...,"[when, i, was, little, my, parents, took, me, ..."
...,...,...,...,...
24994,"Seeing as the vote average was pretty low, and...",False,seeing as the vote average was pretty low and ...,"[seeing, as, the, vote, average, was, pretty, ..."
24995,"The plot had some wretched, unbelievable twist...",False,the plot had some wretched unbelievable twists...,"[the, plot, had, some, wretched, unbelievable,..."
24996,I am amazed at how this movie(and most others ...,False,i am amazed at how this movieand most others h...,"[i, am, amazed, at, how, this, movieand, most,..."
24997,A Christmas Together actually came before my t...,False,a christmas together actually came before my t...,"[a, christmas, together, actually, came, befor..."


In [16]:
# Persist the full preprocessed frame; the row index is left out of the file.
cleaned_csv_path = "data/reviews_cleaned.csv"
reviews_df.to_csv(cleaned_csv_path, index=False)

In [17]:
# Draw `sample_size` random rows and export them as a smaller working set.
# A fixed `random_state` makes the sample (and therefore the exported CSV)
# identical on every Restart & Run All; the value 42 itself is arbitrary.
small_reviews_df = reviews_df.sample(sample_size, random_state=42)
small_reviews_df.to_csv("data/reviews_cleaned_sample.csv", index=False)