In [53]:
# preprocessing
import string
from glob import glob

# glob() yields matches in arbitrary, filesystem-dependent order. Sort them so
# the files are loaded (and later split) in the same order on every run.
data_files = sorted(glob('./data/*_labelled.txt'))
# Maps each file path to the list of lines read from that file.
data = dict()

# Whitelist of characters kept when reading the files.
printable = set(string.printable)

def return_lines_from(file):
    """Read a text file and return its lines with non-printable characters removed.

    Args:
        file: Path of the file to read.

    Returns:
        list[str]: The file's content split into lines (line terminators
        dropped), keeping only characters present in ``printable``.
    """
    # Open the path that was passed in. Reading a global name here instead of
    # the argument would make the function depend on the caller's variable
    # naming and silently ignore `file`.
    with open(file) as f:
        text = f.read()
    # Drop every character outside the printable whitelist before splitting.
    text = ''.join(ch for ch in text if ch in printable)
    return text.splitlines()

# Read every labelled file once and index its cleaned lines by file path.
for filename in data_files:
    data.update({filename: return_lines_from(filename)})


In [48]:
import numpy as np
import enchant
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

class SentenceCleaner:
    """Turn a raw sentence into a list of lowercase, stopword-free, stemmed tokens.

    Attributes:
        dictionary: ``enchant`` en_US dictionary used to validate stems.
        stemmer: NLTK ``PorterStemmer`` instance.
        stopwords: Set of NLTK English stopwords removed from sentences.
        punc_remover: ``str.translate`` table deleting ``string.punctuation``.
    """

    def __init__(self):
        self.dictionary = enchant.Dict("en_US")
        self.stemmer = PorterStemmer()
        self.stopwords = set(stopwords.words("english"))
        self.punc_remover = str.maketrans('', '', string.punctuation)

    def stem(self, word):
        """Return the stem of ``word`` if the dictionary accepts it, else ``word`` itself.

        Args:
            word: A single token.

        Returns:
            str: The stemmed token when ``self.dictionary.check`` is true for
            it, otherwise the original ``word`` unchanged.
        """
        stemmed = self.stemmer.stem(word)
        return stemmed if self.dictionary.check(stemmed) else word

    def clean_sentence(self, sentence):
        """Lowercase, strip punctuation, tokenize, drop stopwords and stem.

        Args:
            sentence: Raw sentence text.

        Returns:
            list[str]: Processed tokens in their original order.
        """
        # Use the instance's translation table: the bare name `punc_remover`
        # is not defined in this scope and raised NameError.
        review_text = sentence.lower().translate(self.punc_remover)

        # Tokenize the review text into words and remove stopwords.
        # Stopwords are filtered on the raw token, before stemming.
        words = word_tokenize(review_text)
        stems = [
            self.stem(word) for word in words
            if word not in self.stopwords
        ]
        return stems

In [91]:
# Split into training and testing data
cleaner = SentenceCleaner()

# Per-file, per-class sample counts for each partition.
N_TRAIN_PER_CLASS = 400
N_TEST_PER_CLASS = 100

testing = { 'data':[], 'labels': [] }
training = { 'data':[], 'labels': [] }

for fname, observations in data.items():
    positive = []
    negative = []
    for observation in observations:
        if not observation.strip():
            continue  # a blank line carries no "<sentence>\t<label>" record

        # The label is the last tab-separated field. Splitting once from the
        # right keeps unpacking valid even if the sentence contains a tab.
        sentence, label = observation.rsplit('\t', 1)
        cleaned = cleaner.clean_sentence(sentence)

        if int(label) == 1:
            positive.append(cleaned)
        else:
            negative.append(cleaned)

    train_pos = positive[:N_TRAIN_PER_CLASS]
    train_neg = negative[:N_TRAIN_PER_CLASS]
    # Draw the test rows only from what remains after the training slice.
    # With at least N_TRAIN_PER_CLASS + N_TEST_PER_CLASS samples this equals
    # the last N_TEST_PER_CLASS rows; with fewer it prevents training rows
    # from leaking into the test set.
    test_pos = positive[N_TRAIN_PER_CLASS:][-N_TEST_PER_CLASS:]
    test_neg = negative[N_TRAIN_PER_CLASS:][-N_TEST_PER_CLASS:]

    training['data'] += train_pos + train_neg
    testing['data'] += test_pos + test_neg
    # Build labels from the actual slice lengths so data and labels stay
    # aligned even when a file has fewer samples than requested.
    training['labels'] += [1]*len(train_pos) + [0]*len(train_neg)
    testing['labels'] += [1]*len(test_pos) + [0]*len(test_neg)

It does everything the description said it would.	1
int label: 1
