In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/midou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
from os import listdir
from collections import Counter
import string
from nltk.corpus import stopwords

def add_doc_to_vocab(filename, vocab):
    '''
    add document into vocab Counter
    '''
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    vocab.update(tokens)
    
def doc_to_line(filename, vocab):
    '''
    process document + clean + filter and returns it as a line
    '''
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)
    
def clean_doc(doc):
    '''
    removes punctuation + everything in lowercase + removes '<br/>'
    + only consider english words
    '''
    stop_words = set(stopwords.words('english'))
    words = doc.split()
    table = str.maketrans('', '', string.punctuation)
    # remove punctuation
    words = [w.translate(table) for w in words]
    # replaces upper case by lower case
    words = [word.lower() for word in words]
    # remove words that are not alphabetic
    words = [word for word in words if word.isalpha()]
    # remove 'br'
    words = [word for word in words if word not in ['br']]  
    # removes non-english words
    words = [w for w in words if not w in stop_words]
    # filter out short word
    words = [word for word in words if len(word) > 1]
    return words    

def load_doc(filename):
    '''
    load doc into memory
    '''
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

def process_docs(directory, vocab):
    '''
    load each doc in directory
    '''
    lines = list()
    for filename in listdir(directory):
        # full file path name
        path = directory + '/' + filename
        line = doc_to_line(path, vocab)
        lines.append(line)
    return lines

# load all docs in a directory
def process_docs_to_vocab(directory, vocab):
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip files that do not have the right extensionab
        if not filename.endswith(".txt"):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)
        
def save_list(lines, filename):
    '''
    saves tokens to file
    '''
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
    
def remove_min_occ(vocab, occ, filename):
    tokens = [k for k, c in vocab.items() if c>=min_occurance]
    save_list(tokens, filename)

In [54]:
vocab = Counter()
process_docs_to_vocab('aclImdb/train/neg', vocab)
process_docs_to_vocab('aclImdb/train/pos', vocab)
print(len(vocab))
print(vocab.most_common(50))

117232
[('movie', 41807), ('film', 37455), ('one', 25508), ('like', 19641), ('good', 14555), ('even', 12503), ('would', 12135), ('time', 11779), ('really', 11663), ('story', 11454), ('see', 11223), ('much', 9584), ('well', 9372), ('get', 9212), ('also', 9073), ('people', 8951), ('bad', 8912), ('great', 8894), ('first', 8857), ('dont', 8473), ('made', 7990), ('movies', 7788), ('make', 7729), ('films', 7727), ('could', 7713), ('way', 7685), ('characters', 7290), ('think', 7229), ('watch', 6777), ('two', 6643), ('many', 6640), ('seen', 6529), ('character', 6514), ('never', 6425), ('little', 6387), ('acting', 6291), ('plot', 6275), ('best', 6263), ('love', 6214), ('know', 6038), ('life', 5988), ('show', 5967), ('ever', 5804), ('still', 5561), ('better', 5547), ('end', 5361), ('say', 5331), ('man', 5211), ('scene', 5169), ('scenes', 5063)]


In [56]:
remove_min_occ(vocab, 5, 'vocab.txt')

In [58]:
#load vocab
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [None]:
# prepare negative reviews
negative_lines = process_docs('aclImdb/train/neg', vocab)
save_list(negative_lines, 'negative.txt')
# prepare positive reviews
positive_lines = process_docs('aclImdb/train/pos', vocab)
save_list(positive_lines, 'positive.txt')

In [59]:
# prepare test negative reviews
negative_lines_test = process_docs('aclImdb/test/neg', vocab)
save_list(negative_lines_test, 'negative_test.txt')
# prepare test positive reviews
positive_lines_test = process_docs('aclImdb/test/pos', vocab)
save_list(positive_lines_test, 'positive_test.txt')