# Installations
#### (run once)

In [None]:
#!pip install spacy
#!python -m spacy download en_core_web_sm

# Import modules

In [1]:
import spacy
import os
import sys
import re

# 1. Load data and spacy english language model

In [2]:
def read_data(file):
    '''
    Read each text file into a string
    '''
    f = open(file, 'r', encoding='utf-8')
    text = f.read()
    f.close()
    return text


def folder_list(path):
    '''
    Reads each text file in a folder and concatenates each file into a bigger string
    Parameter 'path' is the path of your local folder
    '''
    filelist = os.listdir(path)
    text = ''
    for infile in filelist:
        file = os.path.join(path, infile)
        text_data = read_data(file)
        text += '\n ' + text_data
    return text

In [43]:
# you might have to change path if on mac
path = "dummy_data/"

data = folder_list(path)

In [67]:
data



In [111]:
nlp = spacy.load('en_core_web_sm', disable=['parser','ner'])
nlp.max_length = 2**31

# 2. Preprocessing
## Part I. Basic methods

### a. Expanding contractions

In [112]:
sys.path.append('./helpers/')
from contraction_map import *

In [113]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):

    contractions_pattern = re.compile('({})'.format('|'.join(
        contraction_mapping.keys())), flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        #print(match)
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())
        #print(expanded_contraction)
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [114]:
# test
print(expand_contractions('''test: it's I'll I'm wouldn't it'll'''))

test: it is I will I am would not it will


### b. Remove non-alphabetical characters

In [115]:
def remove_non_alphabetical_characters(text):
    return re.sub(r'[^a-zA-Z\s]', '', text)

In [116]:
# test
print(remove_non_alphabetical_characters('t !@#$%^&*()_+-={}[]:"e;'
                                       '">? s ./\|<>,t1234567890`'))

t e s t


### c. Remove stopwords

In [117]:
#data_doc = [token for token in data_doc if not token.is_stop]
#print(data_doc)

def remove_stopwords(text):
    
    stopword_list = nlp.Defaults.stop_words
    filtered_tokens = [token for token in text.split() if token not in stopword_list]

    return ' '.join(filtered_tokens)

In [118]:
# test
print(remove_stopwords('this test is a test to test the test from a test'))

test test test test test


### d. Lemmatization

In [119]:
def lemmatize_text(text):
    return ' '.join([token.lemma_ for token in nlp(text)])

In [120]:
#test
print(lemmatize_text('test tests tested testing'))
print(lemmatize_text('try tries tried trying'))

test test test test
try try try try


# Part II. Preprocess all

In [124]:
def preprocess(text):
    text = text.lower()
    text = expand_contractions(text)
    text = remove_non_alphabetical_characters(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    return text

In [125]:
preprocessed_text = preprocess(data)

print(preprocessed_text)



In [159]:
# weird bug:
# in both cases they should not be printing anything. However in case 2 the apostrophe is different.
print('-'*20)
print('1.')
print(preprocess('''they'll''')) # correct

print('\n2.')
print(preprocess('''they’ll''')) # incorrect (copy & pasted from data) -> they'll should be removed (since they'll -> they will -> both are removed bc they are stopwords)
print('-'*20)

--------------------
1.


2.
they ll
--------------------
