# Installations
#### (run once)

In [1]:
#Note: make sure you're using python 3.9 or below - 3.10 doesn't have wheels yet for spacy
#!pip install spacy
#!python -m spacy download en_core_web_sm

#!pip install beautifulsoup4

# Import modules

In [2]:
import os
import sys
import re

import spacy
from bs4 import BeautifulSoup

# 1. Load data and spaCy English language model

In [66]:
def read_data(file):
    '''
    Read a text file into a single string.

    Parameter 'file' is the path of the file to read.
    The file is decoded as UTF-8 with errors='ignore', so bytes that are not
    valid UTF-8 are silently dropped instead of raising UnicodeDecodeError.
    Returns the full content of the file as a str.
    '''
    # `with` guarantees the handle is closed even if read() raises
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()

def folder_list(path):
    '''
    Reads each text file in a folder and concatenates each file into a bigger string
    Parameter 'path' is the path of your local folder

    Files are visited in sorted name order so the result is the same on every
    run and platform (os.listdir itself returns entries in arbitrary order).
    Entries that are not regular files (e.g. sub-folders) are skipped.
    Every file's content is prefixed with '\\n ', so an empty folder yields ''.
    '''
    pieces = []
    for infile in sorted(os.listdir(path)):
        file = os.path.join(path, infile)
        # a sub-directory cannot be opened as a text file
        if not os.path.isfile(file):
            continue
        pieces.append('\n ' + read_data(file))
    # a single join avoids re-copying the growing string once per file
    return ''.join(pieces)

In [72]:
# Choose ONE input source for `data`.
# The active line reads a single file, given relative to the notebook's working directory.
# To load every file of a folder instead, uncomment `path` and the `folder_list` line below
# (you might have to change path if on mac).
#path = "dummy_data/"

data = read_data('articles_3_sources.txt')
#data = read_data('out.txt')
#data = folder_list(path)

In [73]:
# Preview only: echoing the whole corpus would flood the notebook output
data[:1000]



In [21]:
# Load the spaCy pipeline 'en_core_web_sm' (the model downloaded in the Installations cell).
nlp = spacy.load('en_core_web_sm')
# Raise the pipeline's max_length to 2**31, presumably so the whole corpus can be
# passed to nlp() as one text.
# NOTE(review): confirm memory is sufficient for inputs of that size.
nlp.max_length = 2**31

# 2. Preprocessing
## Part I. Basic methods

### a. Remove HTML tags

In [22]:
def remove_HTML(text):
    '''
    Parse 'text' with BeautifulSoup's "html.parser" and return only its text
    content, i.e. the input with the markup removed.
    '''
    parsed = BeautifulSoup(text, "html.parser")
    return parsed.text

In [8]:
# Sanity check: all tags (including the attribute-only <img>) are dropped, only the text remains.
# Expected output: "text and some more text"
print(remove_HTML('<!DOCTYPE html><html><body><h1>text and </h1><p>some more text</p><img src="w3schools.jpg" alt="W3Schools.com" width="104" height="142"></body></html>'))

text and some more text


### b. Expanding contractions

In [23]:
# Make ./helpers/ (relative to the working directory) importable;
# presumably this is where contraction_map.py lives.
sys.path.append('./helpers/')
# Import only the name this notebook uses instead of `import *`, so it is
# clear where CONTRACTION_MAP comes from and nothing else leaks into the namespace.
from contraction_map import CONTRACTION_MAP

In [24]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    '''
    Replace contractions in 'text' by their expanded form.

    Parameter 'text' is the string to process.
    Parameter 'contraction_mapping' is a dict {contraction: expansion};
    matching is case-insensitive and the first character of the matched text
    is kept, so the original capitalisation of that character survives.

    Typographic apostrophes (e.g. U+2019 in "they’ll") are folded to the
    ASCII apostrophe before matching, so text copy & pasted from articles is
    expanded exactly like hand-typed text.
    Returns the expanded text with every remaining apostrophe removed.
    '''
    # Curly/typographic apostrophes would never match keys written with "'"
    curly_apostrophes = re.compile('[\u2018\u2019\u201b\u02bc\uff07]')
    text = curly_apostrophes.sub("'", text)

    # Apply the same folding to the keys; an empty key would match everywhere, so drop it
    exact = {curly_apostrophes.sub("'", key): value
             for key, value in contraction_mapping.items() if key}
    # Case-folded fallback for matches such as "I'LL" when only "I'll" is a key
    folded = {}
    for key, value in exact.items():
        folded.setdefault(key.lower(), value)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (exact.get(match)
                                or exact.get(match.lower())
                                or folded.get(match.lower()))
        if not expanded_contraction:
            # no usable expansion: leave the text untouched instead of raising
            return match
        # keep the first character exactly as it was written in the text
        return match[0] + expanded_contraction[1:]

    if exact:
        # Longest keys first so "he'd've" is not consumed as "he'd" + "'ve";
        # re.escape keeps keys literal even if they contain regex metacharacters.
        alternatives = sorted(exact, key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in alternatives)),
            flags=re.IGNORECASE | re.DOTALL)
        text = contractions_pattern.sub(expand_match, text)

    # drop whatever apostrophes are left (possessives, quotes, ...)
    return text.replace("'", "")

In [25]:
# Sanity check: every contraction is replaced by its expansion.
# Expected output: "test: it is I will I am would not it will"
print(expand_contractions('''test: it's I'll I'm wouldn't it'll'''))

test: it is I will I am would not it will


### c. Remove non-alphabetical characters

In [26]:
def remove_non_alphabetical_characters(text):
    '''
    Delete every character that is not an ASCII letter, a digit or whitespace.
    Note that, despite the name, digits are kept.
    '''
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)

In [27]:
# Sanity check: punctuation and symbols are removed, letters, digits and spaces stay.
# Expected output: "t e s t 1234567890"
# The backslash is written as '\\' because '\|' is not a valid escape sequence
# in a normal string literal and triggers a warning on recent Python versions.
print(remove_non_alphabetical_characters('t !@#$%^&*()_+-={}[]:"e;'
                                       '">? s ./\\|<>,t 1234567890`'))

t e s t 1234567890


### d. Remove stopwords

In [28]:
def remove_stopwords(text):
    '''
    Remove stopwords from 'text'.

    The text is split on whitespace and every token found in the stopword set
    of the loaded spaCy model (nlp.Defaults.stop_words) is dropped.
    Tokens are compared in lowercase so capitalised stopwords such as "The"
    are removed as well; kept tokens retain their original casing.
    Returns the remaining tokens joined by single spaces.
    '''
    stopword_list = nlp.Defaults.stop_words
    filtered_tokens = [token for token in text.split()
                       if token.lower() not in stopword_list]

    return ' '.join(filtered_tokens)

In [29]:
# Sanity check: only the non-stopword "test" survives.
# Expected output: "test test test test test"
print(remove_stopwords('this test is a test to test the test from a test'))

test test test test test


### e. Lemmatization

In [30]:
def lemmatize_text(text):
    '''
    Run 'text' through the spaCy pipeline and return the lemma of every
    token, joined by single spaces.
    '''
    doc = nlp(text)
    lemmas = [tok.lemma_ for tok in doc]
    return ' '.join(lemmas)

In [31]:
# Sanity check: inflected forms are reduced to their lemma.
# Expected output: "test test test test" and "try try try try"
print(lemmatize_text('test tests tested testing'))
print(lemmatize_text('try tries tried trying'))

test test test test
try try try try


## Part II. Preprocess all

In [36]:
def preprocess(text):
    '''
    Apply the complete preprocessing chain to 'text' and return the result.

    The text is lowercased first, then passed through each step in order:
    HTML removal, contraction expansion, removal of non-alphanumeric
    characters, stopword removal and lemmatization.
    '''
    steps = (
        remove_HTML,
        expand_contractions,
        remove_non_alphabetical_characters,
        remove_stopwords,
        lemmatize_text,
    )
    cleaned = text.lower()
    for step in steps:
        cleaned = step(cleaned)
    return cleaned

In [69]:
preprocessed_text = preprocess(data)

# Preview only: printing the whole corpus would flood the notebook output;
# the complete result is written to disk in the last cell.
print(preprocessed_text[:1000])



In [None]:
# Apostrophe check:
# both calls should print an empty line ("they'll" -> "they will", and both words are stopwords).
# Case 1 uses the ASCII apostrophe ('), case 2 the typographic apostrophe (’, U+2019) copy & pasted from the data.
# If case 2 prints leftover text, the typographic apostrophe was not recognised as part of a
# contraction - presumably because the contraction map keys are written with the ASCII apostrophe.
print('-'*20)
print('1.')
print(preprocess('''they'll''')) # ASCII apostrophe: prints nothing, as expected

print('\n2.')
print(preprocess('''they’ll''')) # typographic apostrophe (copy & pasted from data): should print nothing either (they'll -> they will -> both are removed bc they are stopwords)
print('-'*20)

# 3. Write to file

In [70]:
# Write the preprocessed corpus to disk.
# `with` closes the file even if write() fails; the encoding is explicit so the
# output does not depend on the platform default (the input is read as UTF-8).
with open(r'preprocessed_out.txt', 'w', encoding='utf-8') as file:
    file.write(preprocessed_text)