## PREPROCESSING TEXT

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
import matplotlib as plt
import seaborn as sns
import re
import string
import operator
from collections import defaultdict
import ast
import pickle
import warnings
warnings.filterwarnings("ignore")

import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# One-time download of the NLTK resources used by the functions imported above.
# Presumably: 'maxent_ne_chunker' + 'words' back ne_chunk, 'punkt' backs
# word_tokenize, 'averaged_perceptron_tagger' backs pos_tag and 'stopwords'
# backs stopwords.words -- confirm against the installed NLTK version.
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Ordered markup-stripping rules: (compiled pattern, replacement).
# Compiled once at import time because clean_text runs once per corpus line.
_MARKUP_RULES = [
    # text enclosed between two backslashes
    (re.compile(r"\\.*?\\"), ' '),
    # words ending with ':' (e.g. "Answer:"); "3:1" or "http://" are untouched
    (re.compile(r"\S+:(?!\S)"), ' '),
    # <...>
    (re.compile(r"<[^<>]*>"), ' '),
    # (...)
    (re.compile(r"\([^()]*\)"), ' '),
    # [[...]] must run before the single-bracket rule or it can never match
    (re.compile(r"\[\[.*?\]\]"), ' '),
    # [...]
    (re.compile(r"\[[^\[\]]*\]"), ' '),
    # {{...}} must run before the single-brace rule for the same reason
    (re.compile(r"\{\{.*?\}\}"), ' '),
    # {...}
    (re.compile(r"\{[^{}]*\}"), ' '),
    # URLs are removed before the hyphen/abbreviation rules so that those
    # rules cannot cut a URL in half and leave fragments behind
    (re.compile(r"https?://\S+|www\.\S+"), ''),
    # text enclosed between two hyphens
    (re.compile(r"-.*?-"), ' '),
    # newlines and tabs
    (re.compile(r"[\n\t]"), ' '),
    # single-letter abbreviations such as "a." or "a.b.c"
    (re.compile(r"\b\w\.(?!\w)|\b\w(\.\w)+\b"), ' '),
]

# Contractions (and common apostrophe-less spellings) and their expansions.
# "she's" / "she'll" are listed explicitly because they used to be expanded
# only by accident, through a substring match on "he's" / "he'll".
_CONTRACTIONS = {
    "he's": "he is",
    "she's": "she is",
    "there's": "there is",
    "We're": "We are",
    "That's": "That is",
    "won't": "will not",
    "they're": "they are",
    "Can't": "Cannot",
    "wasn't": "was not",
    "dont": "do not",
    "aren't": "are not",
    "isn't": "is not",
    "What's": "What is",
    "haven't": "have not",
    "hasn't": "has not",
    "There's": "There is",
    "He's": "He is",
    "It's": "It is",
    "You're": "You are",
    "I'M": "I am",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "i'm": "I am",
    "Im": "I am",
    "I'm": "I am",
    "Isn't": "is not",
    "Here's": "Here is",
    "you've": "you have",
    "youve": "you have",
    "we're": "we are",
    "what's": "what is",
    "couldn't": "could not",
    "we've": "we have",
    "its": "it is",
    "doesnt": "does not",
    "Its": "It is",
    "Heres": "Here is",
    "who's": "who is",
    "Ive": "I have",
    "y'all": "you all",
    "cant": "cannot",
    "would've": "would have",
    "it'll": "it will",
    "we'll": "we will",
    "wouldnt": "would not",
    "We've": "We have",
    "he'll": "he will",
    "she'll": "she will",
    "Y'all": "You all",
    "Weren't": "Were not",
    "Didn't": "Did not",
    "they'll": "they will",
    "they'd": "they would",
    "DON'T": "DO NOT",
    "Thats": "That is",
    "they've": "they have",
    "i'd": "I would",
    "should've": "should have",
    "Youre": "You are",
    "where's": "where is",
    "Dont": "Do not",
    "we'd": "we would",
    "i'll": "I will",
    "weren't": "were not",
    "They're": "They are",
    "Cant": "Cannot",
    "youll": "you will",
    "Id": "I would",
    "let's": "let us",
    "it's": "it is",
    "can't": "cannot",
    "don't": "do not",
    "you're": "you are",
    "i've": "I have",
    "that's": "that is",
    "doesn't": "does not",
    "didn't": "did not",
    "ain't": "am not",
    "you'll": "you will",
    "I've": "I have",
    "Don't": "do not",
    "I'll": "I will",
    "I'd": "I would",
    "Let's": "Let us",
    "you'd": "You would",
    "Ain't": "am not",
    "Haven't": "Have not",
    "Could've": "Could have",
    "wasnt": "was not",
}

# Single alternation anchored on word boundaries: without the anchors, keys
# such as "its", "cant", "Im" or "Id" rewrite the inside of ordinary words
# ("units" -> "unit is", "significant" -> "significannot", "Idaho" -> ...).
_CONTRACTION_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(key) for key in sorted(_CONTRACTIONS, key=len, reverse=True))
    + r")\b"
)

_LOWER_ALPHA_RE = re.compile(r"[a-z]+")


def _strip_markup(text):
    """Apply every rule in _MARKUP_RULES to *text*, in order."""
    for pattern, replacement in _MARKUP_RULES:
        text = pattern.sub(replacement, text)
    return text


def _normalize_underscores(text):
    """Trim underscores from token edges and drop 1-2 letter prefixes before '_'."""
    kept = []
    for token in text.split():
        token = token.strip('_')
        if not token:
            continue
        head, sep, tail = token.partition('_')
        if sep and len(head) <= 2:
            # Keep everything after the short prefix, not just the next
            # segment, so "ab_cd_ef" yields "cd_ef" instead of losing "ef".
            token = tail.lstrip('_')
        kept.append(token)
    return ' '.join(kept)


def _join_places_drop_persons(text):
    """Underscore-join multi-word GPE names and remove PERSON tokens from *text*."""
    places = []
    person_tokens = set()
    for chunk in ne_chunk(pos_tag(word_tokenize(text))):
        # Plain (word, tag) tuples are not entities; only subtrees carry a label.
        if not isinstance(chunk, nltk.Tree):
            continue
        leaves = [leaf[0] for leaf in chunk.leaves()]
        if chunk.label() == 'GPE':
            places.append(leaves)
        elif chunk.label() == 'PERSON':
            # Names are stored token by token so that a multi-word name such
            # as "john smith" can be matched against single corpus tokens.
            person_tokens.update(leaves)

    for leaves in places:
        # Single-word places need no joining; names containing anything but
        # a-z would be split again by the final character filter anyway.
        if len(leaves) < 2 or not all(_LOWER_ALPHA_RE.fullmatch(w) for w in leaves):
            continue
        pattern = r"(?<!\w)" + r"\s+".join(re.escape(w) for w in leaves) + r"(?!\w)"
        text = re.sub(pattern, '_'.join(leaves), text)

    # Surrounding punctuation is ignored for the comparison only ("smith," == "smith").
    return ' '.join(
        word for word in text.split()
        if word.strip(string.punctuation) not in person_tokens
    )


def clean_text(corpus):
    """Clean one piece of raw text and return it as a lowercase string.

    Steps, in order: strip markup (backslash/bracket/brace/hyphen enclosed
    spans, colon-terminated words, URLs, single-letter abbreviations),
    normalise underscores, expand contractions, lowercase, delete digits,
    underscore-join multi-word place names and drop person names found by
    NLTK's named-entity chunker, keep only words of three or more characters,
    and finally replace every run of characters outside ``a-z`` and ``_``
    with a single space.

    Parameters
    ----------
    corpus : str
        Raw text (data_cleaning passes one line of the corpus file at a time).

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives cleaning.
    """
    corpus = _strip_markup(corpus)
    corpus = _normalize_underscores(corpus)
    corpus = _CONTRACTION_RE.sub(lambda m: _CONTRACTIONS[m.group(0)], corpus)

    corpus = corpus.lower()

    # digits are deleted outright (not replaced by a space)
    corpus = re.sub(r'[0-9]', '', corpus)

    # NOTE(review): entity detection runs on already-lowercased text. NLTK's
    # chunker likely relies on capitalisation, so few entities will be found;
    # consider tagging before lowercasing -- confirm the intended behaviour.
    corpus = _join_places_drop_persons(corpus)

    # removing leading and trailing underscores from every word
    corpus = ' '.join(word.strip('_') for word in corpus.split())

    # keeping only words with at least 3 characters
    corpus = ' '.join(re.findall(r'\b\w{3,}\b', corpus))

    # everything except a-z and '_' becomes a single space
    return re.sub(r'[^a-z_]+', ' ', corpus)

In [None]:
def data_cleaning(text):
    """Clean a text file line by line and return the non-empty cleaned lines.

    Each line is passed through clean_text, tokenized with word_tokenize,
    stripped of English stop words and re-joined with single spaces. Lines
    that end up empty (either after cleaning or after stop-word removal) are
    skipped, so the result never contains empty strings.

    Parameters
    ----------
    text : str
        Path of the corpus file. It is read as ASCII with errors ignored,
        which silently drops every non-ASCII character.

    Returns
    -------
    list of str
        One cleaned, stop-word-free string per retained input line.
    """
    stop_words = set(stopwords.words('english'))
    corpus = []

    # open text corpus as ascii to avoid all the Unicode characters
    with open(text, 'r', encoding='ascii', errors='ignore') as f:
        for line in f:
            cleaned = clean_text(line)
            # clean_text returns '' (not ' ') when nothing is left, so test
            # for "only whitespace" instead of comparing against ' '.
            if not cleaned.strip():
                continue

            tokens = [word for word in word_tokenize(cleaned)
                      if word not in stop_words]
            if not tokens:
                continue
            corpus.append(' '.join(tokens))

    return corpus

In [None]:
# Build the two cleaned corpora; each call returns a list of cleaned strings.
# Kept as two separate statements so corpus_arc stays assigned even if the
# second call fails.
corpus_arc = data_cleaning('/content/ARC-V1-Feb2018-2/ARC_Corpus.txt')
corpus_aristo = data_cleaning('/content/Aristo-Mini-Corpus-Dec2016/Aristo-Mini-Corpus-Dec2016.txt')

In [None]:
# Persist both cleaned corpora as a single (corpus_arc, corpus_aristo) tuple.
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way.
with open('/content/drive/MyDrive/my assignments/33. A12 Reasoning Challenge- Self case study 2/corpus_arc_aristo.pkl', 'wb') as pkl_file:
    pickle.dump((corpus_arc, corpus_aristo), pkl_file)

In [None]:
# Reload the cleaned corpora saved above.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('/content/drive/MyDrive/my assignments/33. A12 Reasoning Challenge- Self case study 2/corpus_arc_aristo.pkl', 'rb') as pkl_file:
    corpus_arc, corpus_aristo = pickle.load(pkl_file)