# Imports and function definitions

In [9]:
import re
import sys
import neuralcoref
import spacy 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import inflect 
from nltk.stem import WordNetLemmatizer

def read_text(filename):
    """Return the full contents of the text file ``filename`` as one string.

    The file is opened in text mode with the default encoding, and line
    breaks are kept exactly as they are read.
    """
    with open(filename) as handle:
        contents = handle.read()
    return contents

def write_text(text, filename):
    """Write each item of ``text`` to ``filename``, one item per line.

    ``text`` is iterated and a newline is appended after every item; the
    target file is created or truncated.
    """
    with open(filename, 'w') as out:
        out.writelines(entry + '\n' for entry in text)

In [13]:

# Shared NLP resources, created once at module level and reused by the
# helper functions defined below.
# inflect engine: convert_number() calls p.number_to_words().
p = inflect.engine() 
# spaCy pipeline: coreference_resolution() runs text through `nlp`.
nlp = spacy.load('en')
# Register neuralcoref on the pipeline; coreference_resolution() reads
# `._.coref_resolved` from the documents this pipeline produces.
neuralcoref.add_to_pipe(nlp)
# Stemmer and lemmatizer used by preprocess() when stem / lemmatize are set.
stemmer= PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()


# convert number into words 
def convert_number(text):
    """Spell out every all-digit token of ``text`` in words.

    ``text`` is split on whitespace; tokens for which ``str.isdigit()`` is
    true are passed to the module-level inflect engine ``p``, all other
    tokens are kept unchanged. The tokens are re-joined with single spaces,
    so runs of whitespace in the input are collapsed.
    """
    converted = [
        p.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    ]
    return ' '.join(converted)

def coreference_resolution(text):
    """Run ``text`` through the module-level ``nlp`` pipeline and return the
    document's ``coref_resolved`` extension attribute.
    """
    doc = nlp(text)
    return doc._.coref_resolved

# Characters kept by preprocess(); every other character is dropped.
_ALLOWED_CHARS = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,!?.[]{}\'’\"@#%^&*()-_=+\\;:<>/1234567890 '
)


def preprocess(sentence, stop_words=(), lemmatize=False, stem=False):
    """Normalise one sentence and return it as a single-space-joined string.

    Steps, in order:
      1. every ',' is replaced by ' and ';
      2. in every whitespace-separated token that does not start with
         'http', every '/' is replaced by ' or ' (links keep their slashes);
      3. characters outside the allowed set (ASCII letters, digits, space
         and a fixed list of punctuation) are removed;
      4. words found in ``stop_words`` are dropped (exact, case-sensitive
         match);
      5. if ``stem`` is true each word is passed to the module-level
         ``stemmer``; if ``lemmatize`` is true each word is then passed to
         the module-level ``wordnet_lemmatizer``.

    Args:
        sentence: the text to normalise.
        stop_words: collection of words to remove; ``None`` or an empty
            collection removes nothing.
        lemmatize: apply ``wordnet_lemmatizer.lemmatize`` to every word.
        stem: apply ``stemmer.stem`` to every word (before lemmatizing).

    Returns:
        The processed words joined by single spaces ('' if nothing is left).
    """
    # Treat None like "no stop words" instead of failing on the `in` test.
    stop_words = stop_words or ()

    tokens = sentence.replace(',', ' and ').split()
    # Tokens that look like links keep their '/' so the URL stays intact.
    expanded = ' '.join(
        token if token.startswith('http') else token.replace('/', ' or ')
        for token in tokens
    )
    filtered = ''.join(char for char in expanded if char in _ALLOWED_CHARS)

    words = [word for word in filtered.split() if word not in stop_words]
    if stem:
        words = [stemmer.stem(word) for word in words]
    if lemmatize:
        words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Read and preprocess text

In [14]:
# read raw text
raw_text = read_text('../data/handbook.txt')
# coreference resolution
resolved_text = coreference_resolution(raw_text)
# split into one entry per line; splitting on '\n' directly (instead of going
# through a '$' placeholder) keeps a literal '$' in the text from cutting a
# line in two
sentences = resolved_text.split('\n')

# preprocess every line:
#   - replace ',' with 'and' and replace '/' with 'or' except in website links
#   - remove uncommon special characters
#   - remove stopwords
# (lower-casing, stemming and lemmatisation are not applied here: preprocess()
# is called with its stem / lemmatize defaults)
stopwords = ['a', 'an', 'the', 'all', 'any']
preprocessed_sentences = [
    preprocess(sentence, stop_words=stopwords) for sentence in sentences
]
sentences = preprocessed_sentences

# drop lines that are empty after preprocessing
sentences = [sentence for sentence in sentences if sentence]

for i, sentence in enumerate(sentences):
    print(i, sentence)

0 STUDENT
1 HANDBOOK 2017
2 Foreword
3 This handbook has been put together to provide information which will be useful during your stay here you may consider This handbook as user manual for IIIT Delhi. This handbook also provides guidelines for expected conduct and behaviour of students within campus and both academic and non- academic. But it is not rule-book you will still need to refer to official documents available on our website and other places. Please read disclaimer on last page.
4 The information governing academics is available in other places. UG or PG regulations should be used for rules governing your programs. rule-book is more about student life in campus extra-curricular activities and sports and hostels etc. Please make use of facilities and opportunities to develop your personality and bond with other students.
5 Life here is lot different from school you have more freedom and will be treated as adult. We expect you to exercise more freedom with sense of responsibil

In [15]:
# Save the preprocessed sentences to disk, one sentence per line.
write_text(sentences, '../data/handbook_preprocessed_text.txt')