In [28]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import chardet
import re
import os
import json
from pathlib import Path


def main(data_dir="/Users/janine/Documents/NLP/Project/code/text_data/"):
    """Load the raw book texts and run the preprocessing step on them.

    Args:
        data_dir: Directory containing the book text files. It is passed
            unchanged to ``read_file``. The default is the location that was
            previously hard-coded here, so ``main()`` behaves as before.

    Returns:
        A ``(data_before_token, processed_text_data)`` tuple, exactly as
        returned by ``preprocessing``.
    """
    # The directory used to be stored in a local called ``dir``, which hid the
    # builtin of the same name; it is now an overridable parameter instead.
    raw_text_data = read_file(data_dir)
    data_before_token, processed_text_data = preprocessing(raw_text_data)
    return data_before_token, processed_text_data



In [29]:
def read_file(dir):
    """Return the raw lines of every book, using a JSON cache when present.

    If ``Raw-Harry-Potter-Series.json`` does not exist in ``dir``, every entry
    of ``dir`` whose name contains ``.txt`` is read as bytes, each line is
    decoded with the encoding reported by ``chardet`` for that line, and the
    resulting mapping is written to the JSON cache. If the cache exists it is
    loaded and returned instead.

    Args:
        dir: Directory holding the ``.txt`` files and the JSON cache. A
            trailing path separator is accepted but not required.

    Returns:
        dict mapping a title (the file name up to its last dot) to the list of
        decoded lines of that file.
    """
    data_dir = Path(dir)
    json_file = data_dir / 'Raw-Harry-Potter-Series.json'

    if json_file.exists():
        # The cache is written with json.dump below, so it is parsed as JSON.
        # eval() was used here before: it executes whatever the file contains
        # and fails on JSON literals such as null/true/false.
        with open(json_file, 'r', encoding='utf-8') as cache:
            return json.load(cache)

    txt_pattern = re.compile(r'\.txt')
    title_pattern = re.compile(r'(.+)\..+$')

    book = {}
    for file_name in os.listdir(dir):
        if not txt_pattern.search(file_name):
            continue

        decoded_lines = []
        with open(data_dir / file_name, 'rb') as f_read:
            for raw_line in f_read:
                encoding = chardet.detect(raw_line)["encoding"]
                if encoding is None:
                    # chardet could not guess an encoding for this line;
                    # decoding with None would raise TypeError, so fall back
                    # to UTF-8 and substitute undecodable bytes.
                    decoded_lines.append(raw_line.decode('utf-8', errors='replace'))
                else:
                    decoded_lines.append(raw_line.decode(encoding))

        # Use everything before the last dot as the title; keep the full name
        # if the pattern does not match (e.g. a file literally named ".txt").
        title_match = title_pattern.findall(file_name)
        title = title_match[0] if title_match else file_name
        book[title] = decoded_lines

    # This branch only runs when the cache is absent, so 'w' creates it.
    # UTF-8 is explicit because ensure_ascii=False writes non-ASCII characters
    # verbatim, which a non-UTF-8 locale default could not encode.
    with open(json_file, 'w', encoding='utf-8') as outfile:
        json.dump(book, outfile, ensure_ascii=False)
        outfile.write('\n')

    return book

In [30]:
def preprocessing(data):
    """Clean every line of every book and split it into word tokens.

    For each line, newline characters are stripped from both ends and any run
    of leading ``¡`` characters is removed. The cleaned lines are then
    tokenized with ``word_tokenize``.

    Args:
        data: dict mapping a book title to its list of text lines.

    Returns:
        A ``(data_before_token, processed_data)`` tuple of dicts keyed by
        title: the first holds the cleaned lines, the second holds one token
        list per cleaned line.
    """
    leading_marks = r'^[¡]*'
    data_before_token = {}
    processed_data = {}

    for title, lines in data.items():
        cleaned_lines = [
            re.sub(leading_marks, '', line.strip('\n')) for line in lines
        ]
        data_before_token[title] = cleaned_lines

        # Tokenization. Stemming (PorterStemmer) and lemmatization
        # (WordNetLemmatizer) used to follow this step and are disabled.
        processed_data[title] = [word_tokenize(line) for line in cleaned_lines]

    return data_before_token, processed_data

In [31]:
# Run the pipeline and keep both results as notebook-level variables; the
# cells below read data_before_token and processed_text_data directly.
if __name__ == '__main__':
    data_before_token, processed_text_data = main()

In [111]:
# Sanity check on one book: the first two tokenized lines, then the first
# three cleaned (untokenized) lines.
print(processed_text_data['Harry Potter and the Sorcerer Stone'][:2])
list1=data_before_token['Harry Potter and the Sorcerer Stone'][:3]
print(list1)


[['CHAPTER', 'ONE'], ['THE', 'BOY', 'WHO', 'LIVED']]
['CHAPTER ONE', 'THE BOY WHO LIVED', "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense."]


In [112]:
# Get all the characters' names: load spaCy's small English pipeline, whose
# named entities (label PERSON) are collected in the next cell.
import spacy
from spacy import displacy
import en_core_web_sm
nlp = en_core_web_sm.load()
import json

In [102]:
# Name list for each book: title -> unique PERSON entity strings, in the order
# they are first encountered in the text.
all_name = {}

for key, text in data_before_token.items():
    name = []
    # Companion set for duplicate checks; testing membership on the growing
    # list rescanned it for every entity found.
    seen = set()

    for sentence in text:
        doc = nlp(sentence)
        for ent in doc.ents:
            if ent.label_ == 'PERSON' and ent.text not in seen:
                seen.add(ent.text)
                name.append(ent.text)

    all_name[key] = name
    

In [105]:
# Titles for which a name list was built.
all_name.keys()

dict_keys(['Harry Potter and The Chamber Of Secrets', 'Harry Potter and the Deathly Hallows', 'Harry Potter and the Goblet of Fire', 'Harry Potter and The Half-Blood Prince', 'Harry Potter and the Order of the Phoenix', 'Harry Potter and the Prisoner of Azkaban', 'Harry Potter and the Sorcerer Stone'])

In [106]:
# First episode: number of distinct PERSON strings collected for this book.
len(all_name['Harry Potter and the Sorcerer Stone'])

283

In [127]:
# Inspect the collected strings for one book. The list is raw NER output, so
# it also contains entries that are not character names.
print(all_name['Harry Potter and the Sorcerer Stone'])

['Dursley', 'Potter', 'Harry', 'Next Door', 'Jim McGuffin', 'Jim', 'Ted', 'Yorkshire', 'Dundee', 'Mysterious', 'Howard', 'Albus Dumbledore', 'the Put-Outer', 'McGonagall', 'Muggle', 'Dumbledore', 'Voldemort', 'Pomfrey', 'Lily', 'James Potter', 'James', 'Albus', 'Harry Potter', 'Bristol', 'Duddy', 'Dudley', 'Scotch', 'Uncle Vernon', 'Marge', 'Atta', 'Figg', 'Snowy', 'Paws', 'Majorca', 'Mummy', 'Piers Polkiss', 'Piers', 'MOTORCYCLES', "Uncle Vernon's", 'Boa Constrictor', 'Dennis', 'Malcolm', 'Gordon', 'Harry Hunting', 'Ickle Dudleykins', 'H. Potter', 'Little Whinging', 'Surrey', 'H.', 'Spray', 'Yeh', 'Grounds', 'Hogwarts', 'Keeper', 'Keys', 'Headmaster', 'Wizardry', 'Minerva McGonagall', 'Gallopin', 'Gorgons', 'Knew', 'Blimey', 'Nah', 'Don', 'Horribly', "Reckon Dumbledore's", 'McKinnons', 'Disappeared', "James Potter' s", 'Yer', 'ALBUS- DUMBLEDORE-', 'FRONT-', "Shouldn'ta", 'Tap', 'Mm', 'Gringotts', 'Run', 'Goblins', 'Fer Dumbledore', 'Miranda Goshawk', 'Adalbert Waffling', 'Drafts', 'Ar

In [124]:
# Save the name lists of all books into one JSON file.
# The file is opened with 'w' so that re-running this cell rewrites it; in
# append mode every run added another JSON document to the same file, which
# then could no longer be parsed as a single JSON value.
# UTF-8 is explicit because ensure_ascii=False writes non-ASCII text verbatim.
with open('name_list.json', 'w', encoding='utf-8') as outfile:
    json.dump(all_name, outfile, ensure_ascii=False)
    outfile.write('\n')

In [126]:
# Save the name list of each episode to its own text file,
# 'name_list-<title>.txt'. The list is written as its Python repr.
for key, content in all_name.items():
    # The with-block closes the file even if the write fails; UTF-8 is set
    # explicitly so non-ASCII names do not depend on the locale default.
    with open('name_list-' + str(key) + '.txt', 'w', encoding='utf-8') as file:
        file.write(str(content))
    