<a href="https://colab.research.google.com/github/jacobpad/My_Own_NLP_Stuff/blob/master/Harry_Potter/harry_potter_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHECKING OUT HARRY POTTER BOOK

## Starting Timer

In [1]:
import time 
# Time the running of everything
start_of_notebook_time = time.time()

## Imports

In [2]:
# Installations
# import sys
# if 'google.colab' in sys.modules:
#     !python -m spacy download en_core_web_lg
#     !pip install gensim

In [3]:
import pandas as pd
from urllib import request
import re

#Natural Language Processing (NLP)
import string
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)

In [4]:
# Establish the English core web
# Make sure to restart the runtime after running installations and libraries
nlp = spacy.load('en_core_web_lg')

In [5]:
# Set the URL
url_The_Philosophers_Stone = "https://raw.githubusercontent.com/jacobpad/Deep-Learning/master/GPT2-HarryPotter-Training/books/Book%201%20-%20The%20Philosopher's%20Stone.txt"

# Reading in original text - it's a .txt
text_The_Philosophers_Stone = request.urlopen(url_The_Philosophers_Stone)

# Empty list for the text of the book
corpus = []

for line in text_The_Philosophers_Stone:

  # Decoding as utf-8
  decode_line = line.decode("utf-8")

  # Remove any new line characters
  decode_line = decode_line.strip('\n')

  # Strip away any extra spaces
  decode_line = decode_line.strip()

  # Remove the end of page lines
  # https://stackoverflow.com/questions/32015356/find-and-remove-a-string-starting-and-ending-with-a-specific-substring-in-python
  decode_line = re.sub(r'Page.+Rowling', '', decode_line)

  # Strip away any blank lines
  decode_line = [line for line in decode_line.split('\n') if line.strip() != '']
  
  # Add them all together - Bibbidi-bobbidi-boo
  corpus.append(decode_line)

# Remove empty lists within the corpus list
corpus = [x for x in corpus if x]

In [6]:
# Make it a DataFrame
df = pd.DataFrame(corpus) 

# Rename the default column
df.rename(columns={0 : 'original_text'}, inplace=True)

# Remove rows that are strings of "/"
df = df[~df['original_text'].str.contains("/")]

# View df
df

Unnamed: 0,original_text
1,THE BOY WHO LIVED
2,"Mr. and Mrs. Dursley, of number four, Privet D..."
3,were proud to say that they were perfectly nor...
4,thank you very much. They were the last people...
5,expect to be involved in anything strange or
...,...
9792,shocked that anyone could be so unpleasant.
9793,"“Oh, I will,” said Harry, and they were surpri..."
9794,the grin that was spreading over his face. “ T...
9795,know we’re not allowed to use magic at home. I’m


## Tokenize For Removing Punctuation

In [7]:
# Establish the English core web
nlp = spacy.load('en_core_web_lg')


# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed
    """

    tokens = re.sub(r'[^a-zA-Z 0-9]', '', text)

    # Remove punctuation
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', tokens)

    # Make text lowercase and split it
    tokens = tokens.lower().split()
  
    return tokens

# Apply tokenizer
df['tokens'] = df['original_text'].apply(tokenize)

# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

# View those tokens
df

Unnamed: 0,original_text,tokens,tokens_back_to_text
1,THE BOY WHO LIVED,"[the, boy, who, lived]",the boy who lived
2,"Mr. and Mrs. Dursley, of number four, Privet D...","[mr, and, mrs, dursley, of, number, four, priv...",mr and mrs dursley of number four privet drive
3,were proud to say that they were perfectly nor...,"[were, proud, to, say, that, they, were, perfe...",were proud to say that they were perfectly normal
4,thank you very much. They were the last people...,"[thank, you, very, much, they, were, the, last...",thank you very much they were the last people ...
5,expect to be involved in anything strange or,"[expect, to, be, involved, in, anything, stran...",expect to be involved in anything strange or
...,...,...,...
9792,shocked that anyone could be so unpleasant.,"[shocked, that, anyone, could, be, so, unpleas...",shocked that anyone could be so unpleasant
9793,"“Oh, I will,” said Harry, and they were surpri...","[oh, i, will, said, harry, and, they, were, su...",oh i will said harry and they were surprised at
9794,the grin that was spreading over his face. “ T...,"[the, grin, that, was, spreading, over, his, f...",the grin that was spreading over his face they...
9795,know we’re not allowed to use magic at home. I’m,"[know, were, not, allowed, to, use, magic, at,...",know were not allowed to use magic at home im


## Tokenize For Removing STOPWORDS

In [8]:
# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


tokens = []

for doc in tokenizer.pipe(df['tokens_back_to_text'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
          doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

# View df
df

Unnamed: 0,original_text,tokens,tokens_back_to_text
1,THE BOY WHO LIVED,"[boy, lived]",boy lived
2,"Mr. and Mrs. Dursley, of number four, Privet D...","[mr, mrs, dursley, number, privet, drive]",mr mrs dursley number privet drive
3,were proud to say that they were perfectly nor...,"[proud, perfectly, normal]",proud perfectly normal
4,thank you very much. They were the last people...,"[thank, people, youd]",thank people youd
5,expect to be involved in anything strange or,"[expect, involved, strange]",expect involved strange
...,...,...,...
9792,shocked that anyone could be so unpleasant.,"[shocked, unpleasant]",shocked unpleasant
9793,"“Oh, I will,” said Harry, and they were surpri...","[oh, said, harry, surprised]",oh said harry surprised
9794,the grin that was spreading over his face. “ T...,"[grin, spreading, face, dont]",grin spreading face dont
9795,know we’re not allowed to use magic at home. I’m,"[know, allowed, use, magic, home]",know allowed use magic home


## Lemmatization

In [9]:
# Timing Start
lemma_start_time = time.time()

In [10]:
def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

# Make lemmas a string again
df['lemmas_text'] = [' '.join(map(str, l)) for l in df['lemmas']]

# Drop and rename coolumns
df.drop(columns={'tokens','tokens_back_to_text'}, axis=1, inplace=True)
df.rename(columns={'lemas':'lemma_tokens', 'lemmas_back_to_text':'lemma_text'}, inplace=True)

# View df
df

Unnamed: 0,original_text,lemmas,lemmas_text
1,THE BOY WHO LIVED,"[boy, live]",boy live
2,"Mr. and Mrs. Dursley, of number four, Privet D...","[mr, mrs, dursley, number, privet, drive]",mr mrs dursley number privet drive
3,were proud to say that they were perfectly nor...,"[proud, perfectly, normal]",proud perfectly normal
4,thank you very much. They were the last people...,"[thank, people, would]",thank people would
5,expect to be involved in anything strange or,"[expect, involve, strange]",expect involve strange
...,...,...,...
9792,shocked that anyone could be so unpleasant.,"[shock, unpleasant]",shock unpleasant
9793,"“Oh, I will,” said Harry, and they were surpri...","[oh, say, harry, surprise]",oh say harry surprise
9794,the grin that was spreading over his face. “ T...,"[grin, spread, face, not]",grin spread face not
9795,know we’re not allowed to use magic at home. I’m,"[know, allow, use, magic, home]",know allow use magic home


In [11]:
# Timing End
lemma_end_time = time.time()

# Lemma total time
lemma_total = lemma_end_time - lemma_start_time

#Printing Lemmetization Time
print('Lemmatization took {:.2f} seconds to finish'.format(lemma_total))

Lemmatization took 85.38 seconds to finish


## Notebook Ending Timer

In [12]:
end_of_notebook_time = time.time()
total_notebook_time_seconds = end_of_notebook_time - start_of_notebook_time
total_running_time_of_notebook_minutes = (end_of_notebook_time - start_of_notebook_time) / 60

print('Duration for the entire notebook to run: {:.2f} seconds.'.format(total_notebook_time_seconds))
print('Which is {:.2f} minutes.'.format(total_running_time_of_notebook_minutes))

Duration for the entire notebook to run: 113.22 seconds.
Which is 1.89 minutes.
