<a href="https://colab.research.google.com/github/kgovindaraju123/AIML/blob/master/NLPSimpsons.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!python -m spacy download en_core_web_lg
!python -m spacy download en

In [0]:
import re 
import pandas as pd
from time import time
from collections import defaultdict
import spacy
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s:%(message)s",datefmt='%H:%M:%S',level=logging.INFO)

In [2]:
df= pd.read_csv("simpsons_dataset.csv")
df.head(3)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...


In [3]:
#Verify Null
df.isnull().sum()
df.shape

(89058, 2)

In [5]:
#Remove Null
df=df.dropna().reset_index(drop=True)
df.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [6]:
#Check again shape
df.shape

(74574, 2)

In [0]:
#Preprocessing part cleaning, Bigrams and most Frequent Words
nlp=spacy.load('en_core_web_sm',disable=['ner','parser'])
def cleaning(doc):
  txt=[token.lemma_ for token in doc if not token.is_stop]
  if len(txt)>2:
    return ' '.join(txt)

In [0]:
#Remove non alphabetic characters
brief_cleaning=(re.sub("[^A-Za-z']+", ' ',str(row)).lower() for row in df['spoken_words'])


In [10]:
type(brief_cleaning)

generator

In [11]:
#Define the pipeline
t=time()
txt=[cleaning(doc) for doc in nlp.pipe(brief_cleaning,batch_size=5000,n_threads=-1)]
print('Time to clean up everything:{}mins'.format(round((time()-t)/60,2)))

Time to clean up everything:0.89mins


In [13]:
type(txt)

list

In [14]:
#DF to remove missing values and duplicates
df_clean = pd.DataFrame({'Clean':txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(48668, 1)

In [15]:
#Bigrams - Gensim phases package automatically detect common phrases (bigrams=two word sentences) from a list of Sentences
#Main reason to use bigram is to catch words like "Mr_burns" or "bart_simpson"
from gensim.models.phrases import Phrases,Phraser

INFO - 06:51:07:'pattern' package not found; tag filters are not available for English


In [0]:
#Phrases takes a list of words as input
sent=[row.split() for row in df_clean['Clean']]


In [0]:
#out put the sentences in DataFrame
sent

In [18]:
#creates the relvant phrases from list of sentences
#progress per - write logs for every n sentences
phrases=Phrases(sent,min_count=30,progress_per=10000)


INFO - 06:51:24:collecting all words and their counts
INFO - 06:51:24:PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 06:51:24:PROGRESS: at sentence #10000, processed 63561 words and 52712 word types
INFO - 06:51:24:PROGRESS: at sentence #20000, processed 130943 words and 99633 word types
INFO - 06:51:24:PROGRESS: at sentence #30000, processed 192972 words and 138210 word types
INFO - 06:51:24:PROGRESS: at sentence #40000, processed 249843 words and 172231 word types
INFO - 06:51:24:collected 203185 word types from a corpus of 302864 words (unigram + bigrams) and 48668 sentences
INFO - 06:51:24:using 203185 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>


In [19]:
type(phrases)

gensim.models.phrases.Phrases

In [20]:
#Goal of phraser() is to cut down memeory consumption of Phrases()
bigram = Phraser(phrases) #we cant see as it is a black box process

INFO - 06:52:05:source_vocab length 203185
INFO - 06:52:07:Phraser built with 67 phrasegrams


In [22]:
#transform the corpus based on the bigrams detected
sentences=bigram[sent]
sentences

<gensim.interfaces.TransformedCorpus at 0x7fe047130cf8>

In [23]:
#Example for Gensim Model Phrases
documents = ["Mr_Raju_Govind""the mayor of new york was there" "machine learning can be useful sometimes"," new york mayor was present"]
sentence_stream=[doc.split(" ")for doc in documents]
print(sentence_stream)
bigram=Phrases(sentence_stream,min_count=1,threshold=2)

INFO - 06:52:25:collecting all words and their counts
INFO - 06:52:25:PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 06:52:25:collected 29 word types from a corpus of 18 words (unigram + bigrams) and 2 sentences
INFO - 06:52:25:using 29 counts as vocab in Phrases<0 vocab, min_count=1, threshold=2, max_vocab_size=40000000>


[['Mr_Raju_Govindthe', 'mayor', 'of', 'new', 'york', 'was', 'theremachine', 'learning', 'can', 'be', 'useful', 'sometimes'], ['', 'new', 'york', 'mayor', 'was', 'present']]


In [24]:
#threshold - the minimum score for a bigram to be taken into account
sents=[u'Mr_Raju_Govind',u'the',u'mayor',u'of',u'the',u'new',u'york',u'was',u'there']
print(bigram[sents])

['Mr_Raju_Govind', 'the', 'mayor', 'of', 'the', 'new_york', 'was', 'there']




In [0]:
#Most Frequent words sanity check of effectiveness of Lemmatization
word_freq =defaultdict(int)
for sent in sentences:
  for i in sent:
    word_freq[i]+=1
    len(word_freq)

In [0]:
word_freq

In [0]:
#display 10 words in sort
sorted(word_freq,key=word_freq.get,reverse=True)[:11]

['e', 'r', 'n', 'w', 'y', 'o', 'a', 's', '_', 'k', 'm']

In [0]:
End of preprocessing step

In [0]:
#Training th model GENSIM word2vec implementation
import multiprocessing
from gensim.models import Word2Vec

In [31]:
#to process the data speed, let us define cores
cores=multiprocessing.cpu_count() # count the noof cores in a computer)
cores

2

In [0]:
#Define the Word2vec model
w2v_model=Word2Vec(min_count=20,window=2,size=300,sample=6e-5,alpha=0.03,min_alpha=0.0007,negative=20,workers=cores-1)
#min_count=ignoresall words with total frequency lower than value
#window = max distance between the current and predicted word within a sentence
#size =Dimensionality of feature vectors
#alpha = initial learning rate
#negative - negative sampleing will b eused how many noisy words are to be drawn
#worker - use threads to train the model

In [33]:
#Build the vocabulary table
t=time()
w2v_model.build_vocab(sentences,progress_per=10000)
print('Time to build vocabulary:{}mins'.format(round((time()-t)/60,2)))

INFO - 06:54:29:collecting all words and their counts
INFO - 06:54:29:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 06:54:30:PROGRESS: at sentence #10000, processed 62066 words, keeping 9443 word types
INFO - 06:54:30:PROGRESS: at sentence #20000, processed 128031 words, keeping 14327 word types
INFO - 06:54:30:PROGRESS: at sentence #30000, processed 188835 words, keeping 17378 word types
INFO - 06:54:30:PROGRESS: at sentence #40000, processed 244581 words, keeping 20069 word types
INFO - 06:54:31:collected 22199 word types from a corpus of 296497 raw words and 48668 sentences
INFO - 06:54:31:Loading a fresh vocabulary
INFO - 06:54:31:effective_min_count=20 retains 2118 unique words (9% of original 22199, drops 20081)
INFO - 06:54:31:effective_min_count=20 leaves 232809 word corpus (78% of original 296497, drops 63688)
INFO - 06:54:31:deleting the raw counts dictionary of 22199 items
INFO - 06:54:31:sample=6e-05 downsamples 1275 most-common words
INFO - 06:54

Time to build vocabulary:0.03mins


In [0]:
#Training model
t=time()
w2v_model.train(sentences,total_examples=w2v_model.corpus_count,epochs=30,report_delay=1)
print('Time to train the model:{}mins'.format(round((time()-t)/60,2)))
#Total examples - count the sentences

In [36]:
#Use init_sims() make the model much more effiecient 
w2v_model.init_sims(replace=True)

INFO - 06:55:47:precomputing L2-norms of word weight vectors


In [40]:
#Exploring the model
#Ask our model to find the word most similar to some of most iconoc character
w2v_model=w2v_model.most_similar(positive=["homer"])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


In [43]:
#bigram
w2v_model=w2v.most_similar(positive=["homer_simpson"])

NameError: ignored