In [213]:
import spacy
import en_core_web_sm
from collections import OrderedDict

In [211]:
# TODO: Create a wrapper class with the below functionalities as its methods

# Clean and Tokenise

In [3]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [4]:
sample_claim = "Drake Bell, puting out an EP!"

In [5]:
# Punctuation characters to filter out, kept as one string (string.punctuation).
punctuations = string.punctuation
# Load the small English pipeline through its already-imported package; the
# 'en' shortcut name is no longer accepted by spacy.load in spaCy 3+.
nlp = en_core_web_sm.load()
# Reuse the STOP_WORDS set imported above instead of the long attribute path.
stop_words = STOP_WORDS

In [6]:

def clean_tokenize(sentence, lemmatize=True):
    """Tokenize ``sentence`` and return cleaned, lower-cased token strings.

    Args:
        sentence: Text to tokenize.
        lemmatize: When True (default) each token is replaced by its
            lower-cased, stripped lemma; a token whose lemma equals "-PRON-"
            keeps its lower-cased surface form instead. When False the
            lower-cased surface form is used for every token.

    Returns:
        A list of strings from which stop words (module-level ``stop_words``)
        and punctuation (module-level ``punctuations``) have been removed.
    """
    parser = English()
    doc = parser(sentence)

    if lemmatize:
        words = [
            token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
            for token in doc
        ]
    else:
        # Tokens are spaCy objects, not strings, so read the lower-cased text
        # attribute of each token rather than calling str methods on it.
        words = [token.lower_.strip() for token in doc]

    # Discard the stop words and the punctuation marks. When `punctuations`
    # is a string, `in` is a substring test, which also drops empty strings.
    return [word for word in words if word not in stop_words and word not in punctuations]

In [7]:
clean_tokenize("scaling")

['scale']

# Extracting the POS
For the list of Universal POS tags see:
https://www.sketchengine.eu/universal-pos-tags/

In [8]:
# POS tagging 

In [9]:
sample_claim1 = "Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League."

In [10]:
nlp = en_core_web_sm.load()

In [215]:
def get_POS_tags_dict(text):
    """Tag every token of ``text`` with its part-of-speech label.

    Runs the module-level ``nlp`` pipeline over ``text``.

    Returns:
        A list of ``(token_text, pos_label)`` tuples in token order. Despite
        the function name this is a list of pairs, not a dict; wrap the
        result in ``dict(...)`` when a mapping is needed.
    """
    return [(token.text, token.pos_) for token in nlp(text)]
        

In [217]:
get_POS_tags_dict(sample_claim)

[('Drake', 'PROPN'), ('Bell', 'PROPN'), (',', 'PUNCT'), ('puting', 'VERB'), ('out', 'PART'), ('an', 'DET'), ('EP', 'NOUN'), ('!', 'PUNCT')]


# Extracting the ENTITY
See here for list of entities: https://spacy.io/api/annotation#named-entities

HINT: use spacy.explain("pobj") to get a description of an abbreviation.

In [13]:
from spacy import displacy
from nltk.chunk import conlltags2tree

## Extract the NER

In [14]:
sample_claim2 = "Marnie was directed by someone who was \"The Master of Nothing\"."

In [15]:
def get_enitities_dict(text):
    """Map each named entity found in ``text`` to its entity label.

    Runs the module-level ``nlp`` pipeline over ``text``.

    Returns:
        A dict whose keys are the entity objects yielded by ``doc.ents``
        (not plain strings) and whose values are their ``label_`` strings.
    """
    entity_labels = {}
    for entity in nlp(text).ents:
        entity_labels[entity] = entity.label_
    return entity_labels

In [16]:
get_enitities_dict(sample_claim2)

{Marnie: 'ORG', The Master of Nothing: 'WORK_OF_ART'}

In [17]:
doc = nlp(sample_claim2)
displacy.render(doc, style='ent', jupyter=True)

## Extract the IOB tags

In [18]:
doc = nlp("Next week I'll be in Madrid.")

# Build one (text, fine-grained tag, IOB label) triple per token. Tokens
# outside any entity keep the bare 'O' marker; the others get "<iob>-<type>".
iob_tagged = []
for token in doc:
    if token.ent_iob_ == 'O':
        iob_label = token.ent_iob_
    else:
        iob_label = "{0}-{1}".format(token.ent_iob_, token.ent_type_)
    iob_tagged.append((token.text, token.tag_, iob_label))

print(iob_tagged)

[('Next', 'JJ', 'B-DATE'), ('week', 'NN', 'I-DATE'), ('I', 'PRP', 'O'), ("'ll", 'MD', 'O'), ('be', 'VB', 'O'), ('in', 'IN', 'O'), ('Madrid', 'NNP', 'B-GPE'), ('.', '.', 'O')]


## Chunking and Extracting the Noun Phrase

In [19]:
# Print every noun chunk of the sample claim: chunk text, a readable
# description of the chunk label, and the text of the chunk's root token.
doc = nlp(sample_claim2)
for chunk in doc.noun_chunks:
    # spacy.explain turns the label code into a human-readable description.
    print(chunk.text, ": " ,spacy.explain(chunk.label_),";", chunk.root.text)

Marnie :  noun phrase ; Marnie
someone :  noun phrase ; someone
who :  noun phrase ; who
The Master :  noun phrase ; Master
Nothing :  noun phrase ; Nothing


# Dependency Parsing Using spaCy

In [20]:

doc = nlp("James VI was born in England")

# One line per token: "<token>/<tag> <--<dependency label>-- <head>/<head tag>".
for token in doc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

James/NNP <--compound-- VI/NNP
VI/NNP <--nsubjpass-- born/VBN
was/VBD <--auxpass-- born/VBN
born/VBN <--ROOT-- born/VBN
in/IN <--prep-- born/VBN
England/NNP <--pobj-- in/IN


In [21]:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [22]:
get_enitities_dict("James VI was born in England")

{James VI: 'PERSON', England: 'GPE'}


# Using spaCy to calculate similarity between texts

In [23]:
target = nlp("Alexander is a businessman")
 
doc1 = nlp("He reached an agreement with his mother and brothers that the brothers would have a boxing match and that the winner would receive the father 's property .")
doc2 = nlp("Alexander was defeated.")
doc3 = nlp("He became a member of the Pennsylvania militia and fought for the government in the Whiskey Rebellion in 1791 and 1794 .")
doc4 = nlp("He also became a successful businessman ,and served two terms on the Board of Trustees of the Town of St. Louis , in 1808 and 1813 .")

In [24]:
# Similarity of the target sentence to each candidate, one score per line.
for candidate in (doc1, doc2, doc3, doc4):
    print(target.similarity(candidate))

0.6602821563026594
0.4484390708485616
0.6787924449636612
0.5496646193808755


# Extracting the Keywords for Query of the Claim

## ASSETS

In [25]:
# Wider alternative kept for reference (also admits adjectives and adverbs):
#desired_pos = ["ADJ","ADV", "NOUN", "PROPN", "SYM", "NUM"]
# POS tags whose tokens are kept as query keywords.
desired_pos = ["NOUN", "PROPN", "SYM", "NUM"]
# Named entities are not filtered by type: every entity found is used.

In [26]:
claim = "Advertising is a personal message."

In [27]:
ents = get_enitities_dict(claim)

In [28]:
pos = get_POS_tags_dict(claim)

In [29]:
# get_POS_tags_dict yields (text, tag) pairs; dict() accepts either such a
# list or an existing mapping, so .items() is always available here.
pos_filtered = {word: tag for word, tag in dict(pos).items() if tag in desired_pos}

In [30]:
pos_filtered

{'Advertising': 'NOUN', 'message': 'NOUN'}

In [31]:
# The entity keys are not plain strings; convert each one to its text.
ents_list = [str(item) for item in ents]
# Re-tag the joined entity text to collect the individual words that belong to
# entities. dict() normalises the (text, tag) pairs into a mapping first, so
# this works whether get_POS_tags_dict returns a list of pairs or a dict.
pos_of_ents = list(dict(get_POS_tags_dict(" ".join(ents_list))))

In [32]:
pos_list = list(pos_filtered.keys())

In [33]:
ents

{}

In [34]:
ents_list

[]

In [35]:
other_keywords = [word for word in pos_list if word not in pos_of_ents]

In [36]:
other_keywords

['Advertising', 'message']

In [37]:
" ".join(ents_list + other_keywords)

'Advertising message'

In [38]:
doc = nlp(claim)

# (text, fine-grained tag, IOB label) per token; tokens outside any entity
# keep the bare 'O' marker, the others are labelled "<iob>-<entity type>".
iob_tagged = [
    (
        tok.text,
        tok.tag_,
        tok.ent_iob_ if tok.ent_iob_ == 'O' else "{0}-{1}".format(tok.ent_iob_, tok.ent_type_),
    )
    for tok in doc
]

print(iob_tagged)

[('Advertising', 'NN', 'O'), ('is', 'VBZ', 'O'), ('a', 'DT', 'O'), ('personal', 'JJ', 'O'), ('message', 'NN', 'O'), ('.', '.', 'O')]


In [39]:
# Search the index

In [40]:
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.index import open_dir

# Topic Modelling and extraction
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#18dominanttopicineachsentence


In [169]:
target = "Alexander is a businessman"
 
doc1 = "He reached an agreement with his mother and brothers that the brothers would have a boxing match and that the winner would receive the father 's property ."
doc2 = "Alexander was defeated."
doc3 = "He became a member of the Pennsylvania militia and fought for the government in the Whiskey Rebellion in 1791 and 1794 ."
doc4 = "He also became a successful businessman ,and served two terms on the Board of Trustees of the Town of St. Louis , in 1808 and 1813 ."

In [170]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
# English stop words from NLTK, held in a set for membership tests in clean().
stop = set(stopwords.words('english'))
# Individual punctuation characters that clean() strips from the text.
exclude = set(string.punctuation) 
# WordNet lemmatizer instance used by clean().
lemma = WordNetLemmatizer()
import gensim
from gensim import corpora
def clean(doc):
    """Normalise a raw document string before topic modelling.

    Lower-cases ``doc``, removes whitespace-separated words found in ``stop``,
    deletes every character found in ``exclude``, lemmatizes the remaining
    words with ``lemma`` and returns them joined by single spaces.
    """
    kept_words = [word for word in doc.lower().split() if word not in stop]
    without_punct = "".join(char for char in " ".join(kept_words) if char not in exclude)
    lemmas = [lemma.lemmatize(word) for word in without_punct.split()]
    return " ".join(lemmas)

# Quick check of clean() on the target sentence defined above. `doc_complete`
# is only bound later as a loop variable, so it cannot be used at this point.
doc_clean = [clean(target).split()]

In [171]:
doc_list = [target, doc1, doc2, doc3, doc4]

In [200]:
# Fit one LDA model per document and print its strongest topics.
# NOTE: each document is treated as its own single-document corpus.
NUM_TOPICS = 20   # topics fitted per model
NUM_PASSES = 50   # training passes over the corpus
LDA_SEED = 42     # fixed seed so repeated runs print the same topics
Lda = gensim.models.ldamodel.LdaModel  # loop-invariant alias, bound once

for doc_complete in doc_list:
    doc_clean = [clean(doc_complete).split()]
    # Term dictionary of the corpus: every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)
    # Convert the corpus into a document-term matrix using that dictionary.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

    # Train the LDA model on the document-term matrix.
    ldamodel = Lda(
        doc_term_matrix,
        num_topics=NUM_TOPICS,
        id2word=dictionary,
        passes=NUM_PASSES,
        random_state=LDA_SEED,
    )

    print(ldamodel.print_topics(num_topics=4, num_words=1),"\n")

[(2, '0.500*"alexander"'), (13, '0.500*"alexander"'), (3, '0.500*"alexander"'), (10, '0.500*"alexander"')] 

[(15, '0.083*"agreement"'), (17, '0.083*"agreement"'), (12, '0.140*"brother"'), (11, '0.083*"agreement"')] 

[(0, '0.500*"alexander"'), (3, '0.500*"alexander"'), (1, '0.500*"alexander"'), (14, '0.500*"alexander"')] 

[(6, '0.100*"1791"'), (1, '0.100*"1791"'), (13, '0.100*"1791"'), (16, '0.100*"1791"')] 

[(19, '0.067*"1808"'), (9, '0.067*"1808"'), (15, '0.067*"1808"'), (12, '0.067*"1808"')] 

