## Natural Language Processing (NLP) using NLTK and SpaCy modules

#### Install packages into Python environment

In [None]:
!pip install nltk
!pip install textblob
!pip install spacy

#### import modules from packages

In [None]:
#from text.blob import TextBlob as tb (would get all of TextBlob)
#from BeautifulSoup import BeautifulSoup (would get all of BeautifulSoup)
#from nltk.stem.porter import PorterStemmer
# from nltk.stem.lancaster import LancasterStemmer
import pandas as pd
from pandas import DataFrame
from itertools import combinations
import numpy as np
from numpy import linalg as LA
import math
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
import string
import re
from nltk.corpus import stopwords	#IMPORT STOPWORDS CORPUS (have to get into lowercase)
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from textblob import Word
import codecs
codecs.open
import io
io.open
import glob
import os

In [None]:
# Convert a collection of raw documents to a matrix of TF-IDF features.
#TF-IDF: Term Frequency-Inverse Document Frequency weights
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# function to remove html tags from a html file
def stripAllTags( html ):
    """Return the text content of an HTML document with all tags removed.

    The original implementation called ``BeautifulSoup``, which this notebook
    never imports, so every non-None call raised ``NameError``.  This version
    relies only on the standard-library HTML parser.

    Parameters
    ----------
    html : str, bytes or None
        Markup to strip.  Bytes are decoded as UTF-8 (undecodable bytes are
        replaced).

    Returns
    -------
    str or None
        The concatenated text nodes, with character references such as
        ``&amp;`` decoded.  ``None`` is returned unchanged.  HTML comments
        are not included in the result.
    """
    if html is None:
        return None

    # Function-scope import: keeps the helper self-contained.  The parameter
    # name `html` does not interfere with importing the `html.parser` module.
    from html.parser import HTMLParser

    class _TextCollector(HTMLParser):
        """Accumulates every text node passed to handle_data."""

        def __init__(self):
            super().__init__(convert_charrefs=True)
            self.parts = []

        def handle_data(self, data):
            self.parts.append(data)

    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='replace')

    collector = _TextCollector()
    collector.feed(html)
    collector.close()  # flush any text still buffered at end of input
    return ''.join(collector.parts)

[SpaCy](https://spacy.io)

In [None]:
import spacy
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

### Tokenization

In [None]:
# Load the small English pipeline and parse one example sentence.
nlp = spacy.load("en_core_web_sm")
sentence = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(sentence)

# One line per token: surface text, coarse part-of-speech tag, dependency label.
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)

In [None]:
# Print the main linguistic attributes of every token, one token per line.
for tok in doc:
    attributes = (tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
                  tok.shape_, tok.is_alpha, tok.is_stop)
    print(*attributes)

![alt text](spacy_lem.png "SpaCy tokens")

In [None]:
# Render the dependency parse of `doc` inline in the notebook.
from spacy import displacy
displacy.render(doc, style="dep", jupyter=True)

### Example: Process full document

Example using SpaCy to process entire document
Here we list the noun phrases through the `doc.noun_chunks` property, pick out the verbs by their part-of-speech tag, and print the named entities.

In [None]:
# Run the pipeline over a multi-sentence passage
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Syntax: noun phrases, then the lemma of every token tagged as a verb
noun_phrases = []
for chunk in doc.noun_chunks:
    noun_phrases.append(chunk.text)
print("Noun phrases:", noun_phrases)

verb_lemmas = [tok.lemma_ for tok in doc if tok.pos_ == "VERB"]
print("Verbs:", verb_lemmas)

# Named entities with their labels, one per line
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
# Build a list with the text of every noun chunk (noun phrase) in `doc`.
[chunk.text for chunk in doc.noun_chunks]

##### Scanning text for parts of speech (pos)

In [None]:
# Show each token next to its coarse-grained part-of-speech tag.
for tok in doc:
    print(tok.text, tok.pos_)

#### load large model and define function to clean text

In [None]:
# Rebind `nlp` to the large English pipeline; every later nlp(...) call uses this model.
nlp = spacy.load("en_core_web_lg")

#### Function definition

The function below lowercases the text, removes stopwords (shown below), punctuation, and pronouns, and returns the remaining lemmas joined by spaces.

In [None]:
def clean(text):
    """Normalise ``text`` before a similarity comparison.

    The text is lower-cased and parsed with the notebook-level ``nlp``
    pipeline.  Stop words (``nlp.Defaults.stop_words``), punctuation,
    whitespace tokens and pronouns are dropped; the lemmas of the remaining
    tokens are returned.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        Lemmas of the kept tokens joined by single spaces.
    """
    doc = nlp(text.lower())
    kept = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue
        # Runs of extra spaces/newlines are tokens of their own; without the
        # is_space check they end up in the output as stray whitespace.
        if token.is_punct or token.is_space:
            continue
        # spaCy v2 lemmatised pronouns to '-PRON-'; v3 no longer does, so the
        # part-of-speech tag is checked as well to keep pronouns out.
        if token.pos_ == 'PRON' or token.lemma_ == '-PRON-':
            continue
        kept.append(token.lemma_)
    return " ".join(kept)

#### show stopwords

In [None]:
# Show the pipeline's default stop-word set, the same set clean() checks tokens against.
print(nlp.Defaults.stop_words)

### Example descriptions

#### Note: RecargaPay and CardUp are startups in the Mastercard Accelerator

In [None]:
# RecargaPay company description, assembled from adjacent string literals.
recargapay = (
    'RecargaPay is a Brazilian company that offers a mobile payment platform and wallet. '
    'It aims to simplify daily transactions such as mobile top-ups, bill payments, gift cards and more.'
)

In [None]:
# CardUp company description, assembled from adjacent string literals.
# The double space after "insurance," is part of the original text and is kept.
cardup = (
    'CardUp is a platform that enables users to pay for big-ticket items, such as rent and insurance,  '
    'sans friction by using a credit card. With each payment, users earn points and miles. '
    'Users can unlock additional benefits such as discounted house moving services, free rent for a month, or vouchers for school supplies. '
    'Uses sign up, schedule their payment, Cardup makes the payment for the specified amount plus a processing fee, '
    'and the user receives a notification when the payment is made.'
)

#### Text similarity with and without clean

Cosine similarity *without* applying clean

In [None]:
# Baseline: similarity of the CardUp and RecargaPay descriptions exactly as written.
nlp(cardup).similarity(nlp(recargapay))

Cosine similarity applying clean

In [None]:
# Same comparison after both descriptions have been passed through clean().
nlp(clean(cardup)).similarity(nlp(clean(recargapay)))

#### now add another to compare

In [None]:
# Tink company description, assembled from adjacent string literals.
tink = (
    'Tink enables banks to become data-driven, capture the opportunities of open banking, '
    'and deliver a more personal, intelligent financial experience to their users. Tink provides APIs '
    "as building blocks to create banking services. The company's products include account aggregation, "
    'categorization, payment initiation, and personal finance management.On June 24th, 2021, '
    'Tink was acquired by Visa at a valuation of $2.15B.'
)

In [None]:
# Raw CardUp description vs. raw Tink description.
nlp(cardup).similarity(nlp(tink))

In [None]:
# Raw RecargaPay description vs. raw Tink description.
nlp(recargapay).similarity(nlp(tink))

In [None]:
# RecargaPay vs. Tink after both descriptions have been passed through clean().
nlp(clean(recargapay)).similarity(nlp(clean(tink)))

#### Another example: Brown dogs, orange dogs, and startups

In [None]:
# Short non-business sentence, compared below against the startup descriptions.
browndog = 'the quick brown dog goes to the zoo'

In [None]:
# A second short sentence about a dog.
orangedog = 'Phineas is a dog'

In [None]:
# Another sentence about Phineas, worded differently.
orangedog2 = 'the dog Phineas is orange'

In [None]:
# Similarity of the two dog sentences as written.
nlp(orangedog).similarity(nlp(browndog))

In [None]:
# Inspect what clean() keeps of the sentence.
clean(orangedog)

In [None]:
# Inspect what clean() keeps of the sentence.
clean(browndog)

In [None]:
# Inspect what clean() keeps of the sentence.
clean(orangedog2)

In [None]:
# Similarity of the two dog sentences after clean().
nlp(clean(orangedog)).similarity(nlp(clean(browndog)))

In [None]:
# Raw CardUp description vs. the brown-dog sentence.
nlp(cardup).similarity(nlp(browndog))

In [None]:
# CardUp description vs. the brown-dog sentence, both passed through clean().
nlp(clean(cardup)).similarity(nlp(clean(browndog)))

In [None]:
# Raw CardUp description vs. the 'Phineas is a dog' sentence.
nlp(cardup).similarity(nlp(orangedog))

In [None]:
# CardUp description vs. the 'Phineas is a dog' sentence, both passed through clean().
nlp(clean(cardup)).similarity(nlp(clean(orangedog)))

#### Exercise: Now try comparing two texts yourself.
- Replace the ellipses (...) below with your own text

In [None]:
# Exercise placeholder: replace the ellipsis with a string before running the comparison cells.
text1 = ...

In [None]:
# Exercise placeholder: replace the ellipsis with a string before running the comparison cells.
text2 = ...

In [None]:
# Similarity of the two exercise texts as written.
nlp(text1).similarity(nlp(text2))

In [None]:
# Similarity of the two exercise texts after clean().
nlp(clean(text1)).similarity(nlp(clean(text2)))

***

#### note: Hyro is a startup in Deloitte Accelerator

In [None]:
# Hyro company description.
# Built from adjacent string literals: with backslash continuations the leading
# indentation of each continued line became part of the string, leaving long
# runs of spaces between sentences.
hyro = (
    'Hyro provides a conversational AI program that '
    'allows businesses to better interact with customers. '
    'Through machine learning, the program keeps customer engagement '
    'at a high level and optimizes digital interactions.'
)

In [None]:
# Parse the cleaned Hyro description and display the resulting Doc.
nlp(clean(hyro))

In [None]:
# Raw CardUp description vs. raw Hyro description.
nlp(cardup).similarity(nlp(hyro))

In [None]:
# CardUp vs. Hyro after both descriptions have been passed through clean().
nlp(clean(cardup)).similarity(nlp(clean(hyro)))

## Research example

#### note: this function was also defined above

In [None]:
def clean(text):
    """Normalise ``text`` before a similarity comparison.

    The text is lower-cased and parsed with the notebook-level ``nlp``
    pipeline.  Stop words (``nlp.Defaults.stop_words``), punctuation,
    whitespace tokens and pronouns are dropped; the lemmas of the remaining
    tokens are returned.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        Lemmas of the kept tokens joined by single spaces.
    """
    doc = nlp(text.lower())
    kept = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue
        # Runs of extra spaces/newlines are tokens of their own; without the
        # is_space check they end up in the output as stray whitespace.
        if token.is_punct or token.is_space:
            continue
        # spaCy v2 lemmatised pronouns to '-PRON-'; v3 no longer does, so the
        # part-of-speech tag is checked as well to keep pronouns out.
        if token.pos_ == 'PRON' or token.lemma_ == '-PRON-':
            continue
        kept.append(token.lemma_)
    return " ".join(kept)

In [None]:
# Files
# Portfolio to analyse.  "nordeavc.csv" is the alternative dataset; it used to
# be assigned first and immediately overwritten, so only this file was ever read.
file = "nordeaacc.csv"
df = pd.read_csv(file, sep=',')

# Missing descriptions must stay strings: clean() calls .lower() on its input,
# which fails on the integer 0 that a blanket fillna(0) would put there.
df['Description'] = df['Description'].fillna('').astype(str)
df = df.fillna(value=0)

# Show one description before and after cleaning.
print(df['Description'][1])
df['Description_processed'] = df['Description'].apply(clean)
print('\nDescription without punctuation and stopwords---\n_______________________')
print(df['Description_processed'][1])

### Now process all companies in portfolio

In [None]:
# Bookkeeping for the pairwise comparison of all companies in `df`.
shape = df.shape
n = shape[0]                       # number of rows (companies)
ix = range(n)                      # row positions 0 .. n-1
cix = list(combinations(ix, 2))    # every unordered pair (i, j) with i < j
num = len(cix)                     # number of pairs
score = np.zeros((n, n))           # n x n score array, all zeros to start
slist = []                         # flat list of pair scores, filled later

In [None]:
# Parse each processed description exactly once.  'Description_processed' is
# already the output of clean(), so it is not cleaned a second time, and the
# parsed documents are reused for every pair instead of being rebuilt per pair.
docs = [nlp(description) for description in df['Description_processed']]

# Start from an empty list so re-running this cell does not duplicate scores.
slist.clear()
for i, j in cix:
    pair_score = docs[i].similarity(docs[j])  # computed once, stored twice
    score[i, j] = pair_score                  # only the i < j half of the matrix is filled
    slist.append(pair_score)

In [None]:
# Quick look at the score matrix: its shape and the first two entries of row 0.
print("Score matrix: ", score.shape)
first_row = score[0]
print("\n", first_row[0], first_row[1])

# Summary statistics over the flat list of pair scores (each pair counted once).
dfscore = pd.DataFrame(slist)
stats_numeric = dfscore.describe()
print("Statistics", stats_numeric, sep="\n")

In [None]:
# Display the frame of pairwise similarity scores.
dfscore

In [None]:
# Histogram of the pairwise cosine similarities, titled with the input file name.
hist_style = dict(bins=25, color='#0504aa', alpha=0.7, rwidth=0.85)
m, bins, patches = plt.hist(slist, **hist_style)

# Axis range and ticks: similarities shown on [0, 1] with a tick every 0.1.
plt.xlim(0, 1.0)
plt.xticks(np.arange(0, 1, step=0.1))
plt.grid(axis='y', alpha=0.75)

# Labels
plt.xlabel('cosine similarity')
plt.ylabel('Frequency')
plt.title(file)

plt.show()