# Environment information

In [1]:
# Report the interpreter and library versions this notebook was executed with.
import sys
print(f"Python: {sys.version}")

import numpy as np
print(f"NumPy: {np.__version__}")

import pandas as pd
print(f"Pandas: {pd.__version__}")

import sklearn
print(f"Scikit-learn: {sklearn.__version__}")

import spacy
print(f"SpaCy: {spacy.__version__}")

Python: 3.6.9 (default, Oct  8 2020, 12:12:24) 
[GCC 8.4.0]
NumPy: 1.19.5
Pandas: 1.1.5
Scikit-learn: 0.24.1
SpaCy: 2.3.5


# Pandas example

In [2]:
# Load the animal-bites CSV; the first line of the file supplies the column names.
df_bites = pd.read_csv(
    "datasets/Health_AnimalBites.csv",
    sep=",",
    quoting=0,  # 0 is csv.QUOTE_MINIMAL
    header=0,
)

# Preview the first rows.
df_bites.head()

Unnamed: 0,bite_date,SpeciesIDDesc,BreedIDDesc,GenderIDDesc,color,vaccination_yrs,vaccination_date,victim_zip,AdvIssuedYNDesc,WhereBittenIDDesc,quarantine_date,DispositionIDDesc,head_sent_date,release_date,ResultsIDDesc
0,1985-05-05 00:00:00,DOG,,FEMALE,LIG. BROWN,1.0,1985-06-20 00:00:00,40229.0,NO,BODY,1985-05-05 00:00:00,UNKNOWN,,,UNKNOWN
1,1986-02-12 00:00:00,DOG,,UNKNOWN,BRO & BLA,,,40218.0,NO,BODY,1986-02-12 00:00:00,UNKNOWN,,,UNKNOWN
2,1987-05-07 00:00:00,DOG,,UNKNOWN,,,,40219.0,NO,BODY,1990-05-07 00:00:00,UNKNOWN,,,UNKNOWN
3,1988-10-02 00:00:00,DOG,,MALE,BLA & BRO,,,,NO,BODY,1990-10-02 00:00:00,UNKNOWN,,,UNKNOWN
4,1989-08-29 00:00:00,DOG,,FEMALE,BLK-WHT,,,,NO,BODY,,UNKNOWN,,,UNKNOWN


In [3]:
# Load the tab-separated Alexa reviews; the first line supplies the column names.
df_alexa = pd.read_csv(
    "datasets/amazon_alexa.tsv",
    sep="\t",
    quoting=0,  # 0 is csv.QUOTE_MINIMAL
    header=0,
)

# Preview the first rows.
df_alexa.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the full reviews frame.
df_alexa.describe()

Unnamed: 0,rating,feedback
count,3150.0,3150.0
mean,4.463175,0.918413
std,1.068506,0.273778
min,1.0,0.0
25%,4.0,1.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,1.0


# Scikit-learn example

In [5]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 split of the reviews; the fixed random_state makes it repeatable.
df_train, df_test = train_test_split(
    df_alexa,
    test_size=0.2,
    shuffle=True,
    random_state=13,
)

In [6]:
# Summary statistics for the training split.
df_train.describe()

Unnamed: 0,rating,feedback
count,2520.0,2520.0
mean,4.453175,0.91746
std,1.07969,0.27524
min,1.0,0.0
25%,4.0,1.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,1.0


In [7]:
# Summary statistics for the held-out test split.
df_test.describe()

Unnamed: 0,rating,feedback
count,630.0,630.0
mean,4.503175,0.922222
std,1.022399,0.268034
min,1.0,0.0
25%,4.0,1.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,1.0


## TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Four toy documents used to demonstrate TF-IDF weighting.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights, and build the document-term matrix.
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
feature_names_getter = (
    getattr(vectorizer, "get_feature_names_out", None)
    or vectorizer.get_feature_names
)
# Normalise to a plain list of str so the printed form is the same for both APIs.
print([str(name) for name in feature_names_getter()])

# (number of documents, vocabulary size)
print(X.shape)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
(4, 9)


### Displaying sparse matrix

In [21]:
# Round a dense copy for display only, so the TF-IDF weights stored in X keep
# their full precision for anything that uses X afterwards.
print(np.round(X.toarray(), 3))

[[0.    0.47  0.58  0.384 0.    0.    0.384 0.    0.384]
 [0.    0.688 0.    0.281 0.    0.539 0.281 0.    0.281]
 [0.512 0.    0.    0.267 0.512 0.    0.267 0.512 0.267]
 [0.    0.47  0.58  0.384 0.    0.    0.384 0.    0.384]]


# spaCy example

In [22]:
# Load the small English pipeline by its package name. The "en" shortcut link
# only exists in spaCy 2.x, whereas the full name works on both 2.x and 3.x.
# Install the model once with: python3 -m spacy download en_core_web_sm
# All components stay enabled here (the tagger, parser and ner are all used
# by the cells that follow).
nlp = spacy.load("en_core_web_sm")

print(type(nlp))
print("Pipeline:", nlp.pipe_names)

<class 'spacy.lang.en.English'>
Pipeline: ['tagger', 'parser', 'ner']


In [11]:
# Run the loaded pipeline over a single sample sentence.
doc = nlp(
    "spaCy is a free, open-source library for advanced "
    "Natural Language Processing (NLP) in Python."
)

print(type(doc))

<class 'spacy.tokens.doc.Doc'>


In [12]:
from spacy import displacy

# Draw the dependency parse of `doc` inline in the notebook.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 130},
)

In [13]:
# Render `doc` with displaCy's entity style ("ent") inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

## List of tokens

In [25]:
# Iterating a Doc yields its tokens in order.
print(list(doc))

[spaCy, is, a, free, ,, open, -, source, library, for, advanced, Natural, Language, Processing, (, NLP, ), in, Python, .]


In [14]:
# One row per token: surface form, lexical flags, and the coarse POS, fine-grained
# tag and dependency label, each paired with spacy.explain()'s description.
# token.text is stored (not the Token object) so the frame holds plain strings.
tableData = ([token.text, token.shape_, token.is_alpha, token.is_stop,
              token.pos_, spacy.explain(token.pos_),
              token.tag_, spacy.explain(token.tag_),
              token.dep_, spacy.explain(token.dep_)] for token in doc)

# token.is_alpha tests for alphabetic characters, hence the "alphabetic" label.
pd.DataFrame(tableData, columns=["text", "shape", "alphabetic", "stop word",
                                 "pos", "Coarse-grained part-of-speech",
                                 "tag", "Fine-grained part-of-speech",
                                 "dep", "Syntactic dependency relation"])

Unnamed: 0,text,shape,alphanumeric,stop word,pos,Coarse-grained part-of-speech,tag,Fine-grained part-of-speech,dep,Syntactic dependency relation
0,spaCy,xxxXx,True,False,X,other,ADD,email,nsubj,nominal subject
1,is,xx,True,True,AUX,auxiliary,VBZ,"verb, 3rd person singular present",ROOT,
2,a,x,True,True,DET,determiner,DT,determiner,det,determiner
3,free,xxxx,True,False,ADJ,adjective,JJ,adjective,amod,adjectival modifier
4,",",",",False,False,PUNCT,punctuation,",","punctuation mark, comma",punct,punctuation
5,open,xxxx,True,False,ADJ,adjective,JJ,adjective,amod,adjectival modifier
6,-,-,False,False,PUNCT,punctuation,HYPH,"punctuation mark, hyphen",punct,punctuation
7,source,xxxx,True,False,NOUN,noun,NN,"noun, singular or mass",compound,compound
8,library,xxxx,True,False,NOUN,noun,NN,"noun, singular or mass",attr,attribute
9,for,xxx,True,True,ADP,adposition,IN,"conjunction, subordinating or preposition",prep,prepositional modifier


## List of noun phrases in the document

In [15]:
# Collect the text of every noun chunk in the document into a one-column frame.
pd.DataFrame({"text": [span.text for span in doc.noun_chunks]})

Unnamed: 0,text
0,"a free, open-source library"
1,advanced Natural Language Processing
2,NLP
3,Python


## List of named entities, phrases and concepts

In [16]:
# One row per named entity: its text and its entity label.
pd.DataFrame(
    [(span.text, span.label_) for span in doc.ents],
    columns=["text", "label"],
)

Unnamed: 0,text,label
0,Natural Language Processing,ORG
1,NLP,ORG
2,Python,GPE
