In [1]:
from spacy.en import English
parser = English()

multiSentence = "There is an art, it says, or rather, a knack to flying." \
                 "The knack lies in learning how to throw yourself at the ground and miss." \
                 "In the beginning the Universe was created. This has made a lot of people "\
                 "very angry and been widely regarded as a bad move."

# TOKENS,POS

In [2]:
parsedData = parser(multiSentence)
parsedData

There is an art, it says, or rather, a knack to flying.The knack lies in learning how to throw yourself at the ground and miss.In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move.

In [3]:

for i, token in enumerate(parsedData):
    print("original:", token.orth, token.orth_)
    print("lowercased:", token.lower, token.lower_)
    print("lemma:", token.lemma, token.lemma_)
    print("shape:", token.shape, token.shape_)
    print("prefix:", token.prefix, token.prefix_)
    print("suffix:", token.suffix, token.suffix_)
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
    if i > 10:
        break

original: 769 There
lowercased: 608 there
lemma: 608 there
shape: 684 Xxxxx
prefix: 568 T
suffix: 609 ere
log probability: -7.277902603149414
Brown cluster id: 1918
----------------------------------------
original: 513 is
lowercased: 513 is
lemma: 536 be
shape: 505 xx
prefix: 509 i
suffix: 513 is
log probability: -4.3297648429870605
Brown cluster id: 762
----------------------------------------
original: 591 an
lowercased: 591 an
lemma: 591 an
shape: 505 xx
prefix: 506 a
suffix: 591 an
log probability: -5.953293800354004
Brown cluster id: 3
----------------------------------------
original: 879 art
lowercased: 879 art
lemma: 879 art
shape: 502 xxx
prefix: 506 a
suffix: 879 art
log probability: -9.778430938720703
Brown cluster id: 633
----------------------------------------
original: 450 ,
lowercased: 450 ,
lemma: 450 ,
shape: 450 ,
prefix: 450 ,
suffix: 450 ,
log probability: -3.3914804458618164
Brown cluster id: 4
----------------------------------------
original: 519 it
lowercased:

In [6]:
sents = []

for span in parsedData.sents:
    
    sent = ''.join(parsedData[i].string for i in range(span.start, span.end)).strip()
    sents.append(sent)

for sentence in sents:
    print(sentence)

There is an art, it says, or rather, a knack to flying.
The knack lies in learning how to throw yourself at the ground and miss.
In the beginning the Universe was created.
This has made a lot of people very angry and been widely regarded as a bad move.


In [7]:
for span in parsedData.sents:
    sent = [parsedData[i] for i in range(span.start, span.end)]
    break

for token in sent:
    print(token.orth_, token.pos_)

There ADV
is VERB
an DET
art NOUN
, PUNCT
it PRON
says VERB
, PUNCT
or CCONJ
rather ADV
, PUNCT
a DET
knack NOUN
to ADP
flying NOUN
. PUNCT


In [8]:
example = "The boy with the spotted dog quickly ran after the firetruck."
parsedEx = parser(example)
for token in parsedEx:
    print(token.orth_, token.dep_, token.head.orth_, [t.orth_ for t in token.lefts], [t.orth_ for t in token.rights])

The det boy [] []
boy nsubj ran ['The'] ['with']
with prep boy [] []
the det dog [] []
spotted amod dog [] []
dog nsubj ran ['the', 'spotted'] []
quickly advmod ran [] []
ran ROOT ran ['boy', 'dog', 'quickly'] ['after', '.']
after prep ran [] ['firetruck']
the det firetruck [] []
firetruck pobj after ['the'] []
. punct ran [] []


In [10]:
example = "Apple's stocks dropped dramatically after the death of Steve Jobs in October."
parsedEx = parser(example)
for token in parsedEx:
    print(token.orth_, token.ent_type_ if token.ent_type_ != "" else "(not an entity)")

ents = list(parsedEx.ents)
for entity in ents:
    print(entity.label, entity.label_, ' '.join(t.orth_ for t in entity))

Apple ORG
's (not an entity)
stocks (not an entity)
dropped (not an entity)
dramatically (not an entity)
after (not an entity)
the (not an entity)
death (not an entity)
of (not an entity)
Steve PERSON
Jobs PERSON
in (not an entity)
October DATE
. (not an entity)
380 ORG Apple
377 PERSON Steve Jobs
387 DATE October


In [11]:
messyData = "lol that is rly funny :) This is gr8 i rate it 8/8!!!"
parsedData = parser(messyData)
for token in parsedData:
    print(token.orth_, token.pos_, token.lemma_)
    


lol NOUN lol
that ADJ that
is VERB be
rly ADV rly
funny ADJ funny
:) PUNCT :)
This DET this
is VERB be
gr8 VERB gr8
i PRON i
rate VERB rate
it PRON -PRON-
8/8 NUM 8/8
! PUNCT !
! PUNCT !
! PUNCT !


In [17]:
from numpy import dot
from numpy.linalg import norm

nasa = parser.vocab['NASA']

cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))

allWords = list({w for w in parser.vocab if w.has_vector and w.orth_.islower() and w.lower_ != "nasa"})

allWords.sort(key=lambda w: cosine(w.vector, nasa. vector))
allWords.reverse()
print("Top 20 most similar words to NASA:")
for word in allWords[:20]:   
    print(word.orth_)
    

king = parser.vocab['king']
man = parser.vocab['man']
woman = parser.vocab['woman']

result = king.vector - man.vector + woman.vector

allWords = list({w for w in parser.vocab if w.has_vector and w.orth_.islower() and w.lower_ != "king" and w.lower_ != "man" and w.lower_ != "woman"})
allWords.sort(key=lambda w: cosine(w.vector, result))
allWords.reverse()
print("\n----------------------------\nTop 3 closest results for king - man + woman:")
for word in allWords[:3]:   
    print(word.orth_)
    
# it got it! Queen!



Top 20 most similar words to NASA:
consistency
hotter
journey
featured
ref
puppies
artwork
crystal
defenses
helmet
slowed
promotion
similarities
boner
flex
flew
adapter
acquire
quarters
sizes

----------------------------
Top 3 closest results for king - man + woman:
queen
kings
princess


[('he', 'shot', 'me'), ('he', 'shot', 'sister'), ('brother', 'shot', 'me'), ('brother', 'shot', 'sister')]
[('watches', 'are', 'idea'), ('which', 'was', 'this'), ('it', 'was', 'pieces')]


# machine learning example with text

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import string
import re

STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

class CleanTextTransformer(TransformerMixin):
  

    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
    
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    text = mentionFinder.sub("@MENTION", text)
    
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    
    # lowercase
    text = text.lower()

    return text

def tokenizeText(sample):

    tokens = parser(sample)

    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    tokens = [tok for tok in tokens if tok not in STOPLIST]

    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens

# def printNMostInformative(vectorizer, clf, N):
#     feature_names = vectorizer.get_feature_names()
#     coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
#     topClass1 = coefs_with_fns[:N]
#     topClass2 = coefs_with_fns[:-(N + 1):-1]
#     print("Class 1 best: ")
#     for feat in topClass1:
#         print(feat)
#     print("Class 2 best: ")
#     for feat in topClass2:
#         print(feat)

vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))
clf = LinearSVC()
model = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])

train = ["I love space. Space is great.", "Planets are cool. I am glad they exist in space", "lol @twitterdude that is gr8", 
        "twitter &amp; reddit are fun.", "Mars is a planet. It is red.", "@Microsoft: y u skip windows 9?", "Rockets launch from Earth and go to other planets.",
        "twitter social media &gt; &lt;", "@someguy @somegirl @twitter #hashtag", "Orbiting the sun is a little blue-green planet."]
labelsTrain = ["space", "space", "twitter", "twitter", "space", "twitter", "space", "twitter", "twitter", "space"]

test = ["i h8 riting comprehensibly #skoolsux", "planets and stars and rockets and stuff"]
labelsTest = ["twitter", "space"]

model.fit(train, labelsTrain)

preds = model.predict(test)
print("----------------------------------------------------------------------------------------------")
print("results:")
for (sample, pred) in zip(test, preds):
    print(sample, ":", pred)
print("accuracy:", accuracy_score(labelsTest, preds))

print("----------------------------------------------------------------------------------------------")
print("Top 10 features used to predict: ")
# show the top features
printNMostInformative(vectorizer, clf, 10)

print("----------------------------------------------------------------------------------------------")
print("The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc")
# let's see what the pipeline was transforming the data into
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)])
transform = pipe.fit_transform(train, labelsTrain)

# get the features that the vectorizer learned (its vocabulary)
vocab = vectorizer.get_feature_names()

# the values from the vectorizer transformed data (each item is a row,column index with value as # times occuring in the sample, stored as a sparse matrix)
for i in range(len(train)):
    s = ""
    indexIntoVocab = transform.indices[transform.indptr[i]:transform.indptr[i+1]]
    numOccurences = transform.data[transform.indptr[i]:transform.indptr[i+1]]
    for idx, num in zip(indexIntoVocab, numOccurences):
        s += str((vocab[idx], num))
    print("Sample {}: {}".format(i, s))

----------------------------------------------------------------------------------------------
results:
i h8 riting comprehensibly #skoolsux : twitter
planets and stars and rockets and stuff : space
accuracy: 1.0
----------------------------------------------------------------------------------------------
Top 10 features used to predict: 
Class 1 best: 
(-0.5317457349252519, 'planet')
(-0.35387821917377232, 'space')
(-0.21950314690393238, 'mar')
(-0.21950314690393238, 'red')
(-0.15678762293683152, 'earth')
(-0.15678762293683152, 'launch')
(-0.15678762293683152, 'rocket')
(-0.14909864994270897, 'great')
(-0.14909864994270897, 'love')
(-0.099774045796133873, 'blue')
Class 2 best: 
(0.40866490475230544, 'twitter')
(0.35268312478964314, '@mention')
(0.22672529221454771, 'lol')
(0.22672529221454771, 'gr8')
(0.20433286130934575, 'social')
(0.20433286130934575, 'medium')
(0.20433204344295969, 'reddit')
(0.20433204344295969, 'fun')
(0.12595783257509538, 'window')
(0.12595783257509538, 'u')
--