In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp("Tea is healthy and calming, don't you think?")

In [53]:
for token in doc:
    print('{} | {} | {}'.format(token.text, token.lemma_, token.is_stop))

Tea | tea | False
is | be | True
healthy | healthy | False
and | and | True
calming | calm | False
, | , | False
do | do | True
n't | n't | True
you | you | True
think | think | False
? | ? | False


In [40]:
from spacy.matcher import PhraseMatcher

In [41]:
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

In [42]:
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']

In [43]:
patterns = [nlp(term) for term in terms]

In [44]:
matcher.add('PhoneList', patterns)

In [45]:
text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
               "photography tests pitting the iPhone 11 Pro against the "
               "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.") 

In [46]:
matches = matcher(text_doc)

In [47]:
print(matches)

[(16260756990894137545, 17, 19), (16260756990894137545, 22, 24), (16260756990894137545, 30, 32), (16260756990894137545, 33, 35)]


In [48]:
for match_id, start, end in matches:
    print(nlp.vocab.strings[match_id], text_doc[start:end])

PhoneList iPhone 11
PhoneList Galaxy Note
PhoneList iPhone XS
PhoneList Google Pixel


In [54]:
import pandas as pd

In [55]:
spam = pd.read_csv('./datasets/spam.csv')

In [56]:
spam.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [57]:
nlp = spacy.blank('en')

In [58]:
textcat = nlp.add_pipe('textcat')

In [59]:
textcat.add_label('ham')

1

In [60]:
textcat.add_label('spam')

1

In [61]:
train_texts = spam['text'].values

In [62]:
len(train_texts)

5572

In [63]:
spam.size

11144

In [64]:
train_cats = [
    {
        'cats': {
            'ham': label == 'ham',
            'spam': label == 'spam',
        }
    } for label in spam['label']
]

In [66]:
train_data = list(zip(train_texts, train_cats))

In [67]:
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [68]:
from spacy.util import minibatch

In [69]:
from spacy.training.example import Example

In [71]:
spacy.util.fix_random_seed(1)

In [72]:
optimizer = nlp.begin_training()

In [73]:
batches = minibatch(train_data, size=8)

In [91]:
for batch in batches:
    for text, labels in batch:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, labels)
        nlp.update([example], sgd=optimizer)

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


In [75]:
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        for text, labels in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, labels)
            nlp.update([example], sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 295.011506840551}
{'textcat': 433.77393580817477}
{'textcat': 517.9740727749643}
{'textcat': 573.5902575228944}
{'textcat': 605.8892598315485}
{'textcat': 629.7489943420239}
{'textcat': 666.9426019923563}
{'textcat': 687.974971471699}
{'textcat': 708.4414132585675}
{'textcat': 724.0049989437431}


In [76]:
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]

In [87]:
docs = [nlp.tokenizer(text) for text in texts]
print(docs)

[Are you ready for the tea party????? It's gonna be wild, URGENT Reply to this message for GUARANTEED FREE TEA]


In [78]:
textcat = nlp.get_pipe('textcat')
scores = textcat.predict(docs)

In [79]:
print(scores)

[[9.9999714e-01 2.8207435e-06]
 [3.3330774e-01 6.6669220e-01]]


In [88]:
predicted_labels = scores.argmax(axis=1)
print(predicted_labels)

[0 1]


In [81]:
print([textcat.labels[label] for label in predicted_labels])

['ham', 'spam']


In [101]:
import numpy as np
bool_list = np.array([False, True,  True,  True, False,  True,  True,  True,  True,  True,  True,  True])

In [102]:
bool_list.mean()

0.8333333333333334

In [103]:
nlp = spacy.load('en_core_web_lg')

In [104]:
text = "These vectors can be used as features for machine learning models."
with nlp.disable_pipes():
    vectors = np.array([token.vector for token in  nlp(text)])

In [105]:
print(vectors)

[[-0.1965    -0.13995   -0.52495   ... -0.097467   0.34578   -0.14233  ]
 [-0.25205   -0.16047   -0.6089    ...  0.19218   -0.40028    0.51894  ]
 [-0.23857    0.35457   -0.30219   ... -0.35283    0.41888    0.13168  ]
 ...
 [ 0.047511   0.1404    -0.11736   ...  0.03169   -0.14208    0.42548  ]
 [ 0.0065037  0.2064     0.0089077 ...  0.033444  -0.030121  -0.12998  ]
 [ 0.012001   0.20751   -0.12578   ...  0.13871   -0.36049   -0.035    ]]


In [106]:
with nlp.disable_pipes():
    doc_vectors = np.array([nlp(text).vector for text in spam.text])
    
doc_vectors.shape

(5572, 300)

In [107]:
from sklearn.model_selection import train_test_split

In [108]:
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, spam.label,
                                                    test_size=0.1, random_state=1)

In [109]:
from sklearn.svm import LinearSVC

In [110]:
svc = LinearSVC(random_state=1, dual=False, max_iter=10000)

In [111]:
svc.fit(X_train, y_train)

LinearSVC(dual=False, max_iter=10000, random_state=1)

In [112]:
print(f"Accuracy: {svc.score(X_test, y_test) * 100:.3f}%", )

Accuracy: 97.670%


In [118]:
svc.predict(X_test[0].reshape(1, -1))

array(['ham'], dtype=object)

In [119]:
def cosine_similarity(a, b):
    return a.dot(b)/np.sqrt(a.dot(a) * b.dot(b))