<a href="https://colab.research.google.com/github/lakshmiMadhuriYalamanchi/DeepLearning/blob/master/SpacyWordEmbeddingsNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import spacy
import spacy.cli

# The large English model is required because it ships the word vectors.
# Try to load it first and only download when it is not installed yet, so
# re-running this cell does not fetch the package again.
try:
    nlp = spacy.load('en_core_web_lg')
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load('en_core_web_lg')


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
# (Re)load the large English pipeline; this repeats the load done in the
# setup cell, so it only matters if that earlier load did not succeed.
nlp = spacy.load("en_core_web_lg")

In [0]:
# Only the word vectors are needed here, so every pipeline component is
# switched off while the text is tokenized, which speeds this part up a bit.
# disable_pipes() called without names disables nothing, so the component
# names have to be passed explicitly.
text = "These vectors can be used as features for machine learning models."
with nlp.disable_pipes(*nlp.pipe_names):
    # Stack the per-token vectors into one 2-D array (one row per token).
    vectors = np.array([token.vector for token in nlp(text)])

In [0]:
# Display the array dimensions: one row per token of `text`, one column per
# component of the token vector.
vectors.shape

(12, 300)

In [0]:
import pandas as pd

# Loading the spam data
# ham is the label for non-spam messages
spam = pd.read_csv('spam.csv')

# Only the document vectors are needed, so all pipeline components are
# disabled by name (disable_pipes() without names disables nothing).
# nlp.pipe streams the messages through the pipeline in batches instead of
# calling nlp() once per message.
with nlp.disable_pipes(*nlp.pipe_names):
    doc_vectors = np.array([doc.vector for doc in nlp.pipe(spam.text)])

# One row per message, one column per vector component.
doc_vectors.shape

(5572, 300)

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the messages for evaluation; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    doc_vectors,
    spam.label,
    test_size=0.1,
    random_state=1,
)

In [0]:

from sklearn.svm import LinearSVC

# dual=False is used to speed up training; the dual formulation is not
# needed for this problem.
svc = LinearSVC(
    random_state=1,
    dual=False,
    max_iter=10000,
)
svc.fit(X_train, y_train)

# score() returns the fraction of correctly classified held-out messages.
svc_accuracy = svc.score(X_test, y_test)
print(f"Accuracy: {svc_accuracy * 100:.3f}%")


Accuracy: 97.312%


In [0]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        1-D vectors of equal length. Anything ``np.asarray`` accepts works,
        so plain lists are fine as well as NumPy arrays.

    Returns
    -------
    scalar
        ``a.dot(b) / (|a| * |b|)``. When either vector has zero length the
        similarity is undefined; ``0.0`` is returned in that case instead of
        dividing by zero and producing ``nan``.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # sqrt((a.a) * (b.b)) is the product of the two Euclidean norms.
    norm_product = np.sqrt(a.dot(a) * b.dot(b))
    if norm_product == 0:
        return 0.0
    return a.dot(b) / norm_product

In [0]:
# Embed two differently worded versions of the same message and compare
# their document vectors.
a, b = (
    nlp(message).vector
    for message in ("REPLY NOW FOR FREE TEA", "replied for free tea now")
)
cosine_similarity(a, b)

0.94943833

In [0]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour classifier that looks at a single neighbour.
knn = KNeighborsClassifier(n_neighbors=1)

In [0]:
# Fit on the training vectors; the fitted estimator is the cell's output.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [0]:
# Predicted labels for the held-out messages; reused by the confusion matrix
# and classification report cell.
pred = knn.predict(X_test)

In [0]:
# Rebuild and refit the 1-nearest-neighbour model, then measure how often its
# predictions disagree with the true test labels.
knn = KNeighborsClassifier(n_neighbors=1)
pred_i = knn.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator
error_rate = (pred_i != y_test).mean()
print('Accuracy: ' + str(round(1 - error_rate, 5)))

Accuracy: 0.96416


In [0]:
# Confusion matrix first, then the per-class precision/recall/F1 table, both
# computed from the KNN predictions stored in `pred`.
print(
    confusion_matrix(y_test, pred),
    classification_report(y_test, pred),
    sep="\n",
)

[[479  11]
 [  9  59]]
              precision    recall  f1-score   support

         ham       0.98      0.98      0.98       490
        spam       0.84      0.87      0.86        68

    accuracy                           0.96       558
   macro avg       0.91      0.92      0.92       558
weighted avg       0.96      0.96      0.96       558

