# Word Embeddings

Word embeddings are a way to represent words as numerical vectors so that machines can understand and process human language more effectively.

## Bag of Words

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Read the SMS spam CSV from the mounted Drive folder, decoding it as ISO-8859-1.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/GenAi/NLP/spam.csv",
    encoding="ISO-8859-1",
)

In [None]:
# Preview the first five rows of the raw frame to see which columns were loaded.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [None]:
# Summary statistics (count / unique / top / freq) for every column.
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [None]:
# Column dtypes and non-null counts; shows how sparsely the 'Unnamed' columns are filled.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
# Keep only the label (v1) and the message text (v2): the three 'Unnamed'
# columns hold just 50, 12 and 6 non-null values out of 5572 rows.
# errors='ignore' makes the cell safe to re-run once the columns are already
# gone, instead of raising KeyError on the second execution.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Confirm that only the v1 (label) and v2 (text) columns remain.
df.head(5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Class balance: number of messages per label (ham vs. spam).
df.v1.value_counts()

Unnamed: 0_level_0,count
v1,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
# Same class balance expressed as a percentage of all rows in the frame.
df['v1'].value_counts().div(len(df)).mul(100)

Unnamed: 0_level_0,count
v1,Unnamed: 1_level_1
ham,86.593683
spam,13.406317


In [None]:
# Binary target column: 1 where the label is 'spam', 0 for every other value.
df['Spam'] = df['v1'].eq('spam').astype('int64')

In [None]:
# Check the new numeric 'Spam' column against the original text label.
df.head()

Unnamed: 0,v1,v2,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Load a second, independent copy of the same CSV to demonstrate an
# alternative way of encoding the label column.
new_df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/GenAi/NLP/spam.csv",
    encoding="ISO-8859-1",
)

In [None]:
# Drop the three nearly empty 'Unnamed' columns from the second copy as well.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# have been removed is a no-op rather than a KeyError.
new_df = new_df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Encode the label in place of the text: ham -> 0, spam -> 1.
# The result is assigned back to the column rather than calling an inplace
# method on the new_df['v1'] selection: pandas warns that the chained inplace
# form acts on an intermediate copy and will never work from pandas 3.0.
# Re-running the cell is harmless because 0/1 values match neither key.
new_df['v1'] = new_df['v1'].replace({'ham': 0, 'spam': 1})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_df['v1'].replace({'ham':0, 'spam':1}, inplace = True)
  new_df['v1'].replace({'ham':0, 'spam':1}, inplace = True)


In [None]:
# Verify that v1 now holds 0/1 instead of 'ham'/'spam'.
new_df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# For comparison: the original frame keeps the text label and adds a separate 'Spam' column.
df.head()

Unnamed: 0,v1,v2,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# (rows, columns) of the cleaned frame that is used for modelling.
df.shape

(5572, 3)

### ***Train/Test Split***

In [None]:
# Hold out 20% of the messages for evaluation; random_state pins the shuffle so the split is reproducible.
# NOTE(review): the labels are imbalanced (about 13% spam per the value counts earlier in the notebook);
# consider stratify=df.Spam so both splits keep that ratio. Doing so changes the split and every output below.
X_train, X_test, y_train, y_test = train_test_split(df.v2, df.Spam, test_size=0.2, random_state=42)

In [None]:
# Number of training messages.
X_train.shape

(4457,)

In [None]:
# Number of held-out test messages.
X_test.shape

(1115,)

In [None]:
# Peek at the first four training messages (the index shows their original row numbers).
X_train[:4]

Unnamed: 0,v2
1978,No I'm in the same boat. Still here at my moms...
3989,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935,They r giving a second chance to rahul dengra.
4078,O i played smash bros &lt;#&gt; religiously.


In [None]:
# Matching labels for those four messages (same index values as X_train).
y_train[:4]

Unnamed: 0,Spam
1978,0
3989,1
3935,0
4078,0


### **Create a Bag-of-Words representation using CountVectorizer**

In [None]:
# Bag-of-words vectorizer with default settings; it is fitted on the training text in the next cell.
v = CountVectorizer()

In [None]:
# Learn the vocabulary from the training messages only, then encode the test
# messages with that same fitted vocabulary so the test set never influences it.
X_train_cv = v.fit_transform(X_train.values)
x_test_cv = v.transform(X_test)

In [None]:
# Show the sparse-matrix summary (storage format, stored elements, shape).
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 58978 stored elements and shape (4457, 7735)>

In [None]:
# Dense count vector of the first training message.
# Slice the sparse matrix down to row 0 before converting, so only that one
# row is densified instead of the entire document-term matrix.
X_train_cv[:1].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# (number of training messages, vocabulary size).
X_train_cv.shape

(4457, 7735)

In [None]:
# Look up which token is stored at column index 734 of the count matrix.
v.get_feature_names_out()[734]

'9ae'

In [None]:
# Token -> column-index mapping learned during fitting.
# NOTE(review): this displays the whole vocabulary; consider showing only a small sample to keep the output short.
v.vocabulary_

{'no': 4773,
 'in': 3637,
 'the': 6786,
 'same': 5872,
 'boat': 1415,
 'still': 6455,
 'here': 3416,
 'at': 1105,
 'my': 4646,
 'moms': 4549,
 'check': 1758,
 'me': 4390,
 'out': 4988,
 'on': 4912,
 'yo': 7674,
 'half': 3308,
 'naked': 4661,
 'bank': 1218,
 'of': 4869,
 'granite': 3237,
 'issues': 3749,
 'strong': 6494,
 'buy': 1580,
 'explosive': 2741,
 'pick': 5178,
 'for': 2954,
 'our': 4986,
 'members': 4423,
 'up': 7152,
 'over': 5003,
 '300': 419,
 'nasdaq': 4675,
 'symbol': 6639,
 'cdgt': 1701,
 'that': 6781,
 'is': 3739,
 '00': 0,
 'per': 5126,
 'they': 6809,
 'giving': 3157,
 'second': 5948,
 'chance': 1726,
 'to': 6906,
 'rahul': 5526,
 'dengra': 2246,
 'played': 5221,
 'smash': 6217,
 'bros': 1522,
 'lt': 4234,
 'gt': 3271,
 'religiously': 5661,
 'private': 5399,
 'your': 7685,
 '2003': 345,
 'account': 783,
 'statement': 6429,
 '07973788240': 45,
 'shows': 6108,
 '800': 630,
 'un': 7099,
 'redeemed': 5630,
 'points': 5270,
 'call': 1608,
 '08715203649': 122,
 'identifier': 

In [None]:
# Dense NumPy copy of the full training matrix, used in the following cells for inspection.
# NOTE(review): this materialises every zero entry; acceptable at this size, avoid on larger corpora.
X_train_np = X_train_cv.toarray()

In [None]:
# Count vector of the first training message (mostly zeros).
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Column indices of the tokens that actually occur in the first training message.
np.nonzero(X_train_np[0])

(array([1105, 1415, 1758, 3308, 3416, 3637, 4390, 4549, 4646, 4661, 4773,
        4912, 4988, 5872, 6455, 6786, 7674]),)

### Naive Bayes Classifier

In [None]:
# Train a multinomial Naive Bayes classifier on the bag-of-words counts of the training messages.
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [None]:
# Predict labels for the held-out test messages from their count vectors.
y_preds = model.predict(x_test_cv)

In [None]:
# Per-class precision, recall and F1 on the test set (the report is a plain string, hence print).
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### Test on a random data point

In [None]:
# One hand-written message to classify, wrapped in a list because the
# vectorizer expects an iterable of documents. A list (rather than the
# original set literal) keeps input order and duplicates, so predictions stay
# aligned with their messages if more examples are added.
message = ["GENT! We are trying to contact you. Last weekends draw shows that you won a �1000 prize GUARANTEED. Call 09064012160. Claim Code K52. Valid 12hrs only. 150ppm"]

In [None]:
# Encode the message with the already-fitted vocabulary, then classify it
# (the label encoding used for training is 1 = spam, 0 = ham).
message_cnt = v.transform(message)
model.predict(message_cnt)

array([1])