<a href="https://colab.research.google.com/github/javed163/NLP-/blob/main/day_5_nlp_word2vec_and_avgword2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd


In [4]:
df = pd.read_csv('/content/drive/MyDrive/Datasets/SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])

In [5]:
df.head(1)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."


In [7]:
df.shape

(5572, 2)

In [8]:
df['message'].iloc[100]

"Please don't text me anymore. I have nothing else to say."

In [9]:
# data cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # porterstemmer is used for stemming purpose
ps = PorterStemmer()
corpus = []
for i in range(len(df)):
  review = re.sub('[^a-zA-Z]',' ', df['message'][i])
  review = review.lower()
  review = review.split()

  reviwe = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [18]:
# creating the bag of word model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, binary=True, ngram_range=(2,2))
x=cv.fit_transform(corpus).toarray()

In [21]:
x[1]
x.shape

(5572, 5000)

In [20]:
y = pd.get_dummies(df['label'])
y = y.iloc[:,1].values

In [22]:
# train test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, stratify=y, random_state=0)

In [23]:
x_train.shape

(4457, 5000)

In [24]:
y_train.shape

(4457,)

In [25]:
x_train, y_train

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 array([False, False,  True, ..., False, False, False]))

In [26]:
from sklearn.naive_bayes import MultinomialNB
spam_detect = MultinomialNB().fit(x_train, y_train)
y_pred = spam_detect.predict(x_test)

In [28]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report
cm = confusion_matrix(y_test, y_pred)
score = accuracy_score(y_test, y_pred)
print(cm)
print(score)
print(classification_report(y_test, y_pred))

[[959   7]
 [ 21 128]]
0.9748878923766816
              precision    recall  f1-score   support

       False       0.98      0.99      0.99       966
        True       0.95      0.86      0.90       149

    accuracy                           0.97      1115
   macro avg       0.96      0.93      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [35]:
# creating the tf-idf model
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(max_features=5000, binary=True, ngram_range=(2,2))
x=tf.fit_transform(corpus).toarray()
y = pd.get_dummies(df['label'])
y = y.iloc[:,1].values

In [38]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, stratify=y, random_state=0)
from sklearn.naive_bayes import MultinomialNB
spam_detect2 = MultinomialNB().fit(x_train, y_train)
y_pred = spam_detect2.predict(x_test)

In [39]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report
cm = confusion_matrix(y_test, y_pred)
score2 = accuracy_score(y_test, y_pred)
print(cm)
print(score)
print(classification_report(y_test, y_pred))

[[963   3]
 [ 42 107]]
0.9748878923766816
              precision    recall  f1-score   support

       False       0.96      1.00      0.98       966
        True       0.97      0.72      0.83       149

    accuracy                           0.96      1115
   macro avg       0.97      0.86      0.90      1115
weighted avg       0.96      0.96      0.96      1115



In [40]:
from sklearn.ensemble import RandomForestClassifier
spam_detect3 = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)
y_pred = spam_detect3.predict(x_test)

In [41]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.967713004484305
[[965   1]
 [ 35 114]]
              precision    recall  f1-score   support

       False       0.96      1.00      0.98       966
        True       0.99      0.77      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



# Word2vec

In [42]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [46]:

!pip install --upgrade --force-reinstall numpy
!pip install --upgrade --force-reinstall scipy
!pip install --upgrade --force-reinstall gensim

Collecting numpy
  Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.4 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.2.4
Collecting scipy
  Downloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86

In [1]:
import gensim


In [2]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer

In [10]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
corpus = []
for i in range(len(df)):
  review = re.sub('[^a-zA-Z]',' ', df['message'][i])
  review = review.lower()
  review = review.split()
  lemmatizer = WordNetLemmatizer()
  review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [16]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [18]:
nltk.download('punkt_tab')
word = []
for send in corpus:
  send_token = sent_tokenize(send)
  for sent in send_token:
    word.append(simple_preprocess(sent))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [19]:
word

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

In [20]:
import gensim

In [21]:
# train the word2c model from stratch
model = gensim.models.Word2Vec(
    word,
    window = 5,
    min_count=2
)

In [22]:
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'day',
 'ok',
 'free',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'love',
 'text',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'please',
 'later',
 'pls',
 'co',
 'msg',
 'min',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'thing',
 'much',
 'great',
 'oh',
 'hope',
 'claim',
 'hey',
 'number',
 'give',
 'happy',
 'work',
 'wat',
 'friend',
 'yes',
 'way',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'tone',
 'win',
 'ask',
 'said',
 'cash',
 'life',
 'amp',
 'im',
 'yeah',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'service',
 'year',
 'thanks',
 'uk',
 'last',
 'would',
 'anything',
 'com',
 'care',
 'lol',
 'nokia',
 'also',
 'feel',
 'every',
 'keep',
 'pick',
 'sure',
 'sent',
 'urgent',
 'contact',


In [23]:
model.corpus_count

5564

In [24]:
model.epochs

5

In [27]:
model.wv.most_similar('new')

[('ur', 0.9996698498725891),
 ('even', 0.9996641874313354),
 ('reply', 0.9996410012245178),
 ('get', 0.9996398687362671),
 ('msg', 0.9996290802955627),
 ('see', 0.9996278285980225),
 ('still', 0.9996021389961243),
 ('week', 0.9996012449264526),
 ('text', 0.9995998740196228),
 ('day', 0.9995966553688049)]

In [28]:
model.wv['hope'].shape

(100,)

In [41]:
import numpy as np

def avg_word2vec(doc):
  # remove out-of-vocabulary words
  # sent  = [word for word in doc if word in model.wv.index_to_key]
  # print(sent)

  # Convert the generator to a list before calculating the mean
  # List comprehension to create a list of word vectors
  word_vectors = [model.wv[word] for word in doc if word in model.wv.index_to_key]

  # Check if any word vectors were found. If not, return a zero vector.
  if word_vectors:
    return np.mean(word_vectors, axis=0)  # Calculate the mean along axis 0 (columns)
  else:
    return np.zeros(model.vector_size) # Return a zero vector with the correct dimensions

In [30]:
!pip install tqdm



In [42]:
from tqdm import tqdm
word[70]

['wah', 'lucky', 'man', 'save', 'money', 'hee']

In [43]:
type(model.wv.index_to_key)

list

In [44]:
x = []
for i in tqdm(range(len(word))):
  x.append(avg_word2vec(word[i]))

100%|██████████| 5564/5564 [00:01<00:00, 4885.44it/s]


In [45]:
type(x)

list

In [46]:
x_new = np.array(x)

In [50]:
word[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [49]:
x_new[0]

array([-0.06334571,  0.22453572,  0.0057351 ,  0.05657792,  0.08351024,
       -0.33654797,  0.09186426,  0.5304777 , -0.23420481, -0.15251127,
       -0.13484715, -0.31090817,  0.01657413,  0.12338823,  0.08866408,
       -0.24748506,  0.02418556, -0.3267208 ,  0.02030593, -0.53020173,
        0.15440251,  0.14621009,  0.11909884, -0.14076388, -0.17832334,
        0.03813374, -0.17394079, -0.15948226, -0.26615188, -0.00526669,
        0.30995172,  0.00910056,  0.06577732, -0.1518895 , -0.1221217 ,
        0.26492485,  0.06550487, -0.19281375, -0.18227617, -0.47600293,
       -0.00326341, -0.24231394, -0.06290128,  0.02658189,  0.21129964,
       -0.03201273, -0.17507626, -0.04372146,  0.08874422,  0.1799707 ,
        0.11448478, -0.19975086, -0.0343962 , -0.02872951, -0.11930385,
        0.11767482,  0.19147407, -0.04852689, -0.32770169,  0.10894575,
        0.07874446,  0.19995619, -0.15209715, -0.02303895, -0.30367461,
        0.12326331,  0.07722906,  0.15549321, -0.33987328,  0.31