In [1]:
# Build bag-of-words features for the text data (raw term counts and normalized term counts).
# Compute TF-IDF features for the same data.
# Train Word2Vec embeddings on the tokenized text.

In [2]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the kernel that is running this notebook.
# nltk is included because the notebook imports it for tokenization.
%pip install numpy pandas scikit-learn gensim nltk


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk


In [4]:
# Fetch NLTK's 'punkt' tokenizer data into the local nltk_data directory.
# The call's return value is the last expression of the cell, so it is echoed
# as the cell output (True in the recorded run).
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Toy corpus: three short sentences, one document per string.
# Swap in your own list of strings to run the notebook on a real dataset.
documents = [
    "AI is transforming the future.",
    "Machine learning is a part of AI.",
    "Natural language processing is useful for AI applications.",
]



In [6]:
# Wrap the corpus in a one-column DataFrame: each row of "Text" holds one document.
df = pd.DataFrame({"Text": documents})


In [7]:
# Bag-of-words: learn the vocabulary from the corpus first, then encode every
# document as a sparse row of raw term counts (one column per vocabulary term).
vectorizer = CountVectorizer()
vectorizer.fit(df["Text"])
bow_matrix = vectorizer.transform(df["Text"])


In [8]:
# Densify the count matrix and label its columns with the learned vocabulary
# so the table is readable when printed.
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
# One print call; sep="\n" puts the heading and the table on separate lines.
print("Bag-of-Words (Raw Count):", bow_df, sep="\n")


Bag-of-Words (Raw Count):
   ai  applications  for  future  is  language  learning  machine  natural  \
0   1             0    0       1   1         0         0        0        0   
1   1             0    0       0   1         0         1        1        0   
2   1             1    1       0   1         1         0        0        1   

   of  part  processing  the  transforming  useful  
0   0     0           0    1             1       0  
1   1     1           0    0             0       0  
2   0     0           1    0             0       1  


In [9]:
from sklearn.preprocessing import normalize



In [10]:
# L1-normalize along axis=1 (per document row), turning raw counts into
# relative frequencies that sum to 1 within each document.
normalized_bow = normalize(bow_matrix, norm="l1", axis=1)
# The result is still sparse; convert it to a dense array before handing it
# to pandas so the data lines up with the vocabulary column labels.
normalized_bow_dense = normalized_bow.toarray()
normalized_bow_df = pd.DataFrame(
    data=normalized_bow_dense,
    columns=vectorizer.get_feature_names_out(),
)
# The heading keeps its leading newline so a blank line precedes it in the output.
print("\nBag-of-Words (Normalized Count):", normalized_bow_df, sep="\n")



Bag-of-Words (Normalized Count):
         ai  applications    for  future        is  language  learning  \
0  0.200000         0.000  0.000     0.2  0.200000     0.000  0.000000   
1  0.166667         0.000  0.000     0.0  0.166667     0.000  0.166667   
2  0.125000         0.125  0.125     0.0  0.125000     0.125  0.000000   

    machine  natural        of      part  processing  the  transforming  \
0  0.000000    0.000  0.000000  0.000000       0.000  0.2           0.2   
1  0.166667    0.000  0.166667  0.166667       0.000  0.0           0.0   
2  0.000000    0.125  0.000000  0.000000       0.125  0.0           0.0   

   useful  
0   0.000  
1   0.000  
2   0.125  


In [11]:
# TF-IDF: fit the vectorizer on the corpus (vocabulary + document frequencies),
# then transform the same documents into TF-IDF weighted rows.
tfidf_vectorizer = TfidfVectorizer().fit(df["Text"])
tfidf_matrix = tfidf_vectorizer.transform(df["Text"])


In [12]:
# Densify the TF-IDF matrix and label its columns with the vectorizer's
# vocabulary so the weights can be read as a table.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
# The heading keeps its leading newline so a blank line precedes it in the output.
print("\nTF-IDF Representation:", tfidf_df, sep="\n")



TF-IDF Representation:
         ai  applications       for   future        is  language  learning  \
0  0.307144      0.000000  0.000000  0.52004  0.307144  0.000000  0.000000   
1  0.272499      0.000000  0.000000  0.00000  0.272499  0.000000  0.461381   
2  0.228215      0.386401  0.386401  0.00000  0.228215  0.386401  0.000000   

    machine   natural        of      part  processing      the  transforming  \
0  0.000000  0.000000  0.000000  0.000000    0.000000  0.52004       0.52004   
1  0.461381  0.000000  0.461381  0.461381    0.000000  0.00000       0.00000   
2  0.000000  0.386401  0.000000  0.000000    0.386401  0.00000       0.00000   

     useful  
0  0.000000  
1  0.000000  
2  0.386401  


In [13]:
# Download the 'punkt_tab' tokenizer resource as well, before word_tokenize is
# called in the following cell.
# NOTE(review): recent NLTK releases appear to load 'punkt_tab' rather than
# 'punkt' for word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [14]:
# Prepare Word2Vec input: lowercase each document and split it into tokens,
# giving one list of tokens per document.
# NOTE(review): word_tokenize presumably keeps punctuation (e.g. the final ".")
# as separate tokens, so those would enter the Word2Vec vocabulary — confirm.
tokenized_text = list(map(lambda text: word_tokenize(text.lower()), df["Text"]))


In [15]:
# Train Word2Vec embeddings on the tokenized corpus.
#   vector_size=100 -> each word is mapped to a 100-dimensional vector
#   window=5        -> context window size passed to the model
#   min_count=1     -> keep every token, even those seen once (the corpus is tiny)
#   seed=1          -> RNG seed stated explicitly (this is gensim's default value)
#   workers=1       -> single training thread; gensim documents that training
#                      with several worker threads is not deterministic because
#                      of OS thread scheduling, and extra threads give no
#                      benefit on a corpus this small
# NOTE: gensim also documents that reproducibility across interpreter launches
# requires a fixed PYTHONHASHSEED environment variable.
word2vec_model = Word2Vec(
    sentences=tokenized_text,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)


In [16]:
# Look up the embedding of a single word. Tokens were lowercased before
# training, so the query word is written in lowercase too.
word = "ai"
if word not in word2vec_model.wv:
    # Guard first: report a word that is missing from the trained vocabulary.
    print(f"\nWord '{word}' not found in Word2Vec vocabulary.")
else:
    print(f"\nWord2Vec vector for '{word}':")
    print(word2vec_model.wv[word])



Word2Vec vector for 'ai':
[ 9.4563962e-05  3.0773198e-03 -6.8126451e-03 -1.3754654e-03
  7.6685809e-03  7.3464094e-03 -3.6732971e-03  2.6427018e-03
 -8.3171297e-03  6.2054861e-03 -4.6373224e-03 -3.1641065e-03
  9.3113566e-03  8.7338570e-04  7.4907029e-03 -6.0740625e-03
  5.1605068e-03  9.9228229e-03 -8.4573915e-03 -5.1356913e-03
 -7.0648370e-03 -4.8626517e-03 -3.7785638e-03 -8.5361991e-03
  7.9556061e-03 -4.8439382e-03  8.4236134e-03  5.2625705e-03
 -6.5500261e-03  3.9578713e-03  5.4701497e-03 -7.4265362e-03
 -7.4057197e-03 -2.4752307e-03 -8.6257253e-03 -1.5815723e-03
 -4.0343284e-04  3.2996845e-03  1.4418805e-03 -8.8142155e-04
 -5.5940580e-03  1.7303658e-03 -8.9737179e-04  6.7936908e-03
  3.9735902e-03  4.5294715e-03  1.4343059e-03 -2.6998555e-03
 -4.3668128e-03 -1.0320747e-03  1.4370275e-03 -2.6460087e-03
 -7.0737829e-03 -7.8053069e-03 -9.1217868e-03 -5.9351693e-03
 -1.8474245e-03 -4.3238713e-03 -6.4606704e-03 -3.7173224e-03
  4.2891586e-03 -3.7390434e-03  8.3781751e-03  1.5339935e-