<a href="https://colab.research.google.com/github/jaiswalgaurav012002/deep-learning-lab-work/blob/main/LAB3_Advancements_in_Ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install scikit-learn nltk




In [15]:
import numpy as np
import re
import string

def load_glove_embeddings(file_path):
    """
    Load GloVe embeddings from a file.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

def preprocess_text(text):
    """
    Preprocess the input text by performing the following steps:
    1. Convert text to lowercase
    2. Remove special characters and punctuation
    3. Remove extra whitespace
    """
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def get_text_embedding(text, embeddings_index, embedding_dim=50):
    """
    Obtain the embedding vector for the input text.
    """
    tokens = preprocess_text(text).split()
    embedding_matrix = np.zeros((len(tokens), embedding_dim))

    for i, token in enumerate(tokens):
        if token in embeddings_index:
            embedding_matrix[i] = embeddings_index[token]

    # Return the average embedding of all tokens (or handle as needed)
    return np.mean(embedding_matrix, axis=0)

def main():
    # File paths
    glove_file_path = 'glove.6B.50d.txt'  # Update with the path to your GloVe file

    # Load GloVe embeddings
    embeddings_index = load_glove_embeddings(glove_file_path)

    # Example text
    example_text = "The quick brown fox jumps over the lazy dog."

    # Get text embedding
    text_embedding = get_text_embedding(example_text, embeddings_index)

    # Print result
    print("Text embedding:")
    print(text_embedding)

if __name__ == 'main':
    main()

In [16]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample documents
documents = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "Cats and dogs are common pets.",
    "Mat and log are types of surfaces.",
    "Pets are animals kept by humans."
]

# Preprocessing function to tokenize and remove stopwords
def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha()]  # Remove punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Preprocess all documents
preprocessed_docs = [preprocess(doc) for doc in documents]

# Compute TF-IDF matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_docs)

# Compute pairwise cosine similarity between documents
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Display results
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())
print("\nCosine Similarity Matrix:")
print(cosine_sim)

# Function to find the most similar document to a given document
def find_most_similar_document(index):
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    most_similar_index = sim_scores[1][0]  # Exclude self-comparison
    return most_similar_index

# Test finding the most similar document to the first document
most_similar = find_most_similar_document(0)
print(f"\nThe most similar document to document 0 is document {most_similar}.")


TF-IDF Matrix:
[[0.         0.659118   0.         0.         0.         0.
  0.         0.         0.         0.53177225 0.         0.53177225
  0.         0.        ]
 [0.         0.         0.         0.         0.659118   0.
  0.         0.         0.53177225 0.         0.         0.53177225
  0.         0.        ]
 [0.         0.         0.52335825 0.52335825 0.         0.52335825
  0.         0.         0.         0.         0.42224214 0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.44400208 0.44400208 0.         0.
  0.55032913 0.55032913]
 [0.52335825 0.         0.         0.         0.         0.
  0.52335825 0.52335825 0.         0.         0.42224214 0.
  0.         0.        ]]

Cosine Similarity Matrix:
[[1.         0.28278173 0.         0.23610799 0.        ]
 [0.28278173 1.         0.         0.23610799 0.        ]
 [0.         0.         1.         0.         0.17828843]
 [0.23610799 0.23610799 0.        

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_percentage(doc1, doc2):
    # Create a TfidfVectorizer to convert the documents to vectors
    vectorizer = TfidfVectorizer()

    # Fit and transform the documents into vector form
    vectors = vectorizer.fit_transform([doc1, doc2])

    # Compute the cosine similarity between the two vectors
    cos_sim = cosine_similarity(vectors[0], vectors[1])[0][0]

    # Return the cosine similarity as a percentage
    return cos_sim * 100

# Example usage
doc1 = "This is the first document."
doc2 = "This document is the second one."

similarity_percentage = cosine_similarity_percentage(doc1, doc2)
print(f"The documents are {similarity_percentage:.2f}% similar.")


The documents are 58.03% similar.


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('/content/SMSSpamCollection.txt', sep='\t', header=None, names=['label', 'message'])

# Split the dataset into training and testing sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.5, random_state=42, stratify=data['label'])

# Save the two datasets
train_data.to_csv('/content/sms_train.csv', index=False, header=True)
test_data.to_csv('/content/sms_test.csv', index=False, header=True)

# Compare the class distribution
train_distribution = train_data['label'].value_counts(normalize=True)
test_distribution = test_data['label'].value_counts(normalize=True)

# Create a DataFrame for comparison
comparison_df = pd.DataFrame({
    'Train Distribution': train_distribution,
    'Test Distribution': test_distribution
}).fillna(0)

# Save the comparison to a CSV file
comparison_df.to_csv('/content/sms_distribution_comparison.csv')

print("Training and testing datasets, along with the distribution comparison, have been saved successfully.")


Training and testing datasets, along with the distribution comparison, have been saved successfully.


In [23]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


--2024-09-23 04:21:46--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-09-23 04:21:46--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-09-23 04:21:46--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [24]:
!ls glove.6B.*


glove.6B.100d.txt  glove.6B.200d.txt  glove.6B.300d.txt  glove.6B.50d.txt  glove.6B.zip


In [25]:
glove_file_path = 'glove.6B.50d.txt'  # This should be correct if you're in the same directory


In [30]:
import os
if not os.path.exists(glove_file_path):
    print("GloVe embeddings file not found. Please check the path.")



In [32]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def get_text_embedding(text, embeddings_index, embedding_dim=50):
    tokens = preprocess_text(text).split()
    embedding_matrix = np.zeros((len(tokens), embedding_dim))

    for i, token in enumerate(tokens):
        if token in embeddings_index:
            embedding_matrix[i] = embeddings_index[token]

    return np.mean(embedding_matrix, axis=0) if len(tokens) > 0 else np.zeros(embedding_dim)



# Set the GloVe file path
glove_file_path = 'glove.6B.50d.txt'

# Check if the GloVe file exists
import os
if not os.path.exists(glove_file_path):
    print("GloVe embeddings file not found. Please check the path.")

# Load the dataset
data = pd.read_csv('/content/SMSSpamCollection.txt', sep='\t', header=None, names=['label', 'message'])

# Split the dataset into training and testing sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.5, random_state=42, stratify=data['label'])

# Load GloVe embeddings
embeddings_index = load_glove_embeddings(glove_file_path)

# Compute embeddings for the training and testing datasets
embedding_dim = 50
train_embeddings = np.array([get_text_embedding(msg, embeddings_index, embedding_dim) for msg in train_data['message']])
test_embeddings = np.array([get_text_embedding(msg, embeddings_index, embedding_dim) for msg in test_data['message']])

# Calculate the cosine similarity between all train and test embeddings
similarity_matrix = cosine_similarity(train_embeddings, test_embeddings)

# Display results
print("Cosine Similarity Matrix (Train vs Test):")
print(similarity_matrix)

# Optionally, find the most similar test message for each train message
most_similar_indices = np.argmax(similarity_matrix, axis=1)
for i, idx in enumerate(most_similar_indices):
    print(f"Most similar test message for train message {i}: Test Message {idx} (Similarity: {similarity_matrix[i][idx]:.4f})")


Cosine Similarity Matrix (Train vs Test):
[[0.93248694 0.9351238  0.87210509 ... 0.89615171 0.8419725  0.95252176]
 [0.95712393 0.89659058 0.90881943 ... 0.83965368 0.84068721 0.92627203]
 [0.89938875 0.89647643 0.90942237 ... 0.79316069 0.79990647 0.94749424]
 ...
 [0.93675367 0.88357635 0.84945493 ... 0.89229631 0.93099204 0.89490865]
 [0.93487073 0.93268167 0.92803437 ... 0.86212934 0.83636136 0.9780828 ]
 [0.87268569 0.8716495  0.83160126 ... 0.84288808 0.78528918 0.90332985]]
Most similar test message for train message 0: Test Message 2609 (Similarity: 0.9816)
Most similar test message for train message 1: Test Message 1157 (Similarity: 0.9668)
Most similar test message for train message 2: Test Message 1936 (Similarity: 0.9692)
Most similar test message for train message 3: Test Message 1334 (Similarity: 0.7240)
Most similar test message for train message 4: Test Message 849 (Similarity: 0.9682)
Most similar test message for train message 5: Test Message 418 (Similarity: 1.0000)


In [34]:
# Assuming similarity_matrix is already computed
# Convert cosine similarity values to percentages
similarity_percentage_matrix = similarity_matrix * 100

# Display the cosine similarity percentage matrix
print("Cosine Similarity Percentage Matrix (Train vs Test):")
print(similarity_percentage_matrix)

# Optionally, find the most similar test message for each train message
most_similar_indices = np.argmax(similarity_matrix, axis=1)
for i, idx in enumerate(most_similar_indices):
    print(f"Most similar test message for train message {i}: Test Message {idx} (Similarity: {similarity_percentage_matrix[i][idx]:.2f}%)")


Cosine Similarity Percentage Matrix (Train vs Test):
[[93.24869444 93.51237977 87.21050872 ... 89.61517099 84.19724955
  95.25217615]
 [95.71239324 89.65905787 90.88194299 ... 83.96536841 84.06872106
  92.62720321]
 [89.93887471 89.64764251 90.94223739 ... 79.31606869 79.9906475
  94.74942434]
 ...
 [93.67536726 88.35763455 84.94549323 ... 89.22963129 93.09920356
  89.49086451]
 [93.48707261 93.26816651 92.80343673 ... 86.212934   83.63613649
  97.80827978]
 [87.26856884 87.16494976 83.16012592 ... 84.28880837 78.52891828
  90.33298456]]
Most similar test message for train message 0: Test Message 2609 (Similarity: 98.16%)
Most similar test message for train message 1: Test Message 1157 (Similarity: 96.68%)
Most similar test message for train message 2: Test Message 1936 (Similarity: 96.92%)
Most similar test message for train message 3: Test Message 1334 (Similarity: 72.40%)
Most similar test message for train message 4: Test Message 849 (Similarity: 96.82%)
Most similar test message f

In [35]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_average_similarity(train_docs, test_docs):
    # Create a TfidfVectorizer to convert the documents to vectors
    vectorizer = TfidfVectorizer()

    # Fit and transform the training documents
    train_vectors = vectorizer.fit_transform(train_docs)

    # Transform the test documents into the same vector space
    test_vectors = vectorizer.transform(test_docs)

    # Calculate cosine similarity between all train and test vectors
    similarity_matrix = cosine_similarity(test_vectors, train_vectors)

    # Average the similarities for each test document
    average_similarities = similarity_matrix.mean(axis=1)

    # Convert to percentage
    average_similarity_percentage = average_similarities * 100

    return average_similarity_percentage

# Load the dataset
data = pd.read_csv('/content/SMSSpamCollection.txt', sep='\t', header=None, names=['label', 'message'])

# Split the dataset into training and testing sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.5, random_state=42, stratify=data['label'])

# Calculate average similarity percentage
train_messages = train_data['message'].tolist()
test_messages = test_data['message'].tolist()

average_similarity_percentage = calculate_average_similarity(train_messages, test_messages)

# Display results
for i, similarity in enumerate(average_similarity_percentage):
    print(f"Test Message {i}: Average Similarity with Train Set: {similarity:.2f}%")


Test Message 0: Average Similarity with Train Set: 3.16%
Test Message 1: Average Similarity with Train Set: 1.19%
Test Message 2: Average Similarity with Train Set: 1.18%
Test Message 3: Average Similarity with Train Set: 2.21%
Test Message 4: Average Similarity with Train Set: 4.11%
Test Message 5: Average Similarity with Train Set: 1.28%
Test Message 6: Average Similarity with Train Set: 3.03%
Test Message 7: Average Similarity with Train Set: 1.46%
Test Message 8: Average Similarity with Train Set: 2.69%
Test Message 9: Average Similarity with Train Set: 2.81%
Test Message 10: Average Similarity with Train Set: 2.36%
Test Message 11: Average Similarity with Train Set: 2.46%
Test Message 12: Average Similarity with Train Set: 1.38%
Test Message 13: Average Similarity with Train Set: 1.96%
Test Message 14: Average Similarity with Train Set: 2.03%
Test Message 15: Average Similarity with Train Set: 2.35%
Test Message 16: Average Similarity with Train Set: 2.39%
Test Message 17: Average

In [36]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def overall_cosine_similarity_percentage(train_docs, test_docs):
    # Create a TfidfVectorizer to convert the documents to vectors
    vectorizer = TfidfVectorizer()

    # Fit and transform the training documents
    train_vectors = vectorizer.fit_transform(train_docs)

    # Transform the test documents into the same vector space
    test_vectors = vectorizer.transform(test_docs)

    # Calculate cosine similarity between all train and test vectors
    similarity_matrix = cosine_similarity(test_vectors, train_vectors)

    # Average the similarities across all test and train documents
    overall_similarity = similarity_matrix.mean()

    # Convert to percentage
    overall_similarity_percentage = overall_similarity * 100

    return overall_similarity_percentage

# Load the dataset
data = pd.read_csv('/content/SMSSpamCollection.txt', sep='\t', header=None, names=['label', 'message'])

# Split the dataset into training and testing sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.5, random_state=42, stratify=data['label'])

# Calculate overall cosine similarity percentage
train_messages = train_data['message'].tolist()
test_messages = test_data['message'].tolist()

overall_similarity_percentage = overall_cosine_similarity_percentage(train_messages, test_messages)

# Display the overall similarity percentage
print(f"The overall cosine similarity between train and test datasets is: {overall_similarity_percentage:.2f}%")


The overall cosine similarity between train and test datasets is: 2.00%
