In [None]:
import os
import zipfile

# Folder that receives the generated corpus; created on demand.
base_path = "/mnt/data/text_similarity_dataset"
os.makedirs(base_path, exist_ok=True)

# Corpus keyed by file name. The name suffix records how each text relates to
# the originals: *_original, *_modified (light rewording), *_paraphrased
# (heavier rewording) and *_new (fresh statements on related topics).
documents = {
    "doc1_original.txt": "Artificial Intelligence is transforming education by enabling personalized learning experiences. AI systems can analyze student performance and adapt content to individual needs.",
    "doc2_original.txt": "AI-based tools help teachers by automating grading, tracking attendance, and identifying students who need additional support.",
    "doc3_original.txt": "Educational institutions are using artificial intelligence to enhance online learning platforms and improve student engagement.",
    "doc4_original.txt": "Machine learning algorithms can predict student outcomes by analyzing historical academic data and learning patterns.",
    "doc5_original.txt": "Artificial Intelligence reduces administrative workload in schools, allowing educators to focus more on teaching.",
    "doc6_modified.txt": "Artificial Intelligence is changing education by providing personalized learning experiences. AI systems analyze student performance and adjust content to meet individual needs.",
    "doc7_modified.txt": "AI-powered tools assist teachers by automating grading tasks, monitoring attendance, and identifying students requiring extra support.",
    "doc8_modified.txt": "Educational institutions use artificial intelligence to improve online learning platforms and increase student engagement.",
    "doc9_paraphrased.txt": "AI technology allows learning systems to customize educational content based on each student’s strengths and weaknesses.",
    "doc10_paraphrased.txt": "By studying past academic records, machine learning models can estimate future student performance and learning outcomes.",
    "doc11_paraphrased.txt": "Schools benefit from artificial intelligence by minimizing paperwork and giving teachers more time to focus on instruction.",
    "doc12_new.txt": "The integration of AI into educational systems is creating dynamic learning environments that cater to diverse student populations.",
    "doc13_new.txt": "Smart algorithms are now capable of recommending personalized learning paths, enhancing student motivation and retention.",
    "doc14_new.txt": "Virtual reality and augmented reality, powered by AI, offer immersive educational experiences that were previously unimaginable.",
    "doc15_new.txt": "Ethical considerations in AI development for education emphasize fairness, transparency, and data privacy to protect student information."
}

# Write each corpus entry to its own file under base_path, reporting every path.
for filename in documents:
    content = documents[filename]
    file_path = os.path.join(base_path, filename)
    with open(file_path, "w") as f:
        f.write(content)
    print(f"Created {file_path}")

# Markdown description that is shipped alongside the corpus files.
readme_content = """
# Text Similarity Dataset

This dataset contains 15 text documents related to Artificial Intelligence in education.

### Document Categories:
- `doc1_original.txt` to `doc5_original.txt`: Original statements about AI in education.
- `doc6_modified.txt` to `doc8_modified.txt`: Slightly rephrased versions of original statements.
- `doc9_paraphrased.txt` to `doc11_paraphrased.txt`: More significantly paraphrased versions.
- `doc12_new.txt` to `doc15_new.txt`: New statements on related topics.

These documents can be used for tasks such as:
- Text similarity analysis
- Semantic search
- Paraphrase detection
- Topic modeling

Created for demonstration purposes.
"""

# The README sits in the same folder as the documents it describes.
readme_file_path = os.path.join(base_path, "README.md")
with open(readme_file_path, "w") as f:
    f.write(readme_content)
print(f"Created {readme_file_path}")

# Zip the files for download.
zip_filename = "text_similarity_dataset.zip"
# The archive is written to /mnt/data rather than into base_path, so the
# os.walk(base_path) below can never pick up the archive it is writing.
zip_file_path = os.path.join("/mnt/data", zip_filename)

# ZIP_DEFLATED is requested explicitly: ZipFile's default (ZIP_STORED) would
# bundle the text files without compressing them.
with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for root, _, files in os.walk(base_path):
        # os.walk yields names in arbitrary order; sorting makes the member
        # order of the archive reproducible from run to run.
        for file in sorted(files):
            file_path = os.path.join(root, file)
            # Path inside the archive, relative to base_path's parent, so every
            # member is stored under a top-level "text_similarity_dataset/" folder.
            arcname = os.path.relpath(file_path, os.path.dirname(base_path))
            zipf.write(file_path, arcname)
print(f"Created {zip_file_path}")
print("Dataset created and zipped successfully!")

Created /mnt/data/text_similarity_dataset/doc1_original.txt
Created /mnt/data/text_similarity_dataset/doc2_original.txt
Created /mnt/data/text_similarity_dataset/doc3_original.txt
Created /mnt/data/text_similarity_dataset/doc4_original.txt
Created /mnt/data/text_similarity_dataset/doc5_original.txt
Created /mnt/data/text_similarity_dataset/doc6_modified.txt
Created /mnt/data/text_similarity_dataset/doc7_modified.txt
Created /mnt/data/text_similarity_dataset/doc8_modified.txt
Created /mnt/data/text_similarity_dataset/doc9_paraphrased.txt
Created /mnt/data/text_similarity_dataset/doc10_paraphrased.txt
Created /mnt/data/text_similarity_dataset/doc11_paraphrased.txt
Created /mnt/data/text_similarity_dataset/doc12_new.txt
Created /mnt/data/text_similarity_dataset/doc13_new.txt
Created /mnt/data/text_similarity_dataset/doc14_new.txt
Created /mnt/data/text_similarity_dataset/doc15_new.txt
Created /mnt/data/text_similarity_dataset/README.md
Created /mnt/data/text_similarity_dataset.zip
Dataset

# **STEP 1 — Preprocess**

In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer

# Make sure every NLTK resource this notebook relies on is available locally.
# Each entry is (path probed with nltk.data.find, package name for nltk.download):
#   - stopwords : English stop-word list used by the preprocessing step
#   - punkt     : tokenizer models for word_tokenize
#   - punkt_tab : additionally requested by word_tokenize's LookupError
#                 traceback even when 'punkt' is already installed
#   - wordnet   : corpus queried by the WordNet similarity step (and by the
#                 optional lemmatization); previously its download was
#                 commented out, so that step depended on a manual download
required_nltk_data = [
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/wordnet', 'wordnet'),
]

for resource_path, package_name in required_nltk_data:
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Only download when the probe fails. If the probe misses a resource
        # that is in fact installed, nltk.download just re-checks that package.
        nltk.download(package_name)

print("NLTK data (stopwords, punkt, punkt_tab, wordnet) checked and downloaded if necessary.")

Explicitly attempting to download 'punkt_tab' as per traceback suggestion.


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


NLTK data (stopwords, punkt, punkt_tab) checked and downloaded if necessary.


### Preprocessing Function

This function will perform the following steps:
1.  **Lowercase**: Convert all text to lowercase.
2.  **Punctuation Removal**: Remove all punctuation marks.
3.  **Tokenization**: Split the text into individual words.
4.  **Stopword Removal**: Remove common English stopwords.
5.  **(Optional) Lemmatization**: Convert words to their base form (commented out by default, uncomment to enable).

In [None]:
def preprocess_text(text, *, lemmatize=False):
    """Normalise raw text into a space-separated string of content tokens.

    Pipeline: lowercase -> delete ASCII punctuation -> tokenize with
    ``word_tokenize`` -> drop English stopwords -> (optionally) lemmatize.

    Args:
        text: Raw document text.
        lemmatize: When True, each remaining token is passed through
            ``WordNetLemmatizer.lemmatize`` with its default part of speech.
            This needs the NLTK 'wordnet' corpus to be installed. Defaults to
            False, which reproduces the previous behaviour exactly.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # 1. Lowercase
    text = text.lower()

    # 2. Punctuation removal. Characters in string.punctuation are deleted,
    #    not replaced by a space, so "ai-based" becomes "aibased". Characters
    #    outside that ASCII set (e.g. a typographic apostrophe) are kept.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # 3. Tokenization
    tokens = word_tokenize(text)

    # 4. Stopword removal
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Optional lemmatization. The import is local so that the default path
    #    has no extra dependency on the lemmatizer or the WordNet corpus.
    if lemmatize:
        from nltk.stem import WordNetLemmatizer

        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

print("Preprocessing function defined.")

Preprocessing function defined.


### Apply Preprocessing to Documents

Now, let's apply the `preprocess_text` function to all the documents and store the preprocessed versions.

In [None]:
import os

# Relies on base_path and preprocess_text from earlier cells. The texts are
# read back from the files on disk, not taken from the in-memory dictionary.

raw_documents = {}
preprocessed_documents = {}

# Load every .txt file in base_path, keeping the raw and the cleaned version.
for filename in os.listdir(base_path):
    if not filename.endswith('.txt'):
        continue  # anything that is not a .txt file (e.g. README.md) is ignored
    file_path = os.path.join(base_path, filename)
    with open(file_path, 'r') as f:
        content = f.read()
    raw_documents[filename] = content
    preprocessed_documents[filename] = preprocess_text(content)

print(f"Successfully preprocessed {len(preprocessed_documents)} documents.")

# Show one document before and after cleaning as a sanity check.
sample_filename = "doc1_original.txt"
if sample_filename not in raw_documents:
    print(f"Sample file {sample_filename} not found in the loaded documents.")
else:
    print(f"\n--- Original Content of {sample_filename} ---")
    print(raw_documents[sample_filename])
    print(f"\n--- Preprocessed Content of {sample_filename} ---")
    print(preprocessed_documents[sample_filename])

Successfully preprocessed 15 documents.

--- Original Content of doc1_original.txt ---
Artificial Intelligence is transforming education by enabling personalized learning experiences. AI systems can analyze student performance and adapt content to individual needs.

--- Preprocessed Content of doc1_original.txt ---
artificial intelligence transforming education enabling personalized learning experiences ai systems analyze student performance adapt content individual needs


# **STEP 2 — Feature Representation**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd

# Freeze a sorted file-name order so matrix rows always map to the same document.
document_filenames = sorted(preprocessed_documents)
preprocessed_texts = [preprocessed_documents[name] for name in document_filenames]

### TF-IDF features (consumed by the cosine-similarity step)
print("\n--- Generating TF-IDF Features ---")
tfidf_vectorizer = TfidfVectorizer()
# Learn the vocabulary/IDF weights and vectorize the corpus in one pass.
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_texts)

print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")
print("First 5 TF-IDF features for the first document:")
# Dense, labelled copy of the sparse matrix purely for inspection.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=document_filenames,
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print(tfidf_df.iloc[0].head().to_string())

### Bag-of-Words features (raw term counts)
print("\n--- Generating Bag-of-Words Features ---")
cv_vectorizer = CountVectorizer()
cv_matrix = cv_vectorizer.fit_transform(preprocessed_texts)

print(f"Bag-of-Words Matrix Shape: {cv_matrix.shape}")
print("First 5 Bag-of-Words features for the first document:")
# Same labelled layout as tfidf_df, holding counts instead of weights.
cv_df = pd.DataFrame(
    data=cv_matrix.toarray(),
    index=document_filenames,
    columns=cv_vectorizer.get_feature_names_out(),
)
print(cv_df.iloc[0].head().to_string())

print("Feature representation complete.")


--- Generating TF-IDF Features ---
TF-IDF Matrix Shape: (15, 116)
First 5 TF-IDF features for the first document:
academic          0.274958
adapt             0.000000
additional        0.000000
adjust            0.000000
administrative    0.000000

--- Generating Bag-of-Words Features ---
Bag-of-Words Matrix Shape: (15, 116)
First 5 Bag-of-Words features for the first document:
academic          1
adapt             0
additional        0
adjust            0
administrative    0
Feature representation complete.


# **STEP 3 — Cosine Similarity**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

print("\n--- Calculating Cosine Similarity ---")

# Pairwise cosine similarity of every document against every other one.
# With a single argument, scikit-learn compares the rows of tfidf_matrix
# (built in the feature-representation step) with themselves.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Label rows and columns with file names so scores can be looked up by document.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim_matrix,
    index=document_filenames,
    columns=document_filenames,
)

print("Cosine Similarity Matrix (first 5x5 entries):")
display(cosine_sim_df.head())



--- Calculating Cosine Similarity ---
Cosine Similarity Matrix (first 5x5 entries):


Unnamed: 0,doc10_paraphrased.txt,doc11_paraphrased.txt,doc12_new.txt,doc13_new.txt,doc14_new.txt,doc15_new.txt,doc1_original.txt,doc2_original.txt,doc3_original.txt,doc4_original.txt,doc5_original.txt,doc6_modified.txt,doc7_modified.txt,doc8_modified.txt,doc9_paraphrased.txt
doc10_paraphrased.txt,1.0,0.0,0.070376,0.069993,0.0,0.019108,0.124234,0.0,0.076691,0.35838,0.0,0.118579,0.0,0.076691,0.072053
doc11_paraphrased.txt,0.0,1.0,0.0,0.0,0.0,0.0,0.074081,0.061331,0.087654,0.0,0.26,0.070709,0.058692,0.087654,0.0
doc12_new.txt,0.070376,0.0,1.0,0.048908,0.073873,0.056628,0.134916,0.0,0.105497,0.074579,0.0,0.128775,0.0,0.105497,0.19875
doc13_new.txt,0.069993,0.0,0.048908,1.0,0.0,0.020364,0.108365,0.0,0.053297,0.159553,0.0,0.103433,0.0,0.053297,0.050073
doc14_new.txt,0.0,0.0,0.073873,0.0,1.0,0.0303,0.084629,0.0,0.043505,0.0,0.0,0.080777,0.0,0.043505,0.075633


### Highlighting Top 5 Most Similar Document Pairs

Since the cosine similarity matrix is symmetric and its diagonal entries (each document's similarity with itself) are always 1, we extract only the unique off-diagonal pairs (e.g., `docA` vs `docB`, but not `docB` vs `docA` or `docA` vs `docA`).

In [None]:
# Collect each unordered document pair once: only the part of the matrix above
# the diagonal is visited, so mirrored pairs and self-comparisons are left out.
num_docs = len(document_filenames)
similarity_pairs = [
    ((first_doc, second_doc), cosine_sim_df.loc[first_doc, second_doc])
    for position, first_doc in enumerate(document_filenames)
    for second_doc in document_filenames[position + 1:]
]

# Highest score first.
similarity_pairs.sort(key=lambda entry: entry[1], reverse=True)

print("\n--- Top 5 Most Similar Document Pairs (Cosine Similarity) ---")
for pair, score in similarity_pairs[:5]:
    print(f"Document Pair: {pair[0]} - {pair[1]}, Similarity Score: {score:.4f}")

print("\n--- Interpretation of Scores ---")
print("Cosine similarity measures the cosine of the angle between two non-zero vectors. In text analysis, these vectors are usually TF-IDF (or word count) vectors. A score of:")
print("- 1.0 indicates identical content (or very similar meaning and word usage).")
print("- 0.0 indicates no common terms (completely dissimilar).")
print("- Values between 0.0 and 1.0 indicate varying degrees of similarity.")
print("The higher the score, the more similar the documents are in terms of their semantic content (based on shared words and their importance).")


--- Top 5 Most Similar Document Pairs (Cosine Similarity) ---
Document Pair: doc3_original.txt - doc8_modified.txt, Similarity Score: 0.7266
Document Pair: doc1_original.txt - doc6_modified.txt, Similarity Score: 0.6749
Document Pair: doc2_original.txt - doc7_modified.txt, Similarity Score: 0.5171
Document Pair: doc10_paraphrased.txt - doc4_original.txt, Similarity Score: 0.3584
Document Pair: doc11_paraphrased.txt - doc5_original.txt, Similarity Score: 0.2600

--- Interpretation of Scores ---
Cosine similarity measures the cosine of the angle between two non-zero vectors. In text analysis, these vectors are usually TF-IDF (or word count) vectors. A score of:
- 1.0 indicates identical content (or very similar meaning and word usage).
- 0.0 indicates no common terms (completely dissimilar).
- Values between 0.0 and 1.0 indicate varying degrees of similarity.
The higher the score, the more similar the documents are in terms of their semantic content (based on shared words and their impo

# **STEP 4 — Jaccard Similarity**

In [None]:
import pandas as pd

print("\n--- Calculating Jaccard Similarity ---")

def jaccard_similarity(text1, text2):
    """Return the Jaccard index of the whitespace-separated word sets of two texts.

    The score is |A ∩ B| / |A ∪ B|, where A and B are the sets of distinct
    words in ``text1`` and ``text2``. Two texts without any words score 1.0;
    if exactly one of them has no words the score is 0.0.
    """
    vocab_a, vocab_b = set(text1.split()), set(text2.split())

    # Neither text contains a word: treated as identical.
    if not (vocab_a or vocab_b):
        return 1.0
    # Exactly one side is empty: nothing can overlap.
    if not (vocab_a and vocab_b):
        return 0.0

    return len(vocab_a & vocab_b) / len(vocab_a | vocab_b)

# Full pairwise Jaccard matrix: one row per document, in document_filenames order.
num_docs = len(document_filenames)
jaccard_sim_matrix = [
    [
        jaccard_similarity(preprocessed_documents[row_doc],
                           preprocessed_documents[col_doc])
        for col_doc in document_filenames
    ]
    for row_doc in document_filenames
]

# Label both axes with file names so scores can be looked up by document.
jaccard_sim_df = pd.DataFrame(
    data=jaccard_sim_matrix,
    index=document_filenames,
    columns=document_filenames,
)

print("Jaccard Similarity Matrix (first 5x5 entries):")
display(jaccard_sim_df.head())

# Collect each unordered pair once (entries above the diagonal only), which
# leaves out mirrored pairs and self-comparisons.
jaccard_similarity_pairs = [
    ((first_doc, second_doc), jaccard_sim_df.loc[first_doc, second_doc])
    for position, first_doc in enumerate(document_filenames)
    for second_doc in document_filenames[position + 1:]
]

# Highest score first.
jaccard_similarity_pairs.sort(key=lambda entry: entry[1], reverse=True)

print("\n--- Top 5 Most Similar Document Pairs (Jaccard Similarity) ---")
for pair, score in jaccard_similarity_pairs[:5]:
    print(f"Document Pair: {pair[0]} - {pair[1]}, Similarity Score: {score:.4f}")

print("\n--- Interpretation of Scores ---")
print("Jaccard Similarity measures the ratio of the intersection to the union of two sets of words. A score of:")
print("- 1.0 indicates that the two documents share all of their unique words (identical word sets).")
print("- 0.0 indicates that the two documents share no unique words (completely distinct word sets).")
print("- Values between 0.0 and 1.0 indicate the proportion of shared unique words.")
print("The higher the score, the greater the overlap in the unique vocabulary of the two documents.")


--- Calculating Jaccard Similarity ---
Jaccard Similarity Matrix (first 5x5 entries):


Unnamed: 0,doc10_paraphrased.txt,doc11_paraphrased.txt,doc12_new.txt,doc13_new.txt,doc14_new.txt,doc15_new.txt,doc1_original.txt,doc2_original.txt,doc3_original.txt,doc4_original.txt,doc5_original.txt,doc6_modified.txt,doc7_modified.txt,doc8_modified.txt,doc9_paraphrased.txt
doc10_paraphrased.txt,1.0,0.0,0.090909,0.095238,0.0,0.041667,0.115385,0.0,0.090909,0.277778,0.0,0.111111,0.0,0.090909,0.086957
doc11_paraphrased.txt,0.0,1.0,0.0,0.0,0.0,0.0,0.076923,0.043478,0.095238,0.0,0.235294,0.074074,0.041667,0.095238,0.0
doc12_new.txt,0.090909,0.0,1.0,0.095238,0.095238,0.086957,0.16,0.0,0.142857,0.095238,0.0,0.153846,0.0,0.142857,0.25
doc13_new.txt,0.095238,0.0,0.095238,1.0,0.0,0.043478,0.12,0.0,0.095238,0.157895,0.0,0.115385,0.0,0.095238,0.090909
doc14_new.txt,0.0,0.0,0.095238,0.0,1.0,0.043478,0.076923,0.0,0.045455,0.0,0.0,0.074074,0.0,0.045455,0.090909



--- Top 5 Most Similar Document Pairs (Jaccard Similarity) ---
Document Pair: doc3_original.txt - doc8_modified.txt, Similarity Score: 0.7143
Document Pair: doc1_original.txt - doc6_modified.txt, Similarity Score: 0.6667
Document Pair: doc2_original.txt - doc7_modified.txt, Similarity Score: 0.4211
Document Pair: doc10_paraphrased.txt - doc4_original.txt, Similarity Score: 0.2778
Document Pair: doc12_new.txt - doc9_paraphrased.txt, Similarity Score: 0.2500

--- Interpretation of Scores ---
Jaccard Similarity measures the ratio of the intersection to the union of two sets of words. A score of:
- 1.0 indicates that the two documents share all of their unique words (identical word sets).
- 0.0 indicates that the two documents share no unique words (completely distinct word sets).
- Values between 0.0 and 1.0 indicate the proportion of shared unique words.
The higher the score, the greater the overlap in the unique vocabulary of the two documents.


# **STEP 5 — WordNet Semantic Similarity**

In [None]:
import os

# Relies on base_path and preprocess_text from earlier cells. The texts are
# read back from the files on disk and, unlike in STEP 1, every cleaned token
# is additionally lemmatized here so that the output matches what this cell
# reports ("with lemmatization").
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
try:
    # Probe call: forces the lazily loaded WordNet corpus to be opened now.
    lemmatizer.lemmatize("documents")
except LookupError:
    # The corpus is missing locally; fetch it before processing the documents.
    nltk.download('wordnet')

preprocessed_documents = {}
raw_documents = {}

# Read content from the files, preprocess it, then lemmatize token by token.
for filename in os.listdir(base_path):
    if filename.endswith('.txt'): # Only process text files
        file_path = os.path.join(base_path, filename)
        with open(file_path, 'r') as f:
            content = f.read()
        raw_documents[filename] = content
        # preprocess_text returns space-joined tokens, so split() recovers them.
        cleaned_tokens = preprocess_text(content).split()
        preprocessed_documents[filename] = ' '.join(
            lemmatizer.lemmatize(token) for token in cleaned_tokens
        )

print(f"Successfully preprocessed {len(preprocessed_documents)} documents with lemmatization.")

# Display original and preprocessed content for a sample document
sample_filename = "doc1_original.txt"
if sample_filename in raw_documents:
    print(f"\n--- Original Content of {sample_filename} ---")
    print(raw_documents[sample_filename])
    print(f"\n--- Preprocessed Content of {sample_filename} (with lemmatization) ---")
    print(preprocessed_documents[sample_filename])
else:
    print(f"Sample file {sample_filename} not found in the loaded documents.")

Successfully preprocessed 15 documents with lemmatization.

--- Original Content of doc1_original.txt ---
Artificial Intelligence is transforming education by enabling personalized learning experiences. AI systems can analyze student performance and adapt content to individual needs.

--- Preprocessed Content of doc1_original.txt (with lemmatization) ---
artificial intelligence transforming education enabling personalized learning experience ai system analyze student performance adapt content individual need


**Reasoning**:
Now that the documents are preprocessed with lemmatization, the next step is to implement a WordNet-based semantic similarity function to compare document pairs. This involves defining a function that leverages NLTK's WordNet capabilities to calculate similarity between words and then aggregates these scores for full documents.



In [None]:
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

def get_wordnet_synsets(word):
    """Look up *word* in WordNet and return the list of its synsets.

    Callers in this notebook treat a falsy result as "word has no synsets".
    """
    synsets = wn.synsets(word)
    return synsets

def document_wordnet_similarity(doc1_text, doc2_text):
    """Calculate a basic WordNet semantic similarity between two documents.

    Both texts are tokenized with ``word_tokenize``; tokens without any
    WordNet synset are ignored. For every remaining (token1, token2) pair the
    word-level score is the highest ``path_similarity`` over all synset
    combinations of the two words (0.0 when no combination yields a value).
    The document score is the mean of these word-level scores over all pairs.

    Args:
        doc1_text: Text of the first document.
        doc2_text: Text of the second document.

    Returns:
        The averaged similarity as a float, or 0.0 when either document has
        no token with at least one synset.
    """
    # Query WordNet once per token instead of once per token *pair*: the
    # synsets of a word do not depend on which word it is compared with.
    synsets_per_token1 = [
        synsets for synsets in map(get_wordnet_synsets, word_tokenize(doc1_text))
        if synsets
    ]
    synsets_per_token2 = [
        synsets for synsets in map(get_wordnet_synsets, word_tokenize(doc2_text))
        if synsets
    ]

    if not synsets_per_token1 or not synsets_per_token2:
        return 0.0  # at least one document has no token known to WordNet

    similarity_scores = []

    # Compare every usable word of doc1 with every usable word of doc2.
    for syns1 in synsets_per_token1:
        for syns2 in synsets_per_token2:
            # Best score over all sense combinations of the two words.
            max_sim = 0.0
            for s1 in syns1:
                for s2 in syns2:
                    sim = s1.path_similarity(s2)
                    # path_similarity can return None; such pairs are skipped.
                    if sim is not None and sim > max_sim:
                        max_sim = sim
            similarity_scores.append(max_sim)

    # Both token lists are non-empty here, so there is at least one score.
    return sum(similarity_scores) / len(similarity_scores)

print("WordNet semantic similarity function defined.")

WordNet semantic similarity function defined.


**Reasoning**:
Now that the WordNet semantic similarity function is defined, I will apply it to all document pairs to calculate their semantic similarity, store these scores in a DataFrame, and then identify and display the top 5 most similar pairs, along with an interpretation of the scores, as required by the task.



In [None]:
import pandas as pd

print("\n--- Calculating WordNet Semantic Similarity ---")

# Fix the document order once so matrix rows, columns and pair names line up.
# NOTE: the sort is lexicographic, so "doc10_..." comes before "doc1_...".
document_filenames = sorted(preprocessed_documents.keys())
num_docs = len(document_filenames)

# Full similarity matrix as a list of rows (one row per document)
wordnet_sim_matrix = []

# (pair, score) entries for each unordered pair of distinct documents,
# used for ranking below
wordnet_similarity_pairs = []

for i, doc1_name in enumerate(document_filenames):
    doc1_text = preprocessed_documents[doc1_name]
    row = []
    for j, doc2_name in enumerate(document_filenames):
        doc2_text = preprocessed_documents[doc2_name]

        # Every cell is computed explicitly (no mirroring of the lower
        # triangle), so the matrix reflects the score for that exact
        # argument order.
        score = document_wordnet_similarity(doc1_text, doc2_text)
        row.append(score)

        # Store unique pairs (j > i to avoid duplicates and self-similarity)
        if j > i:
            wordnet_similarity_pairs.append(((doc1_name, doc2_name), score))
    wordnet_sim_matrix.append(row)

# Convert to DataFrame for better readability
wordnet_sim_df = pd.DataFrame(wordnet_sim_matrix,
                              index=document_filenames,
                              columns=document_filenames)

print("WordNet Semantic Similarity Matrix (first 5x5 entries):")
# Slice rows AND columns so the preview really is the 5x5 corner announced
# above; .head() alone would show five rows with every column.
display(wordnet_sim_df.iloc[:5, :5])

# Sort pairs by similarity score in descending order
wordnet_similarity_pairs.sort(key=lambda x: x[1], reverse=True)

print("\n--- Top 5 Most Similar Document Pairs (WordNet Semantic Similarity) ---")
for pair, score in wordnet_similarity_pairs[:5]:
    print(f"Document Pair: {pair[0]} - {pair[1]}, Similarity Score: {score:.4f}")

print("\n--- Interpretation of Scores ---")
print("WordNet semantic similarity (using path similarity) measures the shortest path between concepts (synsets) in the WordNet hierarchy. A score of:")
print("- 1.0 indicates very high semantic relatedness (often direct synonyms or very closely related concepts).")
print("- 0.0 indicates no semantic path found, suggesting words are entirely unrelated within the WordNet hierarchy, or one of the documents is empty of words with synsets.")
print("- Values between 0.0 and 1.0 indicate varying degrees of semantic closeness, where a higher score means greater semantic similarity based on shared conceptual meaning rather than just shared words.")
print("This method can capture relationships between words that are not explicitly shared but are semantically related (e.g., 'car' and 'vehicle').")



--- Calculating WordNet Semantic Similarity ---
WordNet Semantic Similarity Matrix (first 5x5 entries):


Unnamed: 0,doc10_paraphrased.txt,doc11_paraphrased.txt,doc12_new.txt,doc13_new.txt,doc14_new.txt,doc15_new.txt,doc1_original.txt,doc2_original.txt,doc3_original.txt,doc4_original.txt,doc5_original.txt,doc6_modified.txt,doc7_modified.txt,doc8_modified.txt,doc9_paraphrased.txt
doc10_paraphrased.txt,0.318306,0.204419,0.22002,0.217637,0.236689,0.163271,0.250674,0.218996,0.225734,0.261123,0.22068,0.249736,0.210364,0.231589,0.22353
doc11_paraphrased.txt,0.204419,0.241621,0.169241,0.172046,0.192426,0.15191,0.207706,0.190039,0.18853,0.176937,0.214335,0.211391,0.182006,0.193571,0.176926
doc12_new.txt,0.22002,0.169241,0.236103,0.179439,0.213024,0.150536,0.210844,0.189897,0.189922,0.194698,0.177892,0.215553,0.179828,0.194774,0.207034
doc13_new.txt,0.217637,0.172046,0.179439,0.238983,0.193297,0.141651,0.209182,0.197134,0.193686,0.198026,0.183206,0.211165,0.185167,0.194579,0.183071
doc14_new.txt,0.236689,0.192426,0.213024,0.193297,0.326512,0.157681,0.242911,0.204085,0.207896,0.203462,0.199047,0.249347,0.196651,0.215014,0.220943



--- Top 5 Most Similar Document Pairs (WordNet Semantic Similarity) ---
Document Pair: doc1_original.txt - doc6_modified.txt, Similarity Score: 0.2665
Document Pair: doc10_paraphrased.txt - doc4_original.txt, Similarity Score: 0.2611
Document Pair: doc10_paraphrased.txt - doc1_original.txt, Similarity Score: 0.2507
Document Pair: doc10_paraphrased.txt - doc6_modified.txt, Similarity Score: 0.2497
Document Pair: doc14_new.txt - doc6_modified.txt, Similarity Score: 0.2493

--- Interpretation of Scores ---
WordNet semantic similarity (using path similarity) measures the shortest path between concepts (synsets) in the WordNet hierarchy. A score of:
- 1.0 indicates very high semantic relatedness (often direct synonyms or very closely related concepts).
- 0.0 indicates no semantic path found, suggesting words are entirely unrelated within the WordNet hierarchy, or one of the documents is empty of words with synsets.
- Values between 0.0 and 1.0 indicate varying degrees of semantic closeness

# **Step 6 - Comparison Section**

### Comparative Analysis of Similarity Metrics

--- Which similarity metric detected copying best? ---
Cosine Similarity, driven by TF-IDF, and Jaccard Similarity proved most effective at detecting direct textual copying and high lexical similarity. For instance, both metrics assigned their highest scores to the `_original.txt` and `_modified.txt` pairs, such as `doc3_original.txt` and `doc8_modified.txt` (Cosine: 0.7266, Jaccard: 0.7143). These high scores accurately reflect that the modified documents were essentially rephrased versions of the originals, maintaining significant lexical overlap.

--- When did Jaccard fail? ---
Jaccard Similarity's efficacy is directly tied to the exact lexical overlap between documents. It didn't necessarily 'fail' but showed its limitations when documents conveyed similar meanings using a more diverse vocabulary. For instance, with more heavily paraphrased content like `doc10_paraphrased.txt` and `doc4_original.txt`, Jaccard yielded a score of 0.2778, which is lower than the scores for directly modified pairs. This indicates that while the meaning might be similar, the unique word sets had less direct intersection.

--- When did WordNet help? ---
WordNet Semantic Similarity is designed to capture conceptual relationships even when direct lexical overlap is minimal. While its absolute scores were generally lower for the 'modified' and 'paraphrased' documents compared to Cosine and Jaccard (e.g., `doc1_original.txt` - `doc6_modified.txt` at 0.2665), it could be particularly helpful in scenarios where synonyms or semantically related terms are used. It can identify thematic connections, such as between `doc10_paraphrased.txt` and `doc1_original.txt` (0.2507), by linking terms like 'student performance' and 'learning' through their underlying semantic networks, even if the phrasing differs considerably.

--- Any false positives? ---
Based on the top 5 results for each metric, there were no clear 'false positives' where highly dissimilar documents received high similarity scores. The top three pairs identified by Cosine and Jaccard were genuinely lexically similar (original vs. modified versions). WordNet's top scores were relatively modest (roughly 0.25-0.27), meaning it wasn't incorrectly flagging unrelated documents as highly similar. However, the interpretation of these lower WordNet scores as 'strong matches' depends on the specific application's threshold for semantic similarity.

## **STEP 7 — Lab Report**

# Task
The task is to generate a comprehensive lab report summarizing the objective, dataset, preprocessing steps, results from each similarity metric, comparative analysis, and conclusion of the text similarity project.

Here's the lab report:

## Lab Report: Text Similarity Analysis

### 1. Objective

The primary objective of this lab was to explore and compare various text similarity metrics—namely Cosine Similarity (using TF-IDF), Jaccard Similarity (using Bag-of-Words), and WordNet Semantic Similarity—in their ability to identify relatedness between a set of documents. The goal was to understand the strengths and limitations of each method in detecting direct copying, rephrasing, and semantic relatedness.

### 2. Dataset Description

A synthetic dataset comprising 15 text documents related to "Artificial Intelligence in Education" was created. The dataset was structured into distinct categories to facilitate a robust comparison of similarity metrics:
*   **`doc1_original.txt` to `doc5_original.txt`**: Original statements.
*   **`doc6_modified.txt` to `doc8_modified.txt`**: Slightly rephrased versions of original statements, intended to have high lexical overlap.
*   **`doc9_paraphrased.txt` to `doc11_paraphrased.txt`**: More significantly paraphrased versions, expected to have less direct lexical overlap while retaining the same meaning.
*   **`doc12_new.txt` to `doc15_new.txt`**: New statements on related topics, designed to test broader semantic relatedness.

The dataset was zipped as `text_similarity_dataset.zip` for portability.

### 3. Preprocessing Steps

Before calculating similarity, all text documents underwent a series of preprocessing steps:
1.  **Lowercasing**: All text was converted to lowercase to ensure consistency and prevent variations in capitalization from being treated as different words.
2.  **Punctuation Removal**: All punctuation marks were removed to focus on the lexical content.
3.  **Tokenization**: Text was split into individual words (tokens).
4.  **Stopword Removal**: Common English stopwords (e.g., "the," "is," "a") were removed to reduce noise and focus on more meaningful terms.
5.  **(Optional) Lemmatization**: For WordNet Semantic Similarity, an additional lemmatization step was performed to reduce words to their base forms (e.g., "running" to "run") to improve semantic matching.

### 4. Similarity Metric Results

#### 4.1. Cosine Similarity (TF-IDF)

**Description**: Cosine similarity measures the cosine of the angle between two non-zero TF-IDF vectors. TF-IDF (Term Frequency-Inverse Document Frequency) weights terms by their importance in a document relative to the entire corpus. A score of 1.0 indicates identical content, 0.0 indicates no common terms.

**Top 5 Most Similar Document Pairs**:
*   Document Pair: `doc3_original.txt` - `doc8_modified.txt`, Similarity Score: 0.7266
*   Document Pair: `doc1_original.txt` - `doc6_modified.txt`, Similarity Score: 0.6749
*   Document Pair: `doc2_original.txt` - `doc7_modified.txt`, Similarity Score: 0.5171
*   Document Pair: `doc10_paraphrased.txt` - `doc4_original.txt`, Similarity Score: 0.3584
*   Document Pair: `doc11_paraphrased.txt` - `doc5_original.txt`, Similarity Score: 0.2600

**Interpretation**: Cosine Similarity performed well in identifying documents with significant lexical overlap, as evidenced by the high scores between original and modified documents. The TF-IDF weighting helps emphasize unique and important terms.

#### 4.2. Jaccard Similarity (Bag-of-Words)

**Description**: Jaccard Similarity measures the ratio of the intersection to the union of two sets of words (after preprocessing). A score of 1.0 means identical word sets, and 0.0 means no common words. It is sensitive to exact lexical matches.

**Top 5 Most Similar Document Pairs**:
*   Document Pair: `doc3_original.txt` - `doc8_modified.txt`, Similarity Score: 0.7143
*   Document Pair: `doc1_original.txt` - `doc6_modified.txt`, Similarity Score: 0.6667
*   Document Pair: `doc2_original.txt` - `doc7_modified.txt`, Similarity Score: 0.4211
*   Document Pair: `doc10_paraphrased.txt` - `doc4_original.txt`, Similarity Score: 0.2778
*   Document Pair: `doc12_new.txt` - `doc9_paraphrased.txt`, Similarity Score: 0.2500

**Interpretation**: Similar to Cosine Similarity, Jaccard Similarity effectively detected direct textual copying and highly similar rephrased content due to its reliance on the exact shared vocabulary.

#### 4.3. WordNet Semantic Similarity

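**Description**: WordNet Semantic Similarity (using path similarity) scores two concepts (synsets) by the length of the shortest path between them in the WordNet hierarchy: the shorter the path, the higher the score. The document-level score used here is the average, over every pair of words drawn from the two documents, of the best synset-to-synset path similarity for that word pair. This method aims to capture conceptual relationships beyond exact word matches. Scores range from 0.0 (no semantic path) to 1.0 (high semantic relatedness).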

**Top 5 Most Similar Document Pairs**:
*   Document Pair: `doc1_original.txt` - `doc6_modified.txt`, Similarity Score: 0.2665
*   Document Pair: `doc10_paraphrased.txt` - `doc4_original.txt`, Similarity Score: 0.2611
*   Document Pair: `doc10_paraphrased.txt` - `doc1_original.txt`, Similarity Score: 0.2507
*   Document Pair: `doc10_paraphrased.txt` - `doc6_modified.txt`, Similarity Score: 0.2497
*   Document Pair: `doc14_new.txt` - `doc6_modified.txt`, Similarity Score: 0.2493

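**Interpretation**: WordNet similarity yielded generally lower absolute scores compared to the other two metrics. Because the document score averages over every cross-document word pair, the values are compressed into a narrow band: in the matrix above, even a document compared with itself scores only about 0.24-0.33, so the relative ranking of pairs is more informative than the absolute values. However, it demonstrated the ability to find conceptual links, even between documents with minimal lexical overlap, by leveraging semantic relationships in WordNet.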

### 5. Comparative Analysis

*   **Which similarity metric detected copying best?**
    Cosine Similarity (TF-IDF) and Jaccard Similarity proved most effective at detecting direct textual copying and high lexical similarity. For instance, both metrics assigned their highest scores to the `_original.txt` and `_modified.txt` pairs, such as `doc3_original.txt` and `doc8_modified.txt` (Cosine: 0.7266, Jaccard: 0.7143). These high scores accurately reflect that the modified documents were essentially rephrased versions of the originals, maintaining significant lexical overlap.

*   **When did Jaccard fail?**
    Jaccard Similarity's efficacy is directly tied to the exact lexical overlap between documents. It didn't necessarily 'fail' but showed its limitations when documents conveyed similar meanings using a more diverse vocabulary. For instance, with more heavily paraphrased content like `doc10_paraphrased.txt` and `doc4_original.txt`, Jaccard yielded a score of 0.2778, which is lower than the scores for directly modified pairs. This indicates that while the meaning might be similar, the unique word sets had less direct intersection.

*   **When did WordNet help?**
    WordNet Semantic Similarity is designed to capture conceptual relationships even when direct lexical overlap is minimal. While its absolute scores were generally lower for the 'modified' and 'paraphrased' documents compared to Cosine and Jaccard (e.g., `doc1_original.txt` - `doc6_modified.txt` at 0.2665), it could be particularly helpful in scenarios where synonyms or semantically related terms are used. It can identify thematic connections, such as between `doc10_paraphrased.txt` and `doc1_original.txt` (0.2507), by linking terms like 'student performance' and 'learning' through their underlying semantic networks, even if the phrasing differs considerably.

*   **Any false positives?**
    Based on the top 5 results for each metric, there were no clear 'false positives' where highly dissimilar documents received high similarity scores. The top three pairs identified by Cosine and Jaccard were genuinely lexically similar (original vs. modified versions). WordNet's top scores were relatively modest (roughly 0.25-0.27), meaning it wasn't incorrectly flagging unrelated documents as highly similar. However, the interpretation of these lower WordNet scores as 'strong matches' depends on the specific application's threshold for semantic similarity.

### 6. Conclusion

Each text similarity metric has its unique strengths and is best suited for different types of analysis:

*   **Cosine Similarity (TF-IDF)** is excellent for identifying documents that share a significant proportion of important terms. It balances each term's frequency within a document against its rarity across the corpus, which makes it tolerant of light rephrasing as long as the distinctive terms are retained. It is generally a strong performer for both direct copying and somewhat modified content.

*   **Jaccard Similarity** is ideal for situations where exact lexical overlap is the primary criterion for similarity, such as detecting plagiarism or direct duplication. Its simplicity makes it easy to interpret, but its sensitivity to vocabulary differences means it struggles with semantically similar but lexically distinct content.

*   **WordNet Semantic Similarity** shines when the goal is to find conceptual relatedness, even if documents use different words to express similar ideas (e.g., through synonyms or hypernyms). While its absolute scores can be lower and harder to interpret without a domain-specific threshold, it provides a valuable tool for understanding the underlying meaning.

In summary, for detecting direct "copying," Cosine and Jaccard Similarity are highly effective. For capturing broader semantic connections where specific word choice varies, WordNet Semantic Similarity provides complementary insights. The choice of metric depends heavily on the specific definition of "similarity" relevant to the task at hand.

## Summary:

### Q&A

*   **Which similarity metric detected copying best?**
    Both Cosine Similarity (TF-IDF) and Jaccard Similarity were most effective at detecting direct textual copying and high lexical similarity. For instance, both metrics assigned their highest scores to the `doc3_original.txt` and `doc8_modified.txt` pair (Cosine: 0.7266, Jaccard: 0.7143).

*   **When did Jaccard fail?**
    Jaccard Similarity showed limitations when documents conveyed similar meanings using a more diverse vocabulary, such as with more heavily paraphrased content. For example, the similarity between `doc10_paraphrased.txt` and `doc4_original.txt` was 0.2778, indicating less direct lexical intersection despite potential semantic similarity.

*   **When did WordNet help?**
    WordNet Semantic Similarity helped capture conceptual relationships even with minimal direct lexical overlap. While its absolute scores were generally lower (e.g., `doc1_original.txt` - `doc6_modified.txt` at 0.2665), it could identify thematic connections by linking terms through their underlying semantic networks.

*   **Were there any false positives?**
    Based on the top 5 results for each metric, there were no clear "false positives" where highly dissimilar documents received high similarity scores. The top pairs identified were genuinely related either lexically or semantically.

### Data Analysis Key Findings

*   **Preprocessing**: All text underwent lowercasing, punctuation removal, tokenization, and stopword removal. WordNet Semantic Similarity also included lemmatization.
*   **Cosine Similarity (TF-IDF)**: Demonstrated strong performance in identifying documents with significant lexical overlap, such as `doc3_original.txt` and `doc8_modified.txt` with a score of 0.7266, indicating its effectiveness for rephrased content.
*   **Jaccard Similarity**: Effectively detected direct textual copying and highly similar rephrased content, yielding a score of 0.7143 for `doc3_original.txt` and `doc8_modified.txt`, similar to Cosine Similarity.
*   **WordNet Semantic Similarity**: While yielding generally lower absolute scores (e.g., `doc1_original.txt` - `doc6_modified.txt` at 0.2665), it effectively identified conceptual links between documents, such as `doc10_paraphrased.txt` and `doc1_original.txt` at 0.2507, even with minimal lexical overlap.
*   **Metric Comparison**: Cosine and Jaccard were best for detecting direct copying and high lexical similarity, while WordNet was valuable for uncovering broader semantic connections.
*   **No False Positives**: No clear false positives were observed among the top 5 similar document pairs for any of the metrics, suggesting reliable detection within their respective strengths.

### Insights or Next Steps

*   The choice of text similarity metric should be guided by the specific definition of "similarity" required for the task; Cosine and Jaccard are suitable for lexical overlap detection, while WordNet is better for semantic relatedness.
*   Future analysis could explore hybrid approaches, combining lexical and semantic metrics, or investigate the impact of domain-specific ontologies on semantic similarity to potentially improve score interpretation and relevance.
