# SIMILARITY MATCHING

This notebook aims to simulate similarity matching using commonly used algorithms for text similarity scoring

## Library Dependencies

In [None]:
# libraries used (install using pip)
# !pip install faker
# !pip install python-Levenshtein
# !pip install fuzzywuzzy
# !pip install pandas
# !pip install faker
# !pip install scikit-learn
# !pip install scipy
# !pip install sentence-transformers
# !pip install torch
# !pip install python-Levenshtein[speedup]

## Generating Synthetic Data

In [None]:
import pandas as pd
import random
from faker import Faker

def generate_text_pairs():
    fake = Faker()
    texts = []
    
    # Base templates
    tech_templates = [
        "The evolution of artificial intelligence has transformed {industry}. Recent advances in {tech_type} have enabled {capability}, leading to {benefit}. Companies like {company} are investing heavily in AI research, focusing on {specific_area}. This technology promises to {future_impact}, though challenges remain in {challenge_area}.",
        "Machine learning applications in {industry} continue to expand. Particularly, {tech_type} has shown promising results in {capability}. Industry leaders such as {company} are developing solutions for {specific_area}, aiming to {future_impact}."
    ]
    
    business_templates = [
        "Market analysis reveals significant growth in {sector} during {quarter}. Companies reported {percentage}% increase in revenue, driven by {factor}. Experts predict {prediction} by {year}, with {trend} emerging as a key trend.",
        "The {sector} industry showed remarkable performance in {quarter}, with leading firms experiencing {percentage}% growth. {factor} played a crucial role in this expansion, while analysts forecast {prediction}."
    ]
    
    # Generate 100 pairs
    for i in range(100):
        similarity = random.uniform(0, 1)
        
        if similarity > 0.8:  # High similarity
            template = random.choice(tech_templates)
            text1 = template.format(
                industry="healthcare",
                tech_type="deep learning",
                capability="disease diagnosis",
                benefit="improved patient outcomes",
                company="DeepMind",
                specific_area="medical imaging",
                future_impact="revolutionize healthcare delivery",
                challenge_area="data privacy"
            )
            # Slight variations for high similarity
            text2 = template.format(
                industry="healthcare",
                tech_type="deep learning",
                capability="disease detection",
                benefit="better patient outcomes",
                company="DeepMind",
                specific_area="medical imaging analysis",
                future_impact="transform healthcare delivery",
                challenge_area="data security"
            )
        
        elif 0.4 <= similarity <= 0.7:  # Medium similarity
            text1 = fake.text(max_nb_chars=300)
            text2 = text1[:150] + fake.text(max_nb_chars=150)
        
        else:  # Low similarity
            text1 = fake.text(max_nb_chars=300)
            text2 = fake.text(max_nb_chars=300)
        
        texts.append({
            'id': i+1,
            'text1': text1,
            'text2': text2,
            'similarity_score': round(similarity, 3)
        })
    
    return pd.DataFrame(texts)

# # Generate and save dataset
df = generate_text_pairs()
# df.to_csv('verbatim_text_data_sample.csv', index=False)

In [5]:
df

Unnamed: 0,id,text1,text2,similarity_score
0,1,Speech guess able everything suddenly clearly....,Speech guess able everything suddenly clearly....,0.417
1,2,The evolution of artificial intelligence has t...,The evolution of artificial intelligence has t...,0.938
2,3,Lose song pattern fear show produce keep. Refl...,Seven record agency hotel get. Office maintain...,0.712
3,4,Least west ok whether. Why sell lot your troub...,Least west ok whether. Why sell lot your troub...,0.563
4,5,Bar require prepare then gun they discover. Tr...,Matter audience production. Go if clear medica...,0.167
...,...,...,...,...
95,96,Free little begin need including. Drug souther...,Treatment provide kid support. Forward later i...,0.186
96,97,The evolution of artificial intelligence has t...,The evolution of artificial intelligence has t...,0.830
97,98,Machine learning applications in healthcare co...,Machine learning applications in healthcare co...,0.907
98,99,Catch always hundred treat stand last. At choo...,Oil life administration.\nSuccess development ...,0.091


## Similarity Algorithms

Here are the key text matching algorithms you can explore to kickstart your journey with similarity matching
1. **String based similarity** - This matching algorithms is good to match words based on its spelling, it is used to identify or correct spelling mistakes etc.
2. **Token based similarity** - This matching creates tokens for words, and matches based on frequency counts of the most common words. 
3. **Semantic Similarity** - considers the semantic meaning of the words, parts of speech and position of words in the sentence which allows a better matching context.

In [10]:
# String Based Similarity

from difflib import SequenceMatcher
from Levenshtein import distance
from fuzzywuzzy import fuzz

def string_similarity(text1, text2):
    # Levenshtein Distance
    lev_ratio = 1 - distance(text1, text2) / max(len(text1), len(text2))
    
    # Sequence Matcher
    seq_ratio = SequenceMatcher(None, text1, text2).ratio()
    
    # Fuzzy String Matching
    fuzzy_ratio = fuzz.ratio(text1, text2) / 100
    
    return lev_ratio, seq_ratio, fuzzy_ratio

In [11]:
# Token-Based Simliarity

from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cosine

def token_similarity(text1, text2):
    vectorizer = CountVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    
    # Jaccard Similarity
    intersection = len(set(text1.split()) & set(text2.split()))
    union = len(set(text1.split()) | set(text2.split()))
    jaccard = intersection / union
    
    # Cosine Similarity with BOW
    cosine_sim = 1 - cosine(vectors[0], vectors[1])
    
    return jaccard, cosine_sim

In [14]:
def jaccard_similarity(text1, text2):
    # Convert texts to sets of words
    set1 = set(text1.split())
    set2 = set(text2.split())
    
    # Calculate intersection and union
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    
    return intersection / union

In [None]:
# Semantic Based Similarity

from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
import torch

def semantic_similarity(text1, text2):
    # Using BERT-based model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Generate embeddings
    emb1 = model.encode(text1)
    emb2 = model.encode(text2)
    
    # Compute similarity
    cosine_sim = F.cosine_similarity(
        torch.tensor(emb1).unsqueeze(0),
        torch.tensor(emb2).unsqueeze(0)
    )
    
    return cosine_sim.item()

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def bow_cosine_similarity(text1, text2):
    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform([text1, text2])
    dense = vectors.toarray()
    
    # Calculate cosine similarity
    dot_product = np.dot(dense[0], dense[1])
    norm1 = np.linalg.norm(dense[0])
    norm2 = np.linalg.norm(dense[1])
    
    return dot_product / (norm1 * norm2)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_cosine_similarity(text1, text2):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2])
    dense = vectors.toarray()
    
    dot_product = np.dot(dense[0], dense[1])
    norm1 = np.linalg.norm(dense[0])
    norm2 = np.linalg.norm(dense[1])
    
    return dot_product / (norm1 * norm2)

# Function (similar approach above but using a simpler implementation - use either one)
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    return 1 - cosine(tfidf_matrix.toarray()[0], tfidf_matrix.toarray()[1])


## Examples

### 1. Basic Example

In [None]:
texts = [
    "The cat sat on the mat",
    "The dog sat on the mat",
    "The weather is nice today"
]

print(f"Jaccard: {jaccard_similarity(texts[0], texts[1])}")      # Higher
print(f"BOW: {bow_cosine_similarity(texts[0], texts[1])}")       # Higher
print(f"TF-IDF: {tfidf_cosine_similarity(texts[0], texts[1])}")  # Lower (common words penalized)
print(f"TF-IDF: {tfidf_similarity(texts[0], texts[1])}")  # Lower (common words penalized)


Jaccard: 0.7142857142857143
BOW: 0.8749999999999998
TF-IDF: 0.7799154245579976
TF-IDF: 0.7799154245579976


In [None]:
df['Jaccard'] = df.apply(lambda x: jaccard_similarity(x['text1'], x['text2']), axis=1)
df['bow_cosine_similarity'] = df.apply(lambda x: bow_cosine_similarity(x['text1'], x['text2']), axis=1)
df['tfidf_cosine_similarity'] = df.apply(lambda x: tfidf_cosine_similarity(x['text1'], x['text2']), axis=1)
df['semantic_similarity'] = df.apply(lambda x: semantic_similarity(x['text1'], x['text2']), axis=1)


### 2. Example using the Synthetic data

In [None]:
df

Unnamed: 0,id,text1,text2,similarity_score,tfidf_cosine_similarity,Jaccard,bow_cosine_similarity
0,1,Speech guess able everything suddenly clearly....,Speech guess able everything suddenly clearly....,0.417,0.393520,0.375000,0.557364
1,2,The evolution of artificial intelligence has t...,The evolution of artificial intelligence has t...,0.938,0.858762,0.784314,0.923111
2,3,Lose song pattern fear show produce keep. Refl...,Seven record agency hotel get. Office maintain...,0.712,0.047622,0.012048,0.089893
3,4,Least west ok whether. Why sell lot your troub...,Least west ok whether. Why sell lot your troub...,0.563,0.473771,0.470588,0.640057
4,5,Bar require prepare then gun they discover. Tr...,Matter audience production. Go if clear medica...,0.167,0.011135,0.000000,0.021760
...,...,...,...,...,...,...,...
95,96,Free little begin need including. Drug souther...,Treatment provide kid support. Forward later i...,0.186,0.000000,0.000000,0.000000
96,97,The evolution of artificial intelligence has t...,The evolution of artificial intelligence has t...,0.830,0.858762,0.784314,0.923111
97,98,Machine learning applications in healthcare co...,Machine learning applications in healthcare co...,0.907,0.890324,0.794118,0.941242
98,99,Catch always hundred treat stand last. At choo...,Oil life administration.\nSuccess development ...,0.091,0.025783,0.012658,0.049507


# APPENDIX

## Python connecting to excel document (example)

In [None]:
# Connect to excel file:
def read_and_append_excel(file_path, sheet_name='Sheet1'):
    # Read the existing excel file
    existing_df = pd.read_excel(file_path, sheet_name=sheet_name)
    
    # Append the new dataframe (df) to the existing dataframe
    combined_df = pd.concat([existing_df, df], axis=1)
    
    # Write the combined dataframe back to the excel file
    with pd.ExcelWriter(file_path, engine='openpyxl', mode='a') as writer:
        combined_df.to_excel(writer, sheet_name=sheet_name, index=False)

    # Example usage
    # read_and_append_excel('path_to_your_excel_file.xlsx')
    # Required libraries for reading and writing Excel files
    # !pip install openpyxl
    # !pip install pandas

## Alternative approach is to use a pre-trained language model i.e. Spacy
Spacy is a pre-trained statistical model using word vectors.
in short, its similar to a language model and has been pre-trained to identify text,its parts of speech(POS) adding semantic meaning to the words.

In [None]:
# Another approach is using spacy
# this is a pretrained model which calculates similarity based on two texts 
# reference - https://www.youtube.com/watch?v=DIxxz_DvqLA&t=301s

import spacy
from typing import Dict, Tuple

class SpacyTextAnalyzer:
    def __init__(self):
        # Load large model with more vectors
        self.nlp = spacy.load('en_core_web_lg')
    
    def analyze_similarity(self, text1: str, text2: str) -> Dict[str, float]:
        """Comprehensive similarity analysis using spaCy's large model"""
        # Process texts
        doc1 = self.nlp(text1)
        doc2 = self.nlp(text2)
        
        # Calculate different similarity metrics
        return {
            'doc_similarity': doc1.similarity(doc2),
            'token_similarity': self._get_token_similarity(doc1, doc2),
            'entity_similarity': self._get_entity_similarity(doc1, doc2)
        }
    
    def _get_token_similarity(self, doc1, doc2) -> float:
        """Calculate average token similarity"""
        similarities = []
        for token1 in doc1:
            for token2 in doc2:
                if token1.has_vector and token2.has_vector:
                    # Only compare meaningful tokens
                    if not (token1.is_stop or token2.is_stop):
                        similarities.append(token1.similarity(token2))
        
        return sum(similarities) / len(similarities) if similarities else 0.0
    
    def _get_entity_similarity(self, doc1, doc2) -> float:
        """Compare named entities"""
        ents1 = set([ent.text for ent in doc1.ents])
        ents2 = set([ent.text for ent in doc2.ents])
        
        if not (ents1 or ents2):
            return 0.0
            
        intersection = len(ents1 & ents2)
        union = len(ents1 | ents2)
        return intersection / union if union > 0 else 0.0

# Example usage
if __name__ == "__main__":
    analyzer = SpacyTextAnalyzer()
    
    text1 = "Google and Microsoft are leading AI research in Silicon Valley"
    text2 = "Tech giants like Microsoft and Google are investing heavily in artificial intelligence"
    
    results = analyzer.analyze_similarity(text1, text2)
    
    for metric, score in results.items():
        print(f"{metric}: {score:.3f}")

## some libraries and dependency (how to install - only for M3 macbook)

In [None]:
# # Create and activate virtual environment (recommended)
# python3 -m venv venv
# source venv/bin/activate

# # Install dependencies for M3 Pro
# arch -arm64 pip install python-Levenshtein
# arch -arm64 pip install fuzzywuzzy
# arch -arm64 pip install pandas
# arch -arm64 pip install faker
# arch -arm64 pip install scikit-learn
# arch -arm64 pip install scipy
# arch -arm64 pip install --no-cache-dir torch
# arch -arm64 pip install sentence-transformers

# # If python-Levenshtein fails, try:
# arch -arm64 pip install python-Levenshtein-wheels

# # Optional: requirements.txt
# pip freeze > requirements.txt