In [3]:
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import random
import os
from tqdm import tqdm
from tqdm.autonotebook import tqdm as notebook_tqdm

# Use correct path or download from web directly
print("Loading all-MiniLM-L6-v2 model...")
model = SentenceTransformer('model/all-MiniLM-L6-v2')  # Using the model name directly to download

def load_your_dataset():
    try:
        df = pd.read_csv('dataset/apps.csv')
        print(f"Dataset loaded with {len(df)} records")
        # Print the first few rows to understand the structure
        print("\nSample data:")
        print(df.head(3))
        print("\nColumns:", df.columns.tolist())
        return df
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Create a sample dataset for testing
        print("Creating sample dataset for testing...")
        sample_df = pd.DataFrame({
            'app_name': [
                'Facebook', 'Instagram', 'Twitter', 
                'Booking.com', 'Agoda', 'Hotels.com',
                'Calculator', 'Calendar', 'Weather'
            ],
            'description': [
                'Social media platform to connect with friends', 
                'Photo sharing social network',
                'Microblogging social platform',
                'Hotel booking and travel app',
                'Hotel reservations and deals',
                'Book rooms and accommodations',
                'Simple calculator app',
                'Manage your schedule',
                'Weather forecasts and alerts'
            ],
            'keywords': [
                'Social', 'Social', 'Social',
                'Travel', 'Travel', 'Travel',
                'Utility', 'Utility', 'Utility'
            ]
        })
        return sample_df

def create_training_examples(df):
    training_examples = []
    
    # Check if we have the expected columns
    required_columns = ['app_name', 'description', 'keywords']
    missing_columns = [col for col in required_columns if col not in df.columns]
    
    if missing_columns:
        print(f"Warning: Missing columns: {missing_columns}")
        print("Available columns:", df.columns.tolist())
        print("Creating alternative examples based on available data")
        
        # Create examples using whatever columns are available
        all_apps = df.to_dict('records')
        
        # Create positive pairs using any common column
        if 'keywords' in df.columns:
            keyword_groups = df.groupby('keywords')
        elif 'category' in df.columns:
            keyword_groups = df.groupby('category')
        else:
            # Just create random pairs if no grouping column exists
            print("No grouping column found. Creating random pairs...")
            for i in range(len(df)):
                for j in range(i+1, min(i+5, len(df))):
                    app1 = all_apps[i]
                    app2 = all_apps[j]
                    
                    text1 = f"{app1.get('app_name', '')} {app1.get('description', '')}"
                    text2 = f"{app2.get('app_name', '')} {app2.get('description', '')}"
                    
                    training_examples.append(InputExample(texts=[text1, text2], label=0.8))
            
            # Create some negative examples
            for _ in range(len(training_examples)):
                i, j = random.sample(range(len(all_apps)), 2)
                app1 = all_apps[i]
                app2 = all_apps[j]
                
                text1 = f"{app1.get('app_name', '')} {app1.get('description', '')}"
                text2 = f"{app2.get('app_name', '')} {app2.get('description', '')}"
                
                training_examples.append(InputExample(texts=[text1, text2], label=0.2))
            
            return training_examples
    
    # Standard example creation with expected columns
    keyword_groups = df.groupby('keywords')
    
    print(f"Found {len(keyword_groups)} keyword groups")
    
    # Create positive pairs (apps with similar keywords/descriptions)
    for keyword, group in keyword_groups:
        print(f"Processing keyword group: {keyword} with {len(group)} apps")
        if len(group) > 1:  # Need at least 2 apps for pairs
            app_list = group.to_dict('records')
            for i in range(len(app_list)):
                for j in range(i + 1, len(app_list)):
                    app1 = app_list[i]
                    app2 = app_list[j]
                    
                    # Combine app_name with description for richer context
                    text1 = f"{app1['app_name']} {app1['description']}"
                    text2 = f"{app2['app_name']} {app2['description']}"
                    
                    # Score: 1.0 for apps with same keywords (positive pairs)
                    training_examples.append(InputExample(texts=[text1, text2], label=1.0))
    
    # Create negative pairs from different keyword groups
    all_apps = df.to_dict('records')
    
    # If we have too few examples (less than 10), generate more pairs
    if len(training_examples) < 10:
        print("Generating additional training pairs since we have few examples...")
        for i in range(len(df)):
            for j in range(len(df)):
                if i != j:
                    app1 = all_apps[i]
                    app2 = all_apps[j]
                    
                    text1 = f"{app1['app_name']} {app1['description']}"
                    text2 = f"{app2['app_name']} {app2['description']}"
                    
                    # Assign higher score if same keyword group, lower otherwise
                    if app1['keywords'] == app2['keywords']:
                        score = 0.9
                    else:
                        score = 0.1
                    
                    training_examples.append(InputExample(texts=[text1, text2], label=score))
    else:
        # Add some negative examples if we already have sufficient positive examples
        for _ in range(min(len(training_examples), 100)):  # Limit to reasonable number
            app1, app2 = random.sample(all_apps, 2)
            # Ensure different keywords for negative pairs
            if app1['keywords'] != app2['keywords']:
                text1 = f"{app1['app_name']} {app1['description']}"
                text2 = f"{app2['app_name']} {app2['description']}"
                training_examples.append(InputExample(texts=[text1, text2], label=0.0))
    
    print(f"Created {len(training_examples)} training examples")
    return training_examples

def split_examples(examples, train_ratio=0.8):
    """Split examples into training and validation sets"""
    if not examples:
        print("Warning: No examples to split!")
        # Create some dummy examples to avoid errors
        return create_dummy_examples(), create_dummy_examples()
        
    random.shuffle(examples)
    split_idx = max(1, int(len(examples) * train_ratio))  # Ensure at least 1 example in each split
    return examples[:split_idx], examples[split_idx:] or examples[:1]  # Use first example as validation if needed

def create_dummy_examples():
    """Create dummy examples for testing when no real data is available"""
    print("Creating dummy examples for demonstration...")
    dummy_examples = [
        InputExample(texts=["Hotel booking app", "Travel booking service"], label=1.0),
        InputExample(texts=["Social media app", "Photo sharing platform"], label=0.8),
        InputExample(texts=["Calculator app", "Hotel booking service"], label=0.1),
    ]
    return dummy_examples

class CustomCallback:
    """Custom callback to display progress during training"""
    def __init__(self):
        self.epoch = 0
        self.best_score = -1
        
    def __call__(self, score, epoch, steps):
        """This method is called by the SentenceTransformer framework"""
        self.epoch = epoch
        if score > self.best_score:
            self.best_score = score
        print(f"\nEpoch {epoch} completed. Evaluation score: {score:.4f} (Best: {self.best_score:.4f})")

# 4. Fine-tuning function
def fine_tune_model(model, train_examples, val_examples, output_path='fine_tuned_minilm'):
    """Fine-tune the model with proper error handling"""
    if not train_examples:
        print("No training examples available. Skipping fine-tuning.")
        return model
        
    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path) if '/' in output_path else '.', exist_ok=True)
    
    # Define train dataset and dataloader
    print(f"Creating dataloader with {len(train_examples)} training examples")
    train_dataloader = DataLoader(
        train_examples,
        shuffle=True,
        batch_size=min(16, len(train_examples))  # Adjust batch size for small datasets
    )
    
    train_loss = losses.MultipleNegativesRankingLoss(model=model)
    
    # Handle empty validation set
    if not val_examples:
        print("No validation examples available. Using training examples for validation.")
        val_examples = train_examples[:min(len(train_examples), 3)]
        
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        val_examples,
        name='playstore-eval'
    )
    
    callback = CustomCallback()
    
    # Fine-tune the model
    print(f"Fine-tuning the model on {len(train_examples)} examples with {len(val_examples)} validation examples...")
    
    try:
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=3,
            warmup_steps=10,  # Reduce for small datasets
            output_path=output_path,
            show_progress_bar=True,
            callback=callback
        )
        print("Fine-tuning completed successfully")
    except Exception as e:
        print(f"Error during fine-tuning: {e}")
        print("Continuing with original model")
    
    return model

def compare_models(original_model, fine_tuned_model, test_sentences):
    """Compare embeddings from original and fine-tuned models"""
    print("\n--- Model Comparison ---")
    
    # Encode with original model
    print("Computing embeddings with original model...")
    original_embeddings = original_model.encode(test_sentences)
    original_similarities = util.cos_sim(original_embeddings, original_embeddings)
    
    # Encode with fine-tuned model
    print("Computing embeddings with fine-tuned model...")
    fine_tuned_embeddings = fine_tuned_model.encode(test_sentences)
    fine_tuned_similarities = util.cos_sim(fine_tuned_embeddings, fine_tuned_embeddings)
    
    # Compare similarities
    print("\nSimilarity Comparison (Original vs Fine-tuned):")
    for i in range(len(test_sentences)):
        for j in range(i + 1, len(test_sentences)):
            orig_sim = original_similarities[i][j].item()
            ft_sim = fine_tuned_similarities[i][j].item()
            diff = ft_sim - orig_sim
            change = "↑" if diff > 0 else "↓"
            
            print(f"{test_sentences[i]} vs {test_sentences[j]}:")
            print(f"  Original: {orig_sim:.4f}, Fine-tuned: {ft_sim:.4f}")
            print(f"  Change: {abs(diff):.4f} {change}")

def test_model(model, df, name="Model"):
    """Test model on dataset samples"""
    # Handle small datasets
    sample_size = min(5, len(df))
    if sample_size == 0:
        print(f"No data to test {name}")
        return
        
    test_apps = df.sample(sample_size)
    app_texts = []
    
    for _, app in test_apps.iterrows():
        # Handle missing columns
        app_name = app.get('app_name', 'Unknown App')
        description = app.get('description', '')
        app_texts.append(f"{app_name} {description}")
    
    print(f"\n--- Testing {name} ---")
    print("Test apps:")
    for i, (_, app) in enumerate(test_apps.iterrows()):
        keywords = app.get('keywords', 'Unknown')
        print(f"{i+1}. {app.get('app_name', 'App')} (Category: {keywords})")
    
    # Calculate embeddings and similarities
    embeddings = model.encode(app_texts, convert_to_tensor=True)
    similarities = util.cos_sim(embeddings, embeddings)
    
    # Print similarities between apps
    print("\nSimilarity matrix:")
    for i, row1 in enumerate(test_apps.iterrows()):
        _, app1 = row1
        for j, row2 in enumerate(test_apps.iterrows()):
            _, app2 = row2
            if i < j:
                sim_value = similarities[i][j].item()
                # Handle missing columns
                app1_keywords = app1.get('keywords', None)
                app2_keywords = app2.get('keywords', None)
                same_category = app1_keywords == app2_keywords if app1_keywords and app2_keywords else "Unknown"
                marker = "✓" if same_category is True else "✗" if same_category is False else "?"
                
                print(f"{app1.get('app_name', 'App1')} vs {app2.get('app_name', 'App2')}: {sim_value:.4f} {marker}")
                print(f"  Same category: {same_category}")

# Main execution
if __name__ == "__main__":
    try:
        # Load your dataset
        print("Loading dataset...")
        df = load_your_dataset()
        print(f"Loaded {len(df)} apps from dataset")
        
        # Save original model for comparison later
        original_model = SentenceTransformer('model/all-MiniLM-L6-v2')
        
        # Prepare training data
        train_examples = create_training_examples(df)
        print(f"Created {len(train_examples)} training examples")
        
        # Handle case with no training examples
        if not train_examples:
            print("No training examples were created. Creating dummy examples...")
            train_examples = create_dummy_examples()
        
        # Split into train and validation sets
        train_examples, val_examples = split_examples(train_examples)
        print(f"Split data into {len(train_examples)} training and {len(val_examples)} validation examples")
        
        # Fine-tune the model
        fine_tuned_model = fine_tune_model(model, train_examples, val_examples)
        
        # Test both models on the same data for comparison
        test_model(original_model, df, name="Original Model")
        test_model(fine_tuned_model, df, name="Fine-tuned Model")
        
        # Save the model
        output_path = 'fine_tuned_minilm'
        print(f"Saving fine-tuned model to {output_path}")
        fine_tuned_model.save(output_path)
        
        # Example usage with specific test cases
        test_sentences = [
            "Agoda Hotel Booking",
            "Booking.com Hotels", 
            "Hotel.com Reservations",
            "Facebook Social Media",
            "Twitter Social Networking",
            "Calculator App",
            "Weather Forecast"
        ]
        
        # Compare original and fine-tuned models
        compare_models(original_model, fine_tuned_model, test_sentences)
        
    except Exception as e:
        print(f"An error occurred during execution: {e}")
        import traceback
        traceback.print_exc()

Loading all-MiniLM-L6-v2 model...


No sentence-transformers model found with name model/all-MiniLM-L6-v2. Creating a new one with mean pooling.


Loading dataset...
Dataset loaded with 10 records

Sample data:
      app_name                                      keywords  \
0        Agoda   hotel booking, travel, accommodation, deals   
1  Booking.com  hotel reservation, travel, lodging, vacation   
2    Hotel.com          hotel search, booking, travel, rooms   

                                         description  
0  Book hotels and accommodations worldwide with ...  
1  Find and book hotels, apartments, and vacation...  
2  Search and reserve hotels globally with compet...  

Columns: ['app_name', 'keywords', 'description']
Loaded 10 apps from dataset


No sentence-transformers model found with name model/all-MiniLM-L6-v2. Creating a new one with mean pooling.


Found 10 keyword groups
Processing keyword group: hotel booking, travel, accommodation, deals with 1 apps
Processing keyword group: hotel reservation, travel, lodging, vacation with 1 apps
Processing keyword group: hotel search, booking, travel, rooms with 1 apps
Processing keyword group: music streaming, playlist, audio, songs with 1 apps
Processing keyword group: navigation, maps, travel, directions with 1 apps
Processing keyword group: ride sharing, transportation, travel, taxi with 1 apps
Processing keyword group: travel booking, hotels, flights, vacation with 1 apps
Processing keyword group: travel reviews, hotels, booking, tourism with 1 apps
Processing keyword group: travel search, hotels, flights, deals with 1 apps
Processing keyword group: video streaming, movies, series, entertainment with 1 apps
Generating additional training pairs since we have few examples...
Created 90 training examples
Created 90 training examples
Split data into 72 training and 18 validation examples
Cr

                                                                     

Step,Training Loss,Validation Loss,Playstore-eval Pearson Cosine,Playstore-eval Spearman Cosine
5,No log,No log,,
10,No log,No log,,
15,No log,No log,,



Epoch 1.0 completed. Evaluation score: nan (Best: -1.0000)


  eval_pearson, _ = pearsonr(labels, scores)
  eval_spearman, _ = spearmanr(labels, scores)



Epoch 2.0 completed. Evaluation score: nan (Best: -1.0000)


  eval_pearson, _ = pearsonr(labels, scores)
  eval_spearman, _ = spearmanr(labels, scores)



Epoch 3.0 completed. Evaluation score: nan (Best: -1.0000)


  eval_pearson, _ = pearsonr(labels, scores)
  eval_spearman, _ = spearmanr(labels, scores)


Fine-tuning completed successfully

--- Testing Original Model ---
Test apps:
1. Kayak (Category: travel search, hotels, flights, deals)
2. Booking.com (Category: hotel reservation, travel, lodging, vacation)
3. Expedia (Category: travel booking, hotels, flights, vacation)
4. Agoda (Category: hotel booking, travel, accommodation, deals)
5. Netflix (Category: video streaming, movies, series, entertainment)

Similarity matrix:
Kayak vs Booking.com: 0.4822 ✗
  Same category: False
Kayak vs Expedia: 0.4000 ✗
  Same category: False
Kayak vs Agoda: 0.4131 ✗
  Same category: False
Kayak vs Netflix: 0.1676 ✗
  Same category: False
Booking.com vs Expedia: 0.4725 ✗
  Same category: False
Booking.com vs Agoda: 0.5680 ✗
  Same category: False
Booking.com vs Netflix: 0.3295 ✗
  Same category: False
Expedia vs Agoda: 0.4455 ✗
  Same category: False
Expedia vs Netflix: 0.1903 ✗
  Same category: False
Agoda vs Netflix: 0.1755 ✗
  Same category: False

--- Testing Fine-tuned Model ---
Test apps:
1. Ago