# E-commerce Recommendation System Training
## Content-Based Filtering with Advanced Features

This notebook trains and evaluates recommendation models for the e-commerce platform.

In [None]:
# Install required packages
!pip install pandas numpy scikit-learn matplotlib seaborn
!pip install pymongo python-dotenv
!pip install plotly wordcloud

In [None]:
import json
import os
import pickle
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Suppress all warnings so they do not clutter the cell outputs
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn-like matplotlib style sheet plus the "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

## 1. Data Loading and Exploration

In [None]:
# MongoDB connection
# The URI is taken from the MONGO_URI environment variable when it is set, so
# real credentials never have to be written into the notebook; the local
# default below is used otherwise.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017/ecommerce")
client = pymongo.MongoClient(MONGO_URI)
db = client['ecommerce']

# Load products data: materialise the whole 'products' collection as a list
products_collection = db['products']
products_cursor = products_collection.find({})
products_data = list(products_cursor)

print(f"Loaded {len(products_data)} products from MongoDB")
print("Sample product:", products_data[0] if products_data else "No products found")

In [None]:
# Convert to DataFrame
# Each MongoDB document becomes one flat record; missing fields get a default.
df_data = []
for product in products_data:
    # 'sizes' may be absent, stored as null, or hold non-string items; any of
    # the latter two would make a plain ','.join(...) raise a TypeError.
    sizes = product.get('sizes') or []
    df_data.append({
        '_id': str(product['_id']),
        'name': product.get('name', ''),
        'category': product.get('category', ''),
        'subCategory': product.get('subCategory', ''),
        'description': product.get('description', ''),
        'price': product.get('price', 0),
        'sizes': ','.join(str(size) for size in sizes),
        'image': product.get('image', []),
        'bestseller': product.get('bestseller', False),
        'date': product.get('date', datetime.now())
    })

df = pd.DataFrame(df_data)
print(f"DataFrame shape: {df.shape}")
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Headline numbers for the product catalogue, one per output line
print(
    "Dataset Info:",
    f"Total products: {len(df)}",
    f"Categories: {df['category'].nunique()}",
    f"Subcategories: {df['subCategory'].nunique()}",
    f"Price range: {df['price'].min()} - {df['price'].max()}",
    f"Bestsellers: {df['bestseller'].sum()}",
    sep="\n",
)

# Count of missing values in every column
print("\nMissing values:")
print(df.isna().sum())

In [None]:
# Visualizations: a 2x2 grid giving an overview of the catalogue
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
(ax_category, ax_price), (ax_subcategory, ax_bestseller) = axes

# Top-left: number of products per category
df['category'].value_counts().plot.bar(ax=ax_category)
ax_category.set_title('Products by Category')
ax_category.tick_params(axis='x', rotation=45)

# Top-right: histogram of prices
df['price'].hist(ax=ax_price, bins=30)
ax_price.set_title('Price Distribution')
ax_price.set_xlabel('Price')

# Bottom-left: the ten most frequent subcategories
df['subCategory'].value_counts().iloc[:10].plot.bar(ax=ax_subcategory)
ax_subcategory.set_title('Top 10 Subcategories')
ax_subcategory.tick_params(axis='x', rotation=45)

# Bottom-right: share of bestsellers vs. other products
df['bestseller'].value_counts().plot.pie(ax=ax_bestseller, autopct='%1.1f%%')
ax_bestseller.set_title('Bestseller Distribution')

plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Create content features
# All text fields are concatenated into one string per product; this string
# is what the TF-IDF vectorizer is fitted on.
df['content_features'] = (
    df['name'].fillna('') + ' ' +
    df['category'].fillna('') + ' ' +
    df['subCategory'].fillna('') + ' ' +
    df['description'].fillna('') + ' ' +
    df['sizes'].fillna('')
)

# Price categories
# include_lowest=True closes the first bin on the left, so a price of exactly
# 0 (the default used for products without a price) is labelled 'Budget'
# instead of silently becoming NaN.
df['price_category'] = pd.cut(df['price'],
                             bins=[0, 100000, 300000, 500000, float('inf')],
                             labels=['Budget', 'Mid-range', 'Premium', 'Luxury'],
                             include_lowest=True)

print("Sample content features:")
print(df[['name', 'content_features']].head(3))

## 4. Model Training: TF-IDF Vectorization and Cosine Similarity

In [None]:
# TF-IDF Vectorization
# Unigrams and bigrams, English stop words removed, terms occurring in more
# than 80% of the products dropped, vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.8,
    max_features=1000,
)

tfidf_matrix = vectorizer.fit_transform(df['content_features'])
print("TF-IDF Matrix shape:", tfidf_matrix.shape)

# Vocabulary learned by the vectorizer
feature_names = vectorizer.get_feature_names_out()
print("Number of features:", feature_names.size)
print(f"Sample features: {feature_names[:10]}")

In [None]:
# Pairwise cosine similarity between the TF-IDF vectors of all products
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("Cosine similarity matrix shape:", cosine_sim.shape)

# Histogram over every entry of the similarity matrix
plt.figure(figsize=(10, 6))
similarity_ax = plt.gca()
similarity_ax.hist(cosine_sim.ravel(), bins=50, alpha=0.7)
similarity_ax.set_title('Distribution of Cosine Similarities')
similarity_ax.set_xlabel('Cosine Similarity')
similarity_ax.set_ylabel('Frequency')
plt.show()

# Summary statistics of the same entries
print(f"Mean similarity: {np.mean(cosine_sim):.4f}")
print(f"Std similarity: {np.std(cosine_sim):.4f}")

## 5. Recommendation Function

In [None]:
def get_recommendations(product_id, num_recommendations=5):
    """Return the products most similar to ``product_id`` by content.

    The product is looked up in the notebook-level ``df`` by its ``_id`` and
    every other row is ranked by its entry in the pre-computed ``cosine_sim``
    matrix.

    Args:
        product_id: Value compared against ``df['_id']``.
        num_recommendations: Maximum number of recommendations to return.

    Returns:
        A list of dicts with the keys ``productId``, ``name``, ``category``,
        ``price`` and ``similarity_score``, ordered from most to least
        similar. The list is empty when the product is unknown or when an
        error occurs (the error is printed, not raised).
    """
    try:
        # Row *positions* are used throughout because cosine_sim is indexed
        # by position, independent of whatever index labels df carries.
        matches = np.flatnonzero((df['_id'] == product_id).to_numpy())
        if matches.size == 0:
            return []

        product_idx = int(matches[0])

        # Get similarity scores
        sim_scores = np.asarray(cosine_sim[product_idx])

        # Rank by descending similarity (stable, so ties keep catalogue
        # order) and remove the product itself explicitly. Simply skipping
        # the first ranked entry is wrong whenever another row ties with the
        # product (duplicates) or the product's own row is all zeros.
        ranked = np.argsort(-sim_scores, kind='stable')
        limit = max(int(num_recommendations), 0)
        similar_indices = ranked[ranked != product_idx][:limit]

        recommendations = []
        for idx in similar_indices:
            product = df.iloc[idx]
            recommendations.append({
                'productId': product['_id'],
                'name': product['name'],
                'category': product['category'],
                'price': product['price'],
                'similarity_score': float(sim_scores[idx])
            })

        return recommendations
    except Exception as e:
        # Best-effort contract: report the problem and return no results
        print(f"Error: {e}")
        return []

# Smoke test: show the recommendations for the first product in the catalogue
if df.shape[0] > 0:
    first_product = df.iloc[0]
    test_product_id = first_product['_id']
    test_recommendations = get_recommendations(test_product_id, 5)

    print(f"Recommendations for product '{first_product['name']}':")
    for i, rec in enumerate(test_recommendations, start=1):
        print("{}. {} (similarity: {:.3f})".format(i, rec['name'], rec['similarity_score']))

## 6. Model Evaluation

In [None]:
# Evaluate recommendation quality
def evaluate_recommendations(sample_size=20, num_recommendations=5, random_state=42):
    """Evaluate the quality of recommendations on a sample of products.

    For each sampled product the recommendations from
    ``get_recommendations`` are compared with the product itself.

    Args:
        sample_size: Maximum number of products drawn from ``df``.
        num_recommendations: Number of recommendations requested per product.
        random_state: Seed passed to ``DataFrame.sample`` so that repeated
            runs evaluate the same products and report the same metrics.

    Returns:
        A dict of plain floats with the keys ``avg_category_match`` and
        ``avg_subcategory_match`` (mean fraction of recommendations sharing
        the product's category / subcategory) and ``avg_similarity`` (mean
        similarity score). Every value is 0.0 when no sampled product
        produced any recommendation, so the result is always valid JSON.
    """
    # Sample products for evaluation (reproducibly)
    sample_products = df.sample(min(sample_size, len(df)), random_state=random_state)

    # Build the id -> label lookups once instead of filtering the whole
    # frame for every single recommendation. The first row wins if an id
    # should ever occur more than once.
    catalog = df.drop_duplicates('_id').set_index('_id')
    category_by_id = catalog['category'].to_dict()
    subcategory_by_id = catalog['subCategory'].to_dict()

    category_matches = []
    subcategory_matches = []
    similarity_scores = []

    for _, product in sample_products.iterrows():
        recommendations = get_recommendations(product['_id'], num_recommendations)
        if not recommendations:
            continue

        rec_ids = [rec['productId'] for rec in recommendations]

        # Fraction of recommendations in the same category
        cat_matches = sum(1 for rec_id in rec_ids
                          if category_by_id[rec_id] == product['category'])
        category_matches.append(cat_matches / len(rec_ids))

        # Fraction of recommendations in the same subcategory
        subcat_matches = sum(1 for rec_id in rec_ids
                             if subcategory_by_id[rec_id] == product['subCategory'])
        subcategory_matches.append(subcat_matches / len(rec_ids))

        # Average similarity of this product's recommendations
        similarity_scores.append(
            np.mean([rec['similarity_score'] for rec in recommendations]))

    def _mean(values):
        # np.mean([]) is NaN (and NaN is not valid JSON); report 0.0 instead
        return float(np.mean(values)) if values else 0.0

    results = {
        'avg_category_match': _mean(category_matches),
        'avg_subcategory_match': _mean(subcategory_matches),
        'avg_similarity': _mean(similarity_scores)
    }

    return results

# Run the evaluation and report each metric with three decimals
if df.shape[0] > 0:
    eval_results = evaluate_recommendations()
    print("Recommendation Quality Metrics:")
    for metric_label, metric_key in (
        ("Average Category Match", 'avg_category_match'),
        ("Average Subcategory Match", 'avg_subcategory_match'),
        ("Average Similarity Score", 'avg_similarity'),
    ):
        print(f"{metric_label}: {eval_results[metric_key]:.3f}")

## 7. Advanced Analysis

In [None]:
# Project the (densified) TF-IDF vectors onto their first two principal
# components so the products can be drawn in a plane
pca = PCA(n_components=2)
tfidf_2d = pca.fit_transform(tfidf_matrix.toarray())

# Share of the variance captured by each of the two components
pc1_share, pc2_share = pca.explained_variance_ratio_[:2]

# One point per product, coloured by the integer code of its category
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    tfidf_2d[:, 0],
    tfidf_2d[:, 1],
    c=df['category'].astype('category').cat.codes,
    s=50,
    alpha=0.6,
)
plt.title('Product Clustering in 2D TF-IDF Space')
plt.xlabel(f'PC1 ({pc1_share:.2%} variance)')
plt.ylabel(f'PC2 ({pc2_share:.2%} variance)')
plt.colorbar(scatter, label='Category')
plt.show()

print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.2%}")

In [None]:
# K-means on the TF-IDF vectors; never ask for more clusters than products
n_clusters = min(5, len(df))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(tfidf_matrix)

# Keep the assigned cluster next to each product
df['cluster'] = clusters

# Same 2D PCA projection as before, now coloured by cluster id
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    tfidf_2d[:, 0],
    tfidf_2d[:, 1],
    c=clusters,
    cmap='viridis',
    s=50,
    alpha=0.6,
)
plt.title('K-means Clustering of Products')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
plt.colorbar(scatter, label='Cluster')
plt.show()

# Per-cluster summary: size, category breakdown and mean price
print("\nCluster Analysis:")
for cluster_id in range(n_clusters):
    cluster_products = df.loc[df['cluster'] == cluster_id]
    print(
        f"\nCluster {cluster_id} ({len(cluster_products)} products):",
        f"  Categories: {cluster_products['category'].value_counts().to_dict()}",
        f"  Avg Price: {cluster_products['price'].mean():.0f}",
        sep="\n",
    )

## 8. Save Models and Results

In [None]:
# Save models and data
import os

# Both output directories must exist before anything is written to them
for output_dir in ('../models', '../data'):
    os.makedirs(output_dir, exist_ok=True)

# Fitted TF-IDF vectorizer (pickle)
with open('../models/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Pre-computed product-by-product similarity matrix
np.save('../models/cosine_similarity_matrix.npy', cosine_sim)

# Product table including the engineered columns
df.to_csv('../data/processed_products.csv', index=False)

# Evaluation metrics exist only if there were products to evaluate
has_products = len(df) > 0
if has_products:
    with open('../models/evaluation_results.json', 'w') as results_file:
        json.dump(eval_results, results_file, indent=2)

# Description of this training run
metadata = {
    'training_date': datetime.now().isoformat(),
    'num_products': len(df),
    'num_features': len(feature_names),
    'categories': df['category'].unique().tolist(),
    'model_type': 'content_based_tfidf',
    'evaluation_metrics': eval_results if has_products else None
}

with open('../models/model_metadata.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=2)

print("Models and data saved successfully!")
print("Files saved in:")
for saved_path in (
    '../models/tfidf_vectorizer.pkl',
    '../models/cosine_similarity_matrix.npy',
    '../data/processed_products.csv',
    '../models/evaluation_results.json',
    '../models/model_metadata.json',
):
    print(f"  - {saved_path}")

## 9. Production Deployment Code

In [None]:
# Generate production-ready code
# The string below is written verbatim to ../optimized_engine.py. Compared
# with a naive "skip the first ranked row" lookup, the generated engine
#   * removes the queried product from its own results explicitly (ties and
#     duplicate products no longer make a product recommend itself),
#   * builds file paths with os.path.join, so model_path works with or
#     without a trailing slash,
#   * reads '_id' back from the CSV as a string instead of letting pandas
#     guess a numeric type for it,
#   * returns plain Python values so the results can be JSON-serialised.
production_code = '''
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

class OptimizedRecommendationEngine:
    """Serve content-based recommendations from pre-computed artifacts."""

    def __init__(self, model_path="models/"):
        self.model_path = model_path
        self.vectorizer = None
        self.similarity_matrix = None
        self.products_df = None
        self.load_models()
    
    def load_models(self):
        """Load pre-trained models; return True on success, False otherwise"""
        try:
            # Load vectorizer
            # NOTE: pickle can execute arbitrary code - only load trusted files
            with open(os.path.join(self.model_path, "tfidf_vectorizer.pkl"), "rb") as f:
                self.vectorizer = pickle.load(f)
            
            # Load similarity matrix
            self.similarity_matrix = np.load(
                os.path.join(self.model_path, "cosine_similarity_matrix.npy"))
            
            # Load products data; ids must stay strings to match lookups
            self.products_df = pd.read_csv(
                os.path.join(self.model_path, "..", "data", "processed_products.csv"),
                dtype={"_id": str})
            
            print(f"Models loaded successfully! {len(self.products_df)} products ready.")
            return True
        except Exception as e:
            print(f"Error loading models: {e}")
            return False
    
    def get_recommendations(self, product_id, num_recommendations=5):
        """Get recommendations using pre-computed similarity matrix"""
        try:
            # Nothing can be recommended if loading the artifacts failed
            if self.products_df is None or self.similarity_matrix is None:
                return []
            
            # Find product position (the matrix is indexed by row position)
            matches = np.flatnonzero(
                (self.products_df['_id'] == product_id).to_numpy())
            if matches.size == 0:
                return []
            
            product_idx = int(matches[0])
            
            # Get similarity scores from pre-computed matrix
            sim_scores = self.similarity_matrix[product_idx]
            
            # Rank by descending similarity and drop the product itself
            ranked = np.argsort(-sim_scores, kind='stable')
            limit = max(int(num_recommendations), 0)
            similar_indices = ranked[ranked != product_idx][:limit]
            
            recommendations = []
            for idx in similar_indices:
                product = self.products_df.iloc[idx]
                price = product['price']
                recommendations.append({
                    'productId': product['_id'],
                    'name': product['name'],
                    'category': product['category'],
                    # numpy scalars are not JSON serialisable
                    'price': price.item() if hasattr(price, 'item') else price,
                    'similarity_score': float(sim_scores[idx])
                })
            
            return recommendations
        except Exception as e:
            print(f"Error getting recommendations: {e}")
            return []
'''

# Save production code
with open('../optimized_engine.py', 'w') as f:
    f.write(production_code)

print("Production code generated: ../optimized_engine.py")

## Summary

This notebook has:
1. ✅ Loaded and analyzed e-commerce product data
2. ✅ Created content-based features using TF-IDF
3. ✅ Built recommendation system with cosine similarity
4. ✅ Evaluated recommendation quality
5. ✅ Performed clustering analysis
6. ✅ Saved models for production use
7. ✅ Generated optimized production code

The trained models are ready for deployment in the Flask API!