# Fake Product Detection - EDA & Model Training

Exploratory Data Analysis and baseline model training for fake product detection in e-commerce.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, auc, classification_report
)
import warnings
# Silence library warnings so the cell output stays readable
warnings.filterwarnings('ignore')

# Plot defaults: seaborn grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

In [None]:
# Load the sample listings dataset
import json

# JSON is UTF-8 by specification; pass the encoding explicitly so non-ASCII
# titles/descriptions decode the same way regardless of the platform's
# locale default (e.g. cp1252 on Windows).
with open('../dataset/sample-listings.json', 'r', encoding='utf-8') as f:
    data_raw = json.load(f)

# The records are read from the top-level "listings" key of the payload
df = pd.DataFrame(data_raw['listings'])
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print("\nFirst few rows:")
df.head()

In [None]:
# Class distribution
label_counts = df['is_fake'].value_counts()
print("Label Distribution:")
print(label_counts)
print(f"\nFake percentage: {df['is_fake'].mean()*100:.1f}%")

# Visualize.
# value_counts() orders rows by frequency, not by label, so sort by label
# before plotting: that pins the bar order to 0 (real) then 1 (fake) and keeps
# the green/red colors attached to the right class even when fakes dominate.
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
label_counts.sort_index().plot(kind='bar', ax=ax, color=['green', 'red'])
ax.set_xlabel('Label (0=Real, 1=Fake)')
ax.set_ylabel('Count')
ax.set_title('Class Distribution')
plt.tight_layout()

## 2. Exploratory Data Analysis

In [None]:
# Side-by-side price histograms: real listings on the left, fakes on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: listings labelled real (is_fake == 0)
df.loc[df['is_fake'] == 0, 'price'].hist(ax=ax[0], bins=30, color='green', alpha=0.7)
ax[0].set(title='Real Products - Price Distribution', xlabel='Price')

# Right panel: listings labelled fake (is_fake == 1)
df.loc[df['is_fake'] == 1, 'price'].hist(ax=ax[1], bins=30, color='red', alpha=0.7)
ax[1].set(title='Fake Products - Price Distribution', xlabel='Price')

plt.tight_layout()

In [None]:
# Rating vs. review count by label
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# DataFrame.plot.scatter requires the x and y column names explicitly;
# calling it without them raises TypeError.
df[df['is_fake']==0].plot.scatter(
    x='rating', y='review_count', ax=ax[0], s=100, alpha=0.6, color='green'
)
ax[0].set_title('Real Products - Rating vs Reviews')
ax[0].set_xlabel('Rating')
ax[0].set_ylabel('Review Count')

df[df['is_fake']==1].plot.scatter(
    x='rating', y='review_count', ax=ax[1], s=100, alpha=0.6, color='red'
)
ax[1].set_title('Fake Products - Rating vs Reviews')
ax[1].set_xlabel('Rating')
ax[1].set_ylabel('Review Count')

plt.tight_layout()

## 3. Feature Engineering

In [None]:
def engineer_features(df, categories=None):
    """Build the numeric feature matrix from raw listing data.

    Args:
        df: Listings DataFrame. Reads the ``price``, ``rating``,
            ``review_count``, ``title``, ``description``, ``seller``,
            ``category`` and ``country`` columns.
        categories: Optional iterable of category values to one-hot encode.
            When omitted, the non-null categories present in ``df`` are used
            in order of first appearance. Pass the list used at training
            time when transforming new data so the columns line up.

    Returns:
        A DataFrame indexed like ``df`` with one feature row per listing.
    """
    X = pd.DataFrame(index=df.index)

    # Price features (log1p compresses the long right tail)
    X['price'] = df['price']
    X['log_price'] = np.log1p(df['price'])

    # Rating features
    X['rating'] = df['rating']
    X['review_count'] = df['review_count']
    X['log_reviews'] = np.log1p(df['review_count'])

    # Derived features
    # +1 keeps the denominator non-zero when a listing has no reviews
    X['rating_review_ratio'] = X['rating'] / (X['log_reviews'] + 1)
    X['review_score'] = df['review_count'] * df['rating']
    # Near-perfect rating backed by very few reviews
    X['perfect_rating_low_reviews'] = (
        (df['rating'] >= 4.9) & (df['review_count'] < 10)
    ).astype(int)

    # Text features: treat missing text as empty so every length is a number
    # (previously only the description was null-filled; a missing title or
    # seller produced NaN).
    X['title_length'] = df['title'].fillna('').str.len()
    X['description_length'] = df['description'].fillna('').str.len()
    X['seller_name_length'] = df['seller'].fillna('').str.len()

    # Category one-hot encoding. Null categories are skipped: comparing
    # against NaN never matches, so they would only add an all-zero column.
    if categories is None:
        categories = df['category'].dropna().unique()
    for cat in categories:
        X[f'cat_{cat}'] = (df['category'] == cat).astype(int)

    # Country encoding (simplified to three indicator columns)
    X['country_CN'] = (df['country'] == 'CN').astype(int)  # Often higher fake rate
    X['country_US'] = (df['country'] == 'US').astype(int)
    X['country_IN'] = (df['country'] == 'IN').astype(int)

    return X

# Target labels and the engineered feature matrix
y = df['is_fake']
X = engineer_features(df)

print(
    f"Feature matrix shape: {X.shape}",
    f"Features: {X.columns.tolist()}",
    sep="\n",
)

## 4. Train Baseline Model

In [None]:
# Hold out a stratified 20% test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("Training set:", X_train.shape)
print("Test set:", X_test.shape)
print("\nTraining label dist:", y_train.value_counts().to_dict())
print("Test label dist:", y_test.value_counts().to_dict())

In [None]:
# Fit the baseline random forest; fit() returns the estimator itself, so
# construction and training can be chained.
model = RandomForestClassifier(
    class_weight='balanced',
    max_depth=15,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

print("Model trained successfully!")

## 5. Evaluation

In [None]:
# Hard class predictions plus the predicted probability of class index 1
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Headline metrics on the held-out test set
acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)

print(
    "=== Performance Metrics ===",
    f"Accuracy:  {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall:    {rec:.4f}",
    f"F1 Score:  {f1:.4f}",
    sep="\n",
)

# Per-class breakdown
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=['Real', 'Fake']),
    sep="\n",
)

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

# Render it as an annotated heatmap
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
sns.heatmap(
    cm,
    ax=ax,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Real', 'Fake'],
    yticklabels=['Real', 'Fake'],
)
ax.set(ylabel='True Label', xlabel='Predicted Label', title='Confusion Matrix')
plt.tight_layout()

## 6. Feature Importance

In [None]:
# Pair every feature name with the forest's importance score, highest first
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_,
})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("Top 10 Features:")
print(feature_importance.iloc[:10])

In [None]:
# Horizontal bar chart of the ten highest-ranked features
fig, ax = plt.subplots(figsize=(10, 8))
top_features = feature_importance.iloc[:10]

# One bar per feature at integer positions, labelled with the feature name
ax.barh(np.arange(len(top_features)), top_features['importance'], color='steelblue')
ax.set_yticks(np.arange(len(top_features)))
ax.set_yticklabels(top_features['feature'])
ax.set(xlabel='Importance', title='Top 10 Most Important Features')
plt.tight_layout()

## 7. Save Model

In [None]:
import os
import pickle

# Ensure the output directory exists (no error if it is already there)
os.makedirs('../models', exist_ok=True)

# Serialize the fitted model to disk
with open('../models/fake_detector_v0.1.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

print("Model saved!")