In [2]:
# =========================================
# Cell 1: Imports
# =========================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import mlflow
import mlflow.sklearn

# Reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("✅ Imports completed")


✅ Imports completed


In [3]:
# =========================================
# Cell 2: Load datasets
# =========================================
movies = pd.read_csv('data/processed/movies_processed.csv')
ratings = pd.read_csv('data/processed/ratings_processed.csv')
ratings_with_tags = pd.read_csv('data/processed/ratings_with_tags.csv')
user_stats = pd.read_csv('data/processed/user_stats.csv')
links = pd.read_csv('data/processed/links_processed.csv')

print("✅ Datasets loaded")


✅ Datasets loaded


In [4]:
# =========================================
# Cell 3: Merge & create target
# =========================================
# Merge user features
df = ratings_with_tags.merge(user_stats, on='userId', how='left')

# Merge movie features (avoid exploding memory: select relevant columns)
movie_features = movies.drop(columns=['title', 'genres'])  # keep numeric/embedding features
df = df.merge(movie_features, on='movieId', how='left')

# Create classification target: rating >=4 -> liked (1), else 0
df['target'] = (df['rating'] >= 4).astype(int)

# Drop original rating and IDs
df = df.drop(columns=['rating', 'userId', 'movieId'])

print(f"✅ Dataset ready with shape: {df.shape}")


✅ Dataset ready with shape: (100836, 27)


In [5]:
# =========================================
# Cell 4: Split & scale features
# =========================================

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# -------------------------------
# Step 1: Identify features
# -------------------------------

# Numeric features
numeric_cols = ['user_avg_rating', 'user_rating_count', 'movie_avg_rating', 'movie_rating_count']

# Genre columns (already one-hot encoded)
genre_cols = [col for col in df.columns if col.startswith('genre_')]

# Combine features
X = df[numeric_cols + genre_cols]
y = df['target']

print(f"✅ Total features: {X.shape[1]} ({len(numeric_cols)} numeric, {len(genre_cols)} genre)")
print(f"✅ Target distribution:\n{y.value_counts(normalize=True)}")

# -------------------------------
# Step 2: Train/test split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
print(f"✅ Train/Test split: {X_train.shape[0]} train, {X_test.shape[0]} test")

# -------------------------------
# Step 3: Scale numeric features
# -------------------------------
scaler = StandardScaler()

# Fit on training data only
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("✅ Numeric features scaled")
print("✅ Features ready for model training")


✅ Total features: 24 (4 numeric, 20 genre)
✅ Target distribution:
target
0    0.518228
1    0.481772
Name: proportion, dtype: float64
✅ Train/Test split: 80668 train, 20168 test
✅ Numeric features scaled
✅ Features ready for model training


In [6]:
# =========================================
# Cell 5: Train & evaluate models
# =========================================

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# -------------------------------
# Step 1: Define models
# -------------------------------
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=200, random_state=42),
    "SVM": SVC(kernel='linear', max_iter=1000, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
}

# -------------------------------
# Step 2: Train & evaluate
# -------------------------------
results = []

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred)
    })

# -------------------------------
# Step 3: Results table
# -------------------------------
results_df = pd.DataFrame(results).sort_values(by="F1", ascending=False)
results_df.reset_index(drop=True, inplace=True)
results_df


Training LogisticRegression...
Training RandomForest...
Training GradientBoosting...
Training SVM...




Training KNN...
Training MLP...


Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,GradientBoosting,0.745438,0.74089,0.726877,0.755455
1,LogisticRegression,0.734877,0.729581,0.717212,0.742384
2,MLP,0.732695,0.722728,0.722319,0.723137
3,RandomForest,0.71906,0.708809,0.707863,0.709757
4,KNN,0.69759,0.689223,0.682511,0.696068
5,SVM,0.577202,0.674181,0.536129,0.907987


In [7]:
# =========================================
# Cell 8: Setup MLflow for experiment tracking
# =========================================
import mlflow
import mlflow.sklearn

# Set experiment name
mlflow.set_experiment("Movie_Rating_Classification")

print("✅ MLflow setup done")


✅ MLflow setup done


  return FileStore(store_uri, store_uri)


In [10]:
# =========================================
# Gradient Boosting - baseline + hyperparameter tuning with MLflow
# =========================================

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
import pandas as pd

# -------------------------------
# Features and target
# -------------------------------
X_full_df = df.drop(columns=['target', 'timestamp', 'tag'])
y = df['target']

# -------------------------------
# Train/Test split
# -------------------------------
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_full_df, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------
# Hyperparameter grid
# -------------------------------
param_grid = [
    {"n_estimators": 200, "learning_rate": 0.1, "max_depth": 3},
    {"n_estimators": 500, "learning_rate": 0.05, "max_depth": 3},
    {"n_estimators": 500, "learning_rate": 0.05, "max_depth": 5}
]

results = []

# -------------------------------
# Train & log models in MLflow
# -------------------------------
for i, params in enumerate(param_grid):
    with mlflow.start_run(run_name=f"GB_Run_{i+1}"):
        # Initialize model
        gb_model = GradientBoostingClassifier(**params, random_state=42)
        gb_model.fit(X_train_full, y_train)
        
        # Predict
        y_pred = gb_model.predict(X_test_full)
        
        # Evaluate
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        
        # Log parameters & metrics
        mlflow.log_params(params)
        mlflow.log_metrics({"accuracy": acc, "f1": f1, "precision": prec, "recall": rec})
        
        # Log model (cast input_example to float64 to avoid warnings)
        mlflow.sklearn.log_model(
            gb_model,
            name="gradient_boosting_model",
            input_example=X_train_full.iloc[:1].astype("float64")
        )
        
        # Store results
        results.append({
            "n_estimators": params["n_estimators"],
            "learning_rate": params["learning_rate"],
            "max_depth": params["max_depth"],
            "Accuracy": acc,
            "F1": f1,
            "Precision": prec,
            "Recall": rec
        })

# Display results sorted by F1
results_df = pd.DataFrame(results).sort_values(by="F1", ascending=False)
results_df.reset_index(drop=True, inplace=True)
results_df


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|███████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1761.15it/s]
Downloading artifacts: 100%|███████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1748.46it/s]
Downloading artifacts: 100%|███████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1166.89it/s]


Unnamed: 0,n_estimators,learning_rate,max_depth,Accuracy,F1,Precision,Recall
0,500,0.05,5,0.751339,0.74596,0.734464,0.757822
1,500,0.05,3,0.746579,0.741281,0.729356,0.753602
2,200,0.1,3,0.745438,0.74089,0.726877,0.755455


In [11]:
import mlflow

mlflow.end_run()

In [10]:
# =========================================
# Logistic Regression - MLflow logging
# =========================================

import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# -------------------------------
# Initialize model with higher iterations
# -------------------------------
model = LogisticRegression(max_iter=5000, solver='lbfgs', random_state=42)

# -------------------------------
# Train
# -------------------------------
model.fit(X_train_full, y_train)

# -------------------------------
# Predict
# -------------------------------
y_pred = model.predict(X_test_full)

# -------------------------------
# Evaluate
# -------------------------------
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "f1": f1_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred)
}

print("✅ Metrics:", metrics)

# -------------------------------
# Log metrics & model in MLflow
# -------------------------------
with mlflow.start_run(run_name="logistic_regression"):
    mlflow.log_params(model.get_params())
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(
        sk_model=model,
        name="logistic_regression_model",
        input_example=X_train_full.iloc[:1]
    )

print("✅ Logistic Regression logged in MLflow")


✅ Metrics: {'accuracy': 0.7347282824276081, 'f1': 0.7293879615579161, 'precision': 0.7171275114382335, 'recall': 0.7420749279538905}


Downloading artifacts: 100%|███████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1399.83it/s]

✅ Logistic Regression logged in MLflow





In [12]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split

# -------------------------------
# Take a sample for SVM (to speed up training)
# -------------------------------
X_svm_sample, _, y_svm_sample, _ = train_test_split(
    X_train_full, y_train,
    train_size=20000,  # adjust as needed
    random_state=42,
    stratify=y_train
)

# -------------------------------
# Initialize LinearSVC
# -------------------------------
svm_model = LinearSVC(max_iter=5000, random_state=42)  # increase max_iter to avoid convergence warning

# -------------------------------
# Train
# -------------------------------
svm_model.fit(X_svm_sample, y_svm_sample)

# -------------------------------
# Predict
# -------------------------------
y_pred = svm_model.predict(X_test_full)

# -------------------------------
# Evaluate
# -------------------------------
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "f1": f1_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred)
}
print("✅ Metrics for SVM:", metrics)

# -------------------------------
# Log into MLflow
# -------------------------------
with mlflow.start_run(run_name="SVM_Linear"):
    mlflow.log_params(svm_model.get_params())
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(
        svm_model,
        name="svm_linear_model",
        input_example=X_test_full.iloc[:1]
    )
print("✅ SVM logged in MLflow")


✅ Metrics for SVM: {'accuracy': 0.7338853629512099, 'f1': 0.7299350878075781, 'precision': 0.7140888057497292, 'recall': 0.7465006175380815}


Downloading artifacts: 100%|███████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1749.81it/s]

✅ SVM logged in MLflow





In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# -------------------------------
# Models
# -------------------------------
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
}

# -------------------------------
# Train, evaluate, and log each model
# -------------------------------
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_full, y_train)
    y_pred = model.predict(X_test_full)
    
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred)
    }
    print(f"✅ Metrics for {name}:", metrics)
    
    # MLflow logging
    with mlflow.start_run(run_name=name):
        mlflow.log_params(model.get_params())
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(
            model,
            name=f"{name.lower()}_model",
            input_example=X_test_full.iloc[:1]
        )
    print(f"✅ {name} logged in MLflow")


Training KNN...
✅ Metrics for KNN: {'accuracy': 0.6576755255850852, 'f1': 0.6513131313131313, 'precision': 0.6394287980959936, 'recall': 0.6636475916014821}


Downloading artifacts: 100%|███████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1166.61it/s]


✅ KNN logged in MLflow
Training MLP...
✅ Metrics for MLP: {'accuracy': 0.7304145180483935, 'f1': 0.7246670380310933, 'precision': 0.7132888047054132, 'recall': 0.7364141622066694}


Downloading artifacts: 100%|███████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1749.81it/s]


✅ MLP logged in MLflow
