In [None]:
    # 03_modeling.ipynb
    # Machine Learning Models for High-Growth Prediction

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import seaborn as sns

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # Apply the "ggplot" stylesheet globally so figures created after this point share one look.
    plt.style.use("ggplot")

    # Printed last, so seeing this message means the imports and style setup above succeeded.
    print("Modeling notebook ready.")
    

In [None]:
    # Read the processed feature table from disk.
    # The two date columns are parsed into datetimes at load time instead of staying as strings.
    features_path = "../data/processed/features.csv"
    df = pd.read_csv(
        features_path,
        parse_dates=["trending_date", "publish_date"],
    )

    # Quick sanity check: (rows, columns) followed by a preview of the first rows.
    print("Features shape:", df.shape)
    df.head()
    

In [None]:
    # Build the design matrix X and the binary target y.
    # view_growth / growth_rate are deliberately left out of the predictors:
    # they are derived from future information and would leak the label.
    feature_cols = [
        "views", "likes", "dislikes", "comment_count",
        "like_view_ratio", "comment_view_ratio",
        "publish_hour",
        "category_id",
    ]

    # .copy() detaches X from df, so later edits to X cannot write back into the source frame.
    X = df.loc[:, feature_cols].copy()
    # Cast the target to int so downstream metrics see integer class labels.
    y = df["high_growth"].astype(int)

    X.head()
    

In [None]:
    # Random 80/20 hold-out split, stratified on y so both parts keep the same class balance.
    # This split ignores time ordering; it is a simple starting point that can be refined later.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,  # fixed seed keeps the split reproducible across runs
        stratify=y,
    )

    print("Train size:", len(X_train), "Test size:", len(X_test))
    

In [None]:
    # Baseline model: Logistic Regression on standardized features.
    #
    # Judging by the column names, the predictors mix raw counts (e.g. "views") with ratio
    # columns, so they sit on very different scales. LogisticRegression applies an L2 penalty
    # by default and its solver converges poorly on unscaled inputs, which makes an unscaled
    # baseline both unstable and unfairly weak. The scaler lives inside the pipeline so it is
    # fit on the training split only and merely applied to the test split (no leakage).
    #
    # `log_reg` is now a Pipeline: `.fit` / `.predict` / `.predict_proba` work as before, and
    # the fitted estimator (e.g. for `.coef_`) is available as `log_reg[-1]`.
    #
    # NOTE(review): "category_id" is still treated as a numeric feature here; if it is a
    # nominal code, one-hot encoding it would be more appropriate for a linear model -- confirm.
    log_reg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
    log_reg.fit(X_train, y_train)

    # Hard class predictions on the held-out split.
    y_pred_lr = log_reg.predict(X_test)

    print("Logistic Regression accuracy:", accuracy_score(y_test, y_pred_lr))
    print("Logistic Regression F1:", f1_score(y_test, y_pred_lr))
    print("\nClassification report (LogReg):\n", classification_report(y_test, y_pred_lr))
    

In [None]:
    # Random Forest model: 200 trees with no depth limit, trained on all available cores.
    # random_state pins the bootstrap/feature sampling so results are repeatable.
    rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)

    # Hard class predictions on the held-out split.
    y_pred_rf = rf.predict(X_test)

    # Headline metrics first, then the per-class breakdown.
    rf_accuracy = accuracy_score(y_test, y_pred_rf)
    rf_f1 = f1_score(y_test, y_pred_rf)
    print("Random Forest accuracy:", rf_accuracy)
    print("Random Forest F1:", rf_f1)
    print("\nClassification report (Random Forest):\n", classification_report(y_test, y_pred_rf))
    

In [None]:
    # Confusion matrix for Random Forest, drawn as an annotated heatmap.
    # Rows are the true classes and columns the predicted classes.
    cm = confusion_matrix(y_test, y_pred_rf)

    fig, ax = plt.subplots(figsize=(5, 4))
    # fmt="d" prints the cell counts as plain integers.
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set(
        xlabel="Predicted",
        ylabel="True",
        title="Confusion Matrix - Random Forest",
    )
    plt.show()
    

In [None]:
    # Feature importance from Random Forest, labelled by feature name and ranked high to low.
    importances = rf.feature_importances_
    fi = pd.Series(importances, index=feature_cols).sort_values(ascending=False)

    # Bar chart drawn on an explicit axes object rather than the implicit current figure.
    fig, ax = plt.subplots(figsize=(8, 4))
    fi.plot(kind="bar", ax=ax)
    ax.set_title("Feature Importances (Random Forest)")
    ax.set_ylabel("Importance")
    plt.show()

    # Also show the ranked values as a table.
    fi
    