In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import os
import warnings
warnings.filterwarnings("ignore")

# Create output folders
os.makedirs("temp", exist_ok=True)
os.makedirs("forecasts", exist_ok=True)


In [4]:
df = pd.read_csv("Screentime - App Details.csv")

df['Date'] = pd.to_datetime(df['Date'])

df['App'] = df['App'].fillna('Unknown')

df[['Usage', 'Notifications', 'Times opened']] = df[['Usage', 'Notifications', 'Times opened']].fillna(0)

df.head()


Unnamed: 0,Date,Usage,Notifications,Times opened,App
0,2022-08-26,38,70,49,Instagram
1,2022-08-27,39,43,48,Instagram
2,2022-08-28,64,231,55,Instagram
3,2022-08-29,14,35,23,Instagram
4,2022-08-30,3,19,5,Instagram


In [9]:
def remove_outliers_iqr(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    return data[~((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR)))]

# Function to remove outliers per group
def clean_group(df, app, metric):
    sub_df = df[df['App'] == app][['Date', metric]]
    sub_df = sub_df.groupby('Date').sum().reset_index()
    sub_df = sub_df.rename(columns={'Date': 'ds', metric: 'y'})
    sub_df['y'] = remove_outliers_iqr(sub_df['y'])
    sub_df = sub_df.dropna()
    return sub_df


In [10]:
def train_and_save_prophet(app, metric_col, model_id):
    df_cleaned = clean_group(df, app, metric_col)
    if df_cleaned.shape[0] < 10:
        print(f"⚠️ Skipping {app} - {metric_col}: Not enough data after cleaning.")
        return None

    # Train Prophet model
    model = Prophet()
    model.fit(df_cleaned)

    # Forecast next 30 days
    future = model.make_future_dataframe(periods=30)
    forecast = model.predict(future)

    # Save model and forecast
    model_path = f"temp/model_{model_id}.pkl"
    forecast_path = f"forecasts/{model_id}_forecast.csv"
    joblib.dump(model, model_path)
    forecast.to_csv(forecast_path, index=False)

    # Accuracy on training data
    y_true = df_cleaned['y']
    y_pred = model.predict(df_cleaned)['yhat']
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)

    print(f"✅ Trained {model_id} | MAE: {mae:.2f}, RMSE: {rmse:.2f}")
    return {
        'App': app,
        'Metric': metric_col.replace(' (minutes)', '').replace(' ', '_'),
        'MAE': round(mae, 2),
        'RMSE': round(rmse, 2),
        'Model': os.path.basename(model_path),
        'Forecast File': os.path.basename(forecast_path)
    }


In [11]:
apps = ['Instagram', 'Whatsapp']
metrics = ['Usage', 'Notifications', 'Times opened']

results = []

# Individual app models
for app in apps:
    for metric in metrics:
        model_id = f"{app.lower()}_{metric.lower().replace(' ', '_').replace('(', '').replace(')', '')}"
        result = train_and_save_prophet(app, metric, model_id)
        if result:
            results.append(result)

# Total (all apps combined)
df_total = df.copy()
df_total['App'] = 'Total'
for metric in metrics:
    model_id = f"total_{metric.lower().replace(' ', '_').replace('(', '').replace(')', '')}"
    result = train_and_save_prophet('Total', metric, model_id)
    if result:
        results.append(result)


04:08:40 - cmdstanpy - INFO - Chain [1] start processing
04:08:41 - cmdstanpy - INFO - Chain [1] done processing
04:08:41 - cmdstanpy - INFO - Chain [1] start processing


✅ Trained instagram_usage | MAE: 14.15, RMSE: 18.18


04:08:41 - cmdstanpy - INFO - Chain [1] done processing
04:08:41 - cmdstanpy - INFO - Chain [1] start processing


✅ Trained instagram_notifications | MAE: 11.65, RMSE: 15.37


04:08:42 - cmdstanpy - INFO - Chain [1] done processing
04:08:42 - cmdstanpy - INFO - Chain [1] start processing


✅ Trained instagram_times_opened | MAE: 18.80, RMSE: 24.47


04:08:42 - cmdstanpy - INFO - Chain [1] done processing
04:08:42 - cmdstanpy - INFO - Chain [1] start processing


✅ Trained whatsapp_usage | MAE: 37.76, RMSE: 48.28


04:08:42 - cmdstanpy - INFO - Chain [1] done processing
04:08:43 - cmdstanpy - INFO - Chain [1] start processing


✅ Trained whatsapp_notifications | MAE: 31.72, RMSE: 37.43


04:08:43 - cmdstanpy - INFO - Chain [1] done processing


✅ Trained whatsapp_times_opened | MAE: 14.99, RMSE: 18.64
⚠️ Skipping Total - Usage: Not enough data after cleaning.
⚠️ Skipping Total - Notifications: Not enough data after cleaning.
⚠️ Skipping Total - Times opened: Not enough data after cleaning.


In [12]:
summary_df = pd.DataFrame(results)
summary_df.to_csv("forecasts/model_summary.csv", index=False)
summary_df


Unnamed: 0,App,Metric,MAE,RMSE,Model,Forecast File
0,Instagram,Usage,14.15,18.18,model_instagram_usage.pkl,instagram_usage_forecast.csv
1,Instagram,Notifications,11.65,15.37,model_instagram_notifications.pkl,instagram_notifications_forecast.csv
2,Instagram,Times_opened,18.8,24.47,model_instagram_times_opened.pkl,instagram_times_opened_forecast.csv
3,Whatsapp,Usage,37.76,48.28,model_whatsapp_usage.pkl,whatsapp_usage_forecast.csv
4,Whatsapp,Notifications,31.72,37.43,model_whatsapp_notifications.pkl,whatsapp_notifications_forecast.csv
5,Whatsapp,Times_opened,14.99,18.64,model_whatsapp_times_opened.pkl,whatsapp_times_opened_forecast.csv


In [13]:
def train_and_save_total(metric_col, model_id):
    sub_df = df_total[['Date', metric_col]].rename(columns={'Date': 'ds', metric_col: 'y'})
    sub_df['y'] = remove_outliers_iqr(sub_df['y'])
    sub_df = sub_df.dropna()
    
    if sub_df.shape[0] < 10:
        print(f"⚠️ Skipping Total - {metric_col}: Not enough data after cleaning.")
        return None

    model = Prophet()
    model.fit(sub_df)

    future = model.make_future_dataframe(periods=30)
    forecast = model.predict(future)

    model_path = f"temp/model_total_{metric_col.lower().replace(' ', '_')}.pkl"
    forecast_path = f"forecasts/total_{metric_col.lower().replace(' ', '_')}_forecast.csv"
    joblib.dump(model, model_path)
    forecast.to_csv(forecast_path, index=False)

    y_true = sub_df['y']
    y_pred = model.predict(sub_df)['yhat']
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)

    print(f"Trained Total - {metric_col} | MAE: {mae:.2f}, RMSE: {rmse:.2f}")
    return {
        'App': 'Total',
        'Metric': metric_col.replace(' (minutes)', '').replace(' ', '_'),
        'MAE': round(mae, 2),
        'RMSE': round(rmse, 2),
        'Model': os.path.basename(model_path),
        'Forecast File': os.path.basename(forecast_path)
    }


In [14]:
# Total (all apps combined)
for metric in metrics:
    result = train_and_save_total(metric, metric.lower().replace(' ', '_').replace('(', '').replace(')', ''))
    if result:
        results.append(result)


04:13:58 - cmdstanpy - INFO - Chain [1] start processing
04:13:58 - cmdstanpy - INFO - Chain [1] done processing
04:13:59 - cmdstanpy - INFO - Chain [1] start processing


Trained Total - Usage | MAE: 45.35, RMSE: 53.73


04:13:59 - cmdstanpy - INFO - Chain [1] done processing
04:13:59 - cmdstanpy - INFO - Chain [1] start processing


Trained Total - Notifications | MAE: 86.01, RMSE: 99.31


04:13:59 - cmdstanpy - INFO - Chain [1] done processing


Trained Total - Times opened | MAE: 34.90, RMSE: 42.21
