In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# We'll create synthetic data to train our regression model.
# The model will predict 'recommended_total_study_minutes' based on the inputs.

def generate_synthetic_data(num_samples=1000, random_state=None):
    """
    Generates synthetic data for training the study schedule regression model.

    The four input features are drawn uniformly at random. The target,
    'recommended_total_study_minutes', is built from a noisy per-topic and
    per-subject workload, clamped between a floor of 10 minutes per topic and
    a ceiling of 90% of the available study time, then perturbed with
    additional noise, rounded to whole minutes and floored at zero.

    Args:
        num_samples (int): The number of data samples to generate.
        random_state (int, optional): Seed for a dedicated
            ``np.random.RandomState``. When None (the default) the global
            ``np.random`` generator is used, so existing callers and any
            prior ``np.random.seed`` call behave exactly as before.

    Returns:
        pd.DataFrame: A DataFrame containing features and the target variable
            (columns: num_subjects, hours_per_day, num_topics, num_days,
            recommended_total_study_minutes).
    """
    # Both the np.random module and a RandomState instance expose
    # randint / uniform / normal, so the rest of the code is source-agnostic.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Upper bounds passed to randint are exclusive: e.g. num_subjects is in 1..5.
    df = pd.DataFrame({
        'num_subjects': rng.randint(1, 6, num_samples),
        'hours_per_day': rng.randint(1, 9, num_samples),
        'num_topics': rng.randint(5, 51, num_samples),
        'num_days': rng.randint(1, 61, num_samples),
    })

    # Total minutes available over the whole study period.
    max_possible_minutes = df['hours_per_day'] * df['num_days'] * 60

    # Noisy workload estimate: 25-45 minutes per topic, 50-100 per subject.
    base_target_minutes = (
        df['num_topics'] * rng.uniform(25, 45, num_samples)
        + df['num_subjects'] * rng.uniform(50, 100, num_samples)
        + rng.normal(0, 100, num_samples)
    )

    # Vectorized equivalent of min(cap, max(floor, base)) for every row.
    floor_minutes = df['num_topics'] * 10
    cap_minutes = max_possible_minutes * 0.9
    clamped_minutes = np.minimum(cap_minutes, np.maximum(floor_minutes, base_target_minutes))

    # NOTE: this noise is added *after* clamping, so the final target may
    # exceed the 90% cap (or fall below the per-topic floor) by a small amount.
    noisy_minutes = clamped_minutes + rng.normal(0, 50, num_samples)

    # Whole minutes, never negative.
    df['recommended_total_study_minutes'] = noisy_minutes.round().astype(int).clip(lower=0)

    return df[['num_subjects', 'hours_per_day', 'num_topics', 'num_days', 'recommended_total_study_minutes']]

In [8]:
# Preview a default-sized sample (num_samples defaults to 1000). The bare call is
# the cell's last expression, so the returned DataFrame is rendered as the output.
generate_synthetic_data()

Unnamed: 0,num_subjects,hours_per_day,num_topics,num_days,recommended_total_study_minutes
0,1,6,9,16,394
1,3,4,12,53,565
2,3,5,22,14,1096
3,3,2,11,41,543
4,2,8,9,32,463
...,...,...,...,...,...
995,1,7,43,10,1296
996,5,1,47,31,1635
997,5,1,39,21,1227
998,3,1,24,43,950


In [9]:

# Generate the dataset
# Seed NumPy's global generator first: generate_synthetic_data draws from np.random,
# so without a seed the dataset (and every downstream metric) changes on each run.
# 42 matches the random_state used for the train/test split and the model.
np.random.seed(42)
print("Generating synthetic data...")
synthetic_df = generate_synthetic_data(num_samples=2000)

# Summary statistics as a quick sanity check on the generated value ranges.
synthetic_df.describe()

Generating synthetic data...


Unnamed: 0,num_subjects,hours_per_day,num_topics,num_days,recommended_total_study_minutes
count,2000.0,2000.0,2000.0,2000.0,2000.0
mean,2.9905,4.584,27.2975,31.361,1109.3585
std,1.412235,2.271018,13.19447,17.077016,524.17137
min,1.0,1.0,5.0,1.0,27.0
25%,2.0,3.0,16.0,17.0,703.0
50%,3.0,5.0,27.0,32.0,1076.0
75%,4.0,7.0,38.0,46.0,1495.0
max,5.0,8.0,50.0,60.0,2601.0


In [10]:

# --- 2. Model Training ---

# Define features (X) and target (y)
X = synthetic_df[['num_subjects', 'hours_per_day', 'num_topics', 'num_days']]
y = synthetic_df['recommended_total_study_minutes']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Fit a 100-tree random forest; n_jobs=-1 asks scikit-learn to use all available CPU cores.
print("\nTraining RandomForestRegressor model...")
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("Model training complete.")

# Score the model on the held-out test rows only.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nModel Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.2f} minutes")
print(f"R-squared (R2) Score: {r2:.2f}")


# Persist the fitted model to disk. model_filename is also read by predict_study_time
# when no pre-loaded model is passed to it.
model_filename = 'study_schedule_model.joblib'
joblib.dump(model, model_filename)
print(f"\nModel saved to {model_filename}")



Training data shape: (1600, 4)
Testing data shape: (400, 4)

Training RandomForestRegressor model...
Model training complete.

Model Evaluation:
Mean Absolute Error (MAE): 170.88 minutes
R-squared (R2) Score: 0.82

Model saved to study_schedule_model.joblib


In [12]:

def predict_study_time(num_subjects, hours_per_day, num_topics, num_days, loaded_model=None):
    """
    Predicts the recommended total study time and the average time per topic
    using the trained model.

    Args:
        num_subjects (int): Number of subjects.
        hours_per_day (int): Hours available for study per day.
        num_topics (int): Total number of topics to cover.
        num_days (int): Number of days to cover the topics.
        loaded_model (sklearn.ensemble.RandomForestRegressor, optional): Pre-loaded model.
                                                                        If None, attempts to load from the
                                                                        module-level ``model_filename``.

    Returns:
        tuple: (total_study_time, average_time_per_topic), each formatted with
               ``str(datetime.timedelta)`` -- e.g. ("18:05:00", "0:43:24").
               Durations of 24 hours or more include a leading day count
               (e.g. "1 day, 2:00:00").
               Returns (None, None) if the model file cannot be found.
    """
    # Imported locally so the function works regardless of notebook cell
    # execution order; the name was otherwise only bound by a later cell,
    # which raised NameError on a fresh kernel.
    from datetime import timedelta

    if loaded_model is None:
        try:
            model = joblib.load(model_filename)
            print(f"Model loaded from {model_filename}")
        except FileNotFoundError:
            print(f"Error: Model file '{model_filename}' not found. Please train and save the model first.")
            return None, None
    else:
        model = loaded_model

    # Column names and order must match the features the model was trained on.
    input_data = pd.DataFrame([[num_subjects, hours_per_day, num_topics, num_days]],
                              columns=['num_subjects', 'hours_per_day', 'num_topics', 'num_days'])
    prediction = model.predict(input_data)[0]

    # Whole, non-negative minutes.
    predicted_minutes = int(max(0, round(prediction)))

    # Guard against division by zero when there are no topics.
    average_time_per_topic_minutes = 0
    if num_topics > 0:
        average_time_per_topic_minutes = predicted_minutes / num_topics

    return str(timedelta(minutes=predicted_minutes)), str(timedelta(minutes=average_time_per_topic_minutes))


print("\n--- Example Prediction ---")

# Load the persisted model once and pass it in, so predict_study_time does not
# have to read it from disk itself.
loaded_model = joblib.load(model_filename)

# Example scenario: 3 subjects, 4 study hours per day, 25 topics, 30 days.
example_subjects = 3
example_hours_day = 4
example_topics = 25
example_days = 30

# NOTE: predict_study_time returns str(timedelta(...)) values, so despite its name
# predicted_minutes holds a formatted duration string rather than a number.
predicted_minutes, time_per_topic = predict_study_time(
    example_subjects,
    example_hours_day,
    example_topics,
    example_days,
    loaded_model=loaded_model
)



--- Example Prediction ---


In [13]:

# Repeats the example call from the previous cell with the same arguments,
# rebinding predicted_minutes and time_per_topic.
predicted_minutes, time_per_topic = predict_study_time(
    example_subjects,
    example_hours_day,
    example_topics,
    example_days,
    loaded_model=loaded_model
)


In [14]:
# Show the two values returned by predict_study_time for the example inputs.
# NOTE(review): the recorded output below shows raw numbers, which does not match the
# timedelta strings the current predict_study_time returns -- looks stale; re-run to confirm.
print(predicted_minutes,time_per_topic)

1085 43.4


In [15]:
from datetime import timedelta

18:05:00
