In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

In [2]:
# We'll create synthetic data to train our regression model.
# The model will predict 'recommended_total_study_minutes' based on the inputs.

def generate_synthetic_data(num_samples=1000):
    """
    Generates synthetic data for training the study schedule regression model.

    Args:
        num_samples (int): The number of data samples to generate.

    Returns:
        pd.DataFrame: A DataFrame containing features and the target variable.
    """
    data = {
        'num_subjects': np.random.randint(1, 6, num_samples),
        'hours_per_day': np.random.randint(1, 9, num_samples),
        'num_topics': np.random.randint(5, 51, num_samples),
        'num_days': np.random.randint(1, 61, num_samples)
    }
    df = pd.DataFrame(data)


    df['max_possible_minutes'] = df['hours_per_day'] * df['num_days'] * 60
    

    df['base_target_minutes'] = (
        df['num_topics'] * np.random.uniform(25, 45, num_samples) +
        df['num_subjects'] * np.random.uniform(50, 100, num_samples) +
        np.random.normal(0, 100, num_samples)
    )
    df['recommended_total_study_minutes'] = df.apply(
        lambda row: min(
            row['max_possible_minutes'] * 0.9,
            max(
                row['num_topics'] * 10,
                row['base_target_minutes']
            )
        ), axis=1
    )

    df['recommended_total_study_minutes'] = df['recommended_total_study_minutes'] + np.random.normal(0, 50, num_samples)
    df['recommended_total_study_minutes'] = df['recommended_total_study_minutes'].round().astype(int)

    # Ensure recommended_total_study_minutes is not negative
    df['recommended_total_study_minutes'] = df['recommended_total_study_minutes'].apply(lambda x: max(0, x))

    return df[['num_subjects', 'hours_per_day', 'num_topics', 'num_days', 'recommended_total_study_minutes']]

In [3]:
generate_synthetic_data()

Unnamed: 0,num_subjects,hours_per_day,num_topics,num_days,recommended_total_study_minutes
0,2,7,17,45,652
1,5,6,30,60,1567
2,5,6,21,46,1162
3,1,4,9,40,195
4,2,3,43,60,1692
...,...,...,...,...,...
995,3,7,9,45,637
996,4,3,9,17,480
997,2,2,29,10,1103
998,2,5,22,17,960


In [8]:

# Generate the dataset
print("Generating synthetic data...")
df = generate_synthetic_data(num_samples=8000)

df.describe()

Generating synthetic data...


Unnamed: 0,num_subjects,hours_per_day,num_topics,num_days,recommended_total_study_minutes
count,8000.0,8000.0,8000.0,8000.0,8000.0
mean,3.00225,4.5035,27.661625,30.346875,1112.35975
std,1.42109,2.291919,13.208262,17.282307,526.952605
min,1.0,1.0,5.0,1.0,0.0
25%,2.0,2.0,16.0,15.0,691.0
50%,3.0,5.0,28.0,30.0,1074.0
75%,4.0,6.0,39.0,45.0,1495.0
max,5.0,8.0,50.0,60.0,2744.0


In [10]:
df.shape

(8000, 5)

In [15]:

# --- 2. Model Training ---

# Define features (X) and target (y)
X = df[['num_subjects', 'hours_per_day', 'num_topics', 'num_days']]
y = df['recommended_total_study_minutes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")



Training data shape: (6400, 4)
Testing data shape: (1600, 4)


In [16]:

print("\nTraining RandomForestRegressor model...")
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("Model training complete.")

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nModel Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.2f} minutes")
print(f"R-squared (R2) Score: {r2:.2f}")


Training RandomForestRegressor model...
Model training complete.

Model Evaluation:
Mean Absolute Error (MAE): 166.29 minutes
R-squared (R2) Score: 0.83


In [24]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
model=DecisionTreeRegressor()
model.fit(X_train,y_train)

In [25]:
r2_score(model.predict(X_test),y_test)

0.715832138881993