In [17]:
import numpy as np
import pandas as pd

np.random.seed(42)
n_days = 20

In [18]:
# Function to generate sleep and work hours with a constraint
def generate_hours(n, max_total_hours=20):
    sleep_hours = np.random.normal(loc=7, scale=1.5, size=n).clip(min=0, max=12)
    # Adjust working hours so the total does not exceed max_total_hours
    working_hours = np.random.normal(loc=8, scale=2, size=n).clip(min=0)
    total_hours = sleep_hours + working_hours
    # Scale down the working hours if the total exceeds max_total_hours
    scale_factor = np.where(total_hours > max_total_hours, max_total_hours / total_hours, 1)
    working_hours *= scale_factor
    return sleep_hours, working_hours.clip(max=16)

In [19]:
# Generate sleep and work hours
sleep_hours, working_hours = generate_hours(n_days)

# Generate other data
data = {
    'Sleep Hours': sleep_hours,
    'Rank of Food Quality': np.random.randint(1, 6, size=n_days),  # Random ranks between 1 and 5
    'Working Hours': working_hours,
    'Rank of Health Condition': np.random.randint(1, 6, size=n_days),
    'Rank of Mood': np.random.randint(1, 6, size=n_days),
    'Rank of Productivity': np.random.randint(1, 6, size=n_days)  # Random initial ranking
}

In [20]:
# Create DataFrame
df = pd.DataFrame(data)

df['Rank of Productivity'] = (0.15 * df['Sleep Hours'] +
                              0.15 * df['Rank of Food Quality'] +
                              0.3 * df['Working Hours'] +
                              0.15 * df['Rank of Health Condition'] +
                              0.25 * df['Rank of Mood']).astype(int).clip(lower=1, upper=5)

df

Unnamed: 0,Sleep Hours,Rank of Food Quality,Working Hours,Rank of Health Condition,Rank of Mood,Rank of Productivity
0,7.745071,2,10.931298,5,2,5
1,6.792604,2,7.548447,1,1,3
2,7.971533,4,8.135056,1,5,5
3,9.284545,2,5.150504,1,3,4
4,6.64877,2,6.911235,1,4,4
5,6.648795,4,8.221845,4,3,5
6,9.368819,4,5.698013,3,3,4
7,8.151152,1,8.751396,3,1,4
8,6.295788,5,6.798723,1,3,4
9,7.81384,5,7.416613,3,5,5


In [22]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Assuming 'df' is your DataFrame
X = df[['Sleep Hours', 'Rank of Food Quality', 'Working Hours', 'Rank of Health Condition', 'Rank of Mood']]
y = df['Rank of Productivity']

# StandardScaler for continuous variables
scaler = StandardScaler()
X[['Sleep Hours', 'Working Hours']] = scaler.fit_transform(X[['Sleep Hours', 'Working Hours']])

# MinMaxScaler for ordinal variables
min_max_scaler = MinMaxScaler()
X[['Rank of Food Quality', 'Rank of Health Condition', 'Rank of Mood']] = min_max_scaler.fit_transform(X[['Rank of Food Quality', 'Rank of Health Condition', 'Rank of Mood']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['Sleep Hours', 'Working Hours']] = scaler.fit_transform(X[['Sleep Hours', 'Working Hours']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['Rank of Food Quality', 'Rank of Health Condition', 'Rank of Mood']] = min_max_scaler.fit_transform(X[['Rank of Food Quality', 'Rank of Health Condition', 'Rank of Mood']])


Unnamed: 0,Sleep Hours,Rank of Food Quality,Working Hours,Rank of Health Condition,Rank of Mood
0,0.713902,0.25,1.835266,1.0,0.25
1,0.035304,0.25,0.042605,0.0,0.0
2,0.875248,0.75,0.353464,0.0,1.0
3,1.810722,0.25,-1.228129,0.0,0.5
4,-0.067173,0.25,-0.295071,0.0,0.75
5,-0.067155,0.75,0.399456,0.75,0.5
6,1.870764,0.75,-0.937989,0.5,0.5
7,1.00322,0.0,0.680079,0.5,0.0
8,-0.318659,1.0,-0.354694,0.0,0.5
9,0.762898,1.0,-0.027258,0.5,1.0


Sleep Hours and Working Hours: These are continuous variables and could be standardized to normalize their distribution.
Rank of Food Quality, Rank of Health Condition, Rank of Mood: These ordinal variables can be scaled using Min-Max scaling if you want to keep the interpretability straightforward (i.e., keeping them in a 0-1 range) or standardize if you wish to emphasize the relationship in a more normalized scale.
Rank of Productivity: Since this is your target variable, you generally do not need to scale it for linear regression models.

In [25]:
y

0     5
1     3
2     5
3     4
4     4
5     5
6     4
7     4
8     4
9     5
10    4
11    5
12    5
13    3
14    5
15    3
16    4
17    4
18    3
19    5
Name: Rank of Productivity, dtype: int64

In [12]:
df.to_csv('my_routine_data.csv', index=False)

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [30]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=44)  

In [31]:
# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

In [32]:
# Predict the productivity rankings on the test set
y_pred = model.predict(X_test)

# Calculate and print the Mean Squared Error and the R-squared value
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared value: {r2}")

Mean Squared Error: 0.15319602650114442
R-squared value: 0.8276544701862125


In [33]:
#print classification report 
from sklearn.metrics import classification_report
y_pred = model.predict(X)
y_pred = np.round(y_pred)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           3       1.00      1.00      1.00         4
           4       0.89      1.00      0.94         8
           5       1.00      0.88      0.93         8

    accuracy                           0.95        20
   macro avg       0.96      0.96      0.96        20
weighted avg       0.96      0.95      0.95        20

