In [15]:
import random
import pandas as pd

def generate_student_data(num_rows):
  data = []
  student_ids = range(1, 101)  # Assuming up to 100 students
  session_ids = range(1, 501)   # Assuming up to 500 sessions
  courses = ["Introduction to Software Engineering", "Data Structures", "Algorithms", "Database Systems"]
  emotional_states = ["Happy", "Neutral", "Sad", "Frustrated", "Anxious", "Excited", "Tired"]

  for _ in range(num_rows):
    row = {
        'Student_ID': random.choice(student_ids),
        'Session_ID': random.choice(session_ids),
        'Emotional_State': random.choice(emotional_states),
        'Focus_Level': random.randint(50, 100),
        'Quiz_1_Marks': random.choice([0, 20, 40, 60, 80, 100]) ,
        'Quiz_2_Marks':random.choice([0, 20, 40, 60, 80, 100]) ,
        'Quiz_3_Marks': random.choice([0, 20, 40, 60, 80, 100]) ,
        'Quiz_4_Marks': random.choice([0, 20, 40, 60, 80, 100]) ,
        'Quiz_5_Marks': random.choice([0, 20, 40, 60, 80, 100]) ,
        'Course_Name': random.choice(courses),
        'Time_Spent_Studying': f"{random.randint(1, 5)} hours" 
    }

    # Calculate Cumulative Average and Recent Quiz Trend
    quiz_scores = [row[f'Quiz_{i}_Marks'] for i in range(1, 6)]
    row['Cumulative_Average_Quiz_Score'] = round(sum(quiz_scores) / len(quiz_scores), 1)

    if quiz_scores[-1] > quiz_scores[-2]:
        row['Recent_Quiz_Trend'] = 'Improving'
    elif quiz_scores[-1] < quiz_scores[-2]:
        row['Recent_Quiz_Trend'] = 'Declining'
    else:
        row['Recent_Quiz_Trend'] = 'Stable'

    # Predict Focus Level and Exam Score (simple estimations for now)
    row['Predicted_Focus_Level'] =  max(50, row['Focus_Level'] + random.randint(-10, 10)) 
    row['Exam_Score'] =  max(20, int(row['Cumulative_Average_Quiz_Score'] + random.randint(-10, 10))) 

    data.append(row)

  return pd.DataFrame(data)

# Generate and display 10 rows of new data
new_data = generate_student_data(5000)
# print(new_data)

In [16]:
new_data.to_csv('student_learning_data_1.csv', index=False)

In [17]:
new_data

Unnamed: 0,Student_ID,Session_ID,Emotional_State,Focus_Level,Quiz_1_Marks,Quiz_2_Marks,Quiz_3_Marks,Quiz_4_Marks,Quiz_5_Marks,Course_Name,Time_Spent_Studying,Cumulative_Average_Quiz_Score,Recent_Quiz_Trend,Predicted_Focus_Level,Exam_Score
0,7,204,Excited,72,100,40,60,60,80,Data Structures,1 hours,68.0,Improving,68,72
1,48,481,Happy,82,0,20,0,80,40,Algorithms,2 hours,28.0,Declining,91,37
2,33,43,Tired,100,100,40,40,20,100,Database Systems,4 hours,60.0,Improving,105,56
3,67,191,Neutral,82,100,80,60,0,80,Algorithms,5 hours,64.0,Improving,85,57
4,19,443,Neutral,87,60,20,100,0,60,Database Systems,2 hours,48.0,Improving,77,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,9,448,Tired,50,0,80,100,20,0,Database Systems,1 hours,40.0,Declining,52,39
4996,49,160,Sad,64,60,40,80,20,40,Algorithms,2 hours,48.0,Improving,63,45
4997,22,382,Anxious,81,40,20,40,20,60,Algorithms,2 hours,36.0,Improving,77,40
4998,32,257,Sad,52,40,0,100,20,20,Introduction to Software Engineering,4 hours,36.0,Stable,50,30


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
import xgboost as xgb


# Load the dataset
df = pd.read_csv('student_learning_data.csv')

# Selecting features and target variable
X = df[['Emotional_State', 'Time_Spent_Studying', 'Focus_Level', 'Cumulative_Average_Quiz_Score']]
y = df['Exam_Score']

# Preprocessing for categorical data
# Convert 'Time_Spent_Studying' to numeric (e.g., "5 hours" -> 5)
X['Time_Spent_Studying'] = X['Time_Spent_Studying'].str.extract('(\d+)').astype(int)

# One-Hot Encoding for categorical features
categorical_features = ['Emotional_State']
numeric_features = ['Time_Spent_Studying', 'Focus_Level', 'Cumulative_Average_Quiz_Score']

# Define Column Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor',  LinearRegression())
])


# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")




Mean Absolute Error: 4.893451134317678
Root Mean Squared Error: 5.7883115916995695


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Time_Spent_Studying'] = X['Time_Spent_Studying'].str.extract('(\d+)').astype(int)


In [3]:
# Predicting Exam_Score for new data
new_data = pd.DataFrame({
    'Emotional_State': ['Happy', 'Sad'],
    'Time_Spent_Studying': ['1 hours', '4 hours'],
    'Focus_Level': [60, 75],
    'Cumulative_Average_Quiz_Score': [20, 88]
})

# Convert 'Time_Spent_Studying' to numeric
new_data['Time_Spent_Studying'] = new_data['Time_Spent_Studying'].str.extract('(\d+)').astype(int)

predicted_scores = model.predict(new_data)
print(f"Predicted Exam Scores: {predicted_scores}")

Predicted Exam Scores: [17.46114357 88.36982068]


In [4]:
new_data = pd.DataFrame({
    'Emotional_State': ['Happy'],
    'Time_Spent_Studying': ['1 hours'],
    'Focus_Level': [60],
    'Cumulative_Average_Quiz_Score': [20]
})

# Convert 'Time_Spent_Studying' to numeric
new_data['Time_Spent_Studying'] = new_data['Time_Spent_Studying'].str.extract('(\d+)').astype(int)

predicted_scores = model.predict(new_data)
print(f"Predicted Exam Scores: {predicted_scores}")

Predicted Exam Scores: [17.46114357]


In [4]:
import joblib

# Save the model to a file
joblib.dump(model, 'exam_score_predictor.pkl')


['exam_score_predictor.pkl']

In [5]:
results_df = pd.DataFrame({'Actual Score': y_test, 'Predicted Score': y_pred})
print(results_df)

     Actual Score  Predicted Score
521            84        75.294762
737            81        76.211414
740            58        67.472363
660            77        77.680278
411            80        74.687801
..            ...              ...
408            58        68.186555
332            73        71.354872
208            80        75.174017
613            66        71.482353
78             89        82.077341

[200 rows x 2 columns]


In [10]:
df

Unnamed: 0,Student_ID,Session_ID,Emotional_State,Focus_Level,Quiz_1_Marks,Quiz_2_Marks,Quiz_3_Marks,Quiz_4_Marks,Quiz_5_Marks,Course_Name,Time_Spent_Studying,Cumulative_Average_Quiz_Score,Recent_Quiz_Trend,Predicted_Focus_Level,Exam_Score
0,77,371,Neutral,100,98,68,53,51,87,Introduction to Software Engineering,3 hours,71.4,Improving,109,79
1,59,448,Sad,54,91,73,93,60,92,Data Structures,2 hours,81.8,Improving,53,80
2,16,44,Sad,83,90,61,62,57,76,Database Systems,1 hours,69.2,Improving,79,71
3,43,121,Sad,76,76,70,98,52,81,Database Systems,2 hours,75.4,Improving,85,83
4,1,410,Sad,63,85,93,99,69,70,Algorithms,5 hours,83.2,Improving,58,84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,70,458,Anxious,54,87,73,82,58,77,Introduction to Software Engineering,1 hours,75.4,Improving,64,77
996,85,167,Frustrated,98,53,99,65,63,98,Algorithms,2 hours,75.6,Improving,104,77
997,24,303,Neutral,87,100,69,62,54,77,Data Structures,2 hours,72.4,Improving,91,62
998,16,80,Anxious,55,83,68,84,50,56,Data Structures,5 hours,68.2,Improving,50,67


In [11]:
df.columns

Index(['Student_ID', 'Session_ID', 'Emotional_State', 'Focus_Level',
       'Quiz_1_Marks', 'Quiz_2_Marks', 'Quiz_3_Marks', 'Quiz_4_Marks',
       'Quiz_5_Marks', 'Course_Name', 'Time_Spent_Studying',
       'Cumulative_Average_Quiz_Score', 'Recent_Quiz_Trend',
       'Predicted_Focus_Level', 'Exam_Score', 'new_exam_score',
       'task_to_take'],
      dtype='object')

In [6]:
# Add 'new_exam_score' column with random values added to 'Exam_Score'
def generate_new_exam_score(score):
    max_addition = min(98 - score, 10)  # Ensure the new score is less than 98
    if max_addition < 5:
        return score  # If max addition is less than 5, return the original score
    return score + np.random.randint(5, max_addition + 1)

df['new_exam_score'] = df['Exam_Score'].apply(generate_new_exam_score)

# Add 'task_to_take' column with task groups assigned based on some logic
# Example: Assign task groups randomly
task_groups = ['Task Group 1', 'Task Group 2', 'Task Group 3', 'Task Group 4', 'Task Group 5']
df['task_to_take'] = np.random.choice(task_groups, size=len(df))

In [10]:
df.to_csv('student_learning_data_all.csv', index=False)

In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [9]:
def recommend_task_collaborative(student_id):
    # Simulated task interaction matrix
    tasks = {
        'Student_ID': df['Student_ID'],
        'Task_Group_1': np.random.randint(0, 2, size=1000),
        'Task_Group_2': np.random.randint(0, 2, size=1000),
        'Task_Group_3': np.random.randint(0, 2, size=1000),
        'Task_Group_4': np.random.randint(0, 2, size=1000),
        'Task_Group_5': np.random.randint(0, 2, size=1000)
    }
    task_df = pd.DataFrame(tasks)
    
    # Extract the task preferences for the student
    student_tasks = task_df[task_df['Student_ID'] == student_id].drop('Student_ID', axis=1)
    
    if student_tasks.empty:
        return "Student not found."

    task_features = task_df.drop('Student_ID', axis=1)
    task_similarity = cosine_similarity(task_features, task_features)
    collab_recommendations = task_similarity.dot(task_features)
    collab_recommendations_df = pd.DataFrame(collab_recommendations, columns=task_features.columns)
    
    # Recommend the top task group for the student
    recommended_task = collab_recommendations_df.loc[0].idxmax().replace('Task_Group_', 'Task Group ')
    
    return recommended_task

# Example usage
print(recommend_task_collaborative(43))


Task Group 2


In [12]:
df

Unnamed: 0,Student_ID,Session_ID,Emotional_State,Focus_Level,Quiz_1_Marks,Quiz_2_Marks,Quiz_3_Marks,Quiz_4_Marks,Quiz_5_Marks,Course_Name,Time_Spent_Studying,Cumulative_Average_Quiz_Score,Recent_Quiz_Trend,Predicted_Focus_Level,Exam_Score,new_exam_score,task_to_take
0,77,371,Neutral,100,98,68,53,51,87,Introduction to Software Engineering,3 hours,71.4,Improving,109,79,85,Task Group 2
1,59,448,Sad,54,91,73,93,60,92,Data Structures,2 hours,81.8,Improving,53,80,89,Task Group 3
2,16,44,Sad,83,90,61,62,57,76,Database Systems,1 hours,69.2,Improving,79,71,81,Task Group 1
3,43,121,Sad,76,76,70,98,52,81,Database Systems,2 hours,75.4,Improving,85,83,91,Task Group 2
4,1,410,Sad,63,85,93,99,69,70,Algorithms,5 hours,83.2,Improving,58,84,92,Task Group 5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,70,458,Anxious,54,87,73,82,58,77,Introduction to Software Engineering,1 hours,75.4,Improving,64,77,82,Task Group 2
996,85,167,Frustrated,98,53,99,65,63,98,Algorithms,2 hours,75.6,Improving,104,77,87,Task Group 3
997,24,303,Neutral,87,100,69,62,54,77,Data Structures,2 hours,72.4,Improving,91,62,68,Task Group 4
998,16,80,Anxious,55,83,68,84,50,56,Data Structures,5 hours,68.2,Improving,50,67,72,Task Group 2
