In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/Kaggl')

Mounted at /content/drive


In [3]:
import pandas as pd
import os
from datasets import load_dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

In [13]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
import torch



In [10]:
np.random.seed(42)

#first look at the data
data = pd.read_csv('StudentPerformanceFactors.csv')
data.head()

missing_before = data.isnull().sum()
missing_before = missing_before[missing_before > 0]

#imputation with mode
data['Teacher_Quality'] = data['Teacher_Quality'].fillna('Medium')
data['Parental_Education_Level'] = data['Parental_Education_Level'].fillna('High School')
data['Distance_from_Home'] = data['Distance_from_Home'].fillna('Near')



def encode_features(df):

    df_encoded = df.copy()

    #features with 'low, medium, high' scale
    low_medium_high_cols = [
        'Parental_Involvement', 'Access_to_Resources',
        'Motivation_Level', 'Family_Income', 'Teacher_Quality'
    ]

    for col in low_medium_high_cols:
        if col in df_encoded.columns:
            df_encoded[col] = df_encoded[col].map({'Low': 0, 'Medium': 1, 'High': 2})

    #other ordinal features
    other_ordinal = {
        'Parental_Education_Level': {'High School': 0, 'College': 1, 'Postgraduate': 2},
        'Distance_from_Home': {'Near': 0, 'Moderate': 1, 'Far': 2},
        'Peer_Influence': {'Negative': 0, 'Neutral': 1, 'Positive': 2}
    }

    df_encoded.replace(other_ordinal, inplace=True)

    binary_cols = ['Extracurricular_Activities', 'Internet_Access',
                   'Learning_Disabilities', 'School_Type', 'Gender']

    binary_map = {
        'No': 0, 'Yes': 1,
        'Public': 0, 'Private': 1,
        'Male': 0, 'Female': 1
    }

    for col in binary_cols:
        if col in df_encoded.columns:
            df_encoded[col] = df_encoded[col].map(binary_map)

    return df_encoded

data_encoded = encode_features(data)


#using all features except the target variable
X = data_encoded.drop('Exam_Score', axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
data_encoded['Cluster'] = clusters

#creating SD-based groups for further comparison
mean = data_encoded['Exam_Score'].mean()
std = data_encoded['Exam_Score'].std()
data_encoded['SD_Group'] = pd.cut(
    data_encoded['Exam_Score'],
    bins=[0, mean-std, mean+std, 100],
    labels=['At-Risk', 'Average', 'High-Performing']
)


#comparing clusters to SD groups
for i in range(3):
    cluster_data = data_encoded[data_encoded['Cluster'] == i]


  df_encoded.replace(other_ordinal, inplace=True)


In [14]:
import random

feature_cols = [
    col for col in data_encoded.columns
]


# Randomized rule-based feedback
def rule_feedback(row):
    f = []


    # Hours studied
    if row["Hours_Studied"] < 15:
        options = [
            "Try spending a bit more time studying each week to strengthen your learning.",
            "Increasing your study time slightly could help deepen your understanding.",
            "Set aside a little extra study time to reinforce what you're learning."
        ]
    else:
        options = [
            "You're doing well with your study time, so keep maintaining that consistency.",
            "Your study habits are strong, keep up the steady effort.",
            "Great job staying consistent with your study time, it really supports your progress."
        ]
    f.append(random.choice(options))

    # Attendance
    if row["Attendance"] < 75:
        options = [
            "Attending classes more regularly will help you follow lessons more effectively.",
            "Try to improve attendance, as being present in class boosts your understanding.",
            "Increasing your class attendance could greatly support your academic progress."
        ]
    elif row["Attendance"] < 90:
        options = [
            "You are attending fairly well, but being even more consistent could benefit you.",
            "Your attendance is decent, aiming for even more consistency will help.",
            "You're doing okay with attendance, improving it slightly could make a difference."
        ]
    else:
        options = [
            "Your attendance habits are strong and support your progress.",
            "Great job maintaining excellent attendance, this really helps your learning.",
            "Your consistent attendance is a strong foundation for academic success."
        ]
    f.append(random.choice(options))

    # Motivation
    if row["Motivation_Level"] == "Low":
        options = [
            "Setting simple weekly goals might help you stay motivated and focused.",
            "Try breaking tasks into smaller steps to boost your motivation.",
            "Finding small wins each week can help increase your motivation."
        ]
        f.append(random.choice(options))

    # Sleep
    if row["Sleep_Hours"] < 7:
        options = [
            "Try getting more rest, as being well rested improves concentration and memory.",
            "Improving your sleep habits could help you stay focused and energized.",
            "Aim for more sleep each night, rest is essential for academic performance."
        ]
        f.append(random.choice(options))

    # Score-based advice
    score = row["Exam_Score"]

    if score < 60:
        options = [
            "Reviewing key topics and practicing regularly can help improve your results.",
            "Focus on strengthening the basics and revisiting challenging concepts.",
            "Try setting a structured study plan to help lift your performance."
        ]
    elif score < 80:
        options = [
            "You are doing fairly well, and with steady effort you can reach higher scores.",
            "You're on the right track, continuing consistent study can raise your score further.",
            "Good progress so far, building on your current habits will help you improve."
        ]
    else:
        options = [
            "Your performance is strong, keep reinforcing the habits that work for you.",
            "Great job! Your strong results reflect your effort, keep it up.",
            "You're performing very well, maintaining your current habits will support continued success."
        ]
    f.append(random.choice(options))

    # SD Group
    sd = row["SD_Group"]
    if sd == "At-Risk":
        options = [
            "Your current performance indicates some challenges. Try focusing on the most difficult topics first.",
            "It may help to set smaller, manageable goals to gradually improve your results.",
            "You appear to be at a higher risk academically, so taking steady action now can make a big difference.",
            "Consider seeking additional help or using study resources to strengthen areas of difficulty.",
            "Improving consistency in study habits and attendance could help you move out of the at-risk range."
        ]

    elif sd == "Average":
        options = [
            "Your performance is around the average range, continuing steady effort will help you improve.",
            "You are performing at a typical level,small improvements in study habits can move you even higher.",
            "Maintaining consistency is key, you're in a solid position but can still grow with focused effort.",
            "You're doing reasonably well, and building on your strengths can help you stand out.",
            "A bit more structure in your study routine could help you rise above the average range."
        ]

    elif sd == "High-Performing":
        options = [
            "You're performing at a high level,great job maintaining strong academic habits!",
            "Your results suggest excellent focus and consistency; keep challenging yourself.",
            "You are among the high-performing students, so continue building on your strengths.",
            "Outstanding work,maintaining your current dedication will help you keep performing well.",
            "Your strong performance reflects your effort, staying consistent will help you sustain this level."
        ]

    f.append(random.choice(options))

    # Combine into one feedback message
    return " ".join(f)



# Build training dataset
lines = []

for index, row in data_encoded.iterrows():
    # Build profile string
    profile = ", ".join(f"{col}={row[col]}" for col in feature_cols)

    # Generate randomized feedback
    feedback = rule_feedback(row)

    # Final training line
    line = f"Profile: {profile} ||| Feedback: {feedback} <END>"
    lines.append(line)


# Write to file
with open("gpt_training_data_kaggle.txt", "w") as f:
    for line in lines:
        f.write(line + "\n")

print("Training data created:", len(lines), "examples.")


Training data created: 6607 examples.


In [15]:
model_path = "gpt2"   # base model from HuggingFace
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [16]:
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [17]:
train_file_path = "gpt_training_data_kaggle.txt"
dataset = load_dataset("text", data_files={"train": train_file_path})



# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )

# Tokenize dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# Add labels
tokenized_dataset = tokenized_dataset.map(
    lambda x: {"labels": x["input_ids"]},
    batched=True
)

lm_dataset = tokenized_dataset

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


save_path = "gpt2_finetuned_kaggle"
os.makedirs(save_path, exist_ok=True)

training_args = TrainingArguments(
    output_dir=save_path,
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_steps=1000,
    logging_steps=500,
    learning_rate=5e-5,
    report_to="none",
    save_total_limit=1
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_dataset["train"],
)

# Train and save model
trainer.train()

model.save_pretrained(save_path, safe_serialization=True)
tokenizer.save_pretrained(save_path)

print("Training complete!")


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/6607 [00:00<?, ? examples/s]

Map:   0%|          | 0/6607 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,0.312
1000,0.1838
1500,0.1715
2000,0.1666
2500,0.165
3000,0.1623
3500,0.1611
4000,0.1587
4500,0.1574
5000,0.1568


Training complete!


In [18]:
def generate_feedback_kaggle(profile_text):
    prompt = f"Profile: {profile_text} ||| Feedback:"
    device = next(model.parameters()).device

    # Tokenizes prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generates
    output_ids = model.generate(
        **inputs,
        max_new_tokens=80,
        do_sample=False,
        eos_token_id=tokenizer.convert_tokens_to_ids("<END>"),
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2,
    )

    # Decode generated tokens
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    if "||| Feedback:" in text:
        text = text.split("||| Feedback:")[1]

    if "<END>" in text:
        text = text.split("<END>")[0]

    return text.strip()

In [19]:
high_profile = (
    "Hours_Studied=25, Attendance=95, Sleep_Hours=8, "
    "Motivation_Level=High, Previous_Scores=90, SD_Group=High-Performing"
)
low_profile = (
    "Hours_Studied=5, Attendance=60, Sleep_Hours=5, "
    "Motivation_Level=Low, Previous_Scores=50, SD_Group=At-Risk"
)
print(generate_feedback_kaggle(high_profile))
print()
print(generate_feedback_kaggle(low_profile))


Your study habits are strong, keep up the steady effort. Great job maintaining excellent attendance, this really helps your learning. You're on the right track, continuing consistent study can raise your score further. Outstanding work,maintaining your current dedication will help you keep performing well.

Set aside a little extra study time to reinforce what you're learning. Try to improve attendance, as being present in class boosts your understanding. Improving your sleep habits could help you stay focused and energized. You are doing fairly well, and with steady effort you can reach higher scores. It may help to set smaller, manageable goals to gradually improve your results.
