In [2]:
# ============================================================
# 📘 PEP Student Performance Dataset Generator (with Grades)
# ============================================================

import pandas as pd
import numpy as np
import random

# -------------------------------
# Step 1: Initialize
# -------------------------------
np.random.seed(42)
random.seed(42)
n = 300  # number of students

# -------------------------------
# Step 2: Create Base Data
# -------------------------------
# The keyword arguments are evaluated top to bottom, so the sequence of NumPy
# draws (Age, Gender, Height, Weight) is part of the seeded, reproducible
# output -- keep this order.
data = dict(
    Student_ID=np.arange(1, n + 1),
    Age=np.random.randint(low=18, high=24, size=n),         # high is exclusive: 18..23
    Gender=np.random.choice(["M", "F"], size=n),
    Height_cm=np.random.randint(low=160, high=186, size=n),  # 160..185
    Weight_kg=np.random.randint(low=55, high=85, size=n),    # 55..84
)
df = pd.DataFrame(data)

# Calculate BMI = weight (kg) / height (m) squared, rounded to one decimal
df["BMI"] = df["Weight_kg"].div(df["Height_cm"].div(100).pow(2)).round(1)

# -------------------------------
# Step 3: Activity Data
# -------------------------------
df["Run_3km_Min"] = np.round(np.random.uniform(12, 30, size=n), 1)
df["Pushups"] = np.random.randint(low=12, high=36, size=n)
df["Situps"] = np.random.randint(low=14, high=36, size=n)


def _mixture_draw(p_common, common_range, other_range):
    """Draw one value, rounded to 1 decimal, from a two-range uniform mixture.

    With probability ``p_common`` the value is drawn uniformly from
    ``common_range``; otherwise it is drawn uniformly from ``other_range``.
    Uses the stdlib ``random`` generator: one ``random.random()`` call to pick
    the range, then one ``random.uniform()`` call for the value.
    """
    low, high = common_range if random.random() < p_common else other_range
    return round(random.uniform(low, high), 1)


# Beep Test (65% between 6-8, rest 4-10)
df["Beep_Test"] = [_mixture_draw(0.65, (6, 8), (4, 10)) for _ in range(n)]

# Attendance (80-90% for 80% of students, else 60-79%)
df["Attendance_%"] = [_mixture_draw(0.8, (80, 90), (60, 79)) for _ in range(n)]

# -------------------------------
# Step 4: Performance Scores
# -------------------------------
# Speed: linear rescale of run time so 30 min -> 0 and 12 min -> 10.
df["Speed_Score"] = df["Run_3km_Min"].rsub(30).div(18).mul(10).round(2)
# Strength: combined push-up + sit-up reps as a fraction of 70, scaled to 10.
df["Strength_Score"] = df["Pushups"].add(df["Situps"]).div(70).mul(10).round(2)
# Overall: plain mean of the two scores and the beep-test level.
df["Overall_Score"] = (
    df["Speed_Score"]
    .add(df["Strength_Score"])
    .add(df["Beep_Test"])
    .div(3)
    .round(1)
)

# -------------------------------
# Step 5: Assign Grades (A–F)
# -------------------------------
def assign_grade(row):
    """Map one student record to a letter grade.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            ``"Attendance_%"`` and ``"Overall_Score"``.

    Returns:
        ``"F"`` when attendance is below 80, regardless of score. Otherwise
        the letter for the highest band the overall score reaches:
        ``"A"`` (>= 8.0), ``"B"`` (>= 7.0), ``"C"`` (>= 6.0), ``"D"`` (>= 5.0),
        and ``"F"`` for anything lower.
    """
    # Attendance is a hard gate and is checked before the score is even read.
    if row["Attendance_%"] < 80:
        return "F"

    score = row["Overall_Score"]
    # Bands are listed from the top down; the first cutoff reached wins.
    for cutoff, letter in ((8.0, "A"), (7.0, "B"), (6.0, "C"), (5.0, "D")):
        if score >= cutoff:
            return letter
    return "F"

# Grade every student row-wise with assign_grade.
df["Grade"] = df.apply(assign_grade, axis="columns")

# -------------------------------
# Step 6: Save to Excel
# -------------------------------
output_file = "PEP_Student_Performance.xlsx"
df.to_excel(excel_writer=output_file, index=False)  # index is not written to the sheet

# -------------------------------
# Step 7: Summary
# -------------------------------
# One print call, newline-separated: confirmation, file name, then the
# per-grade counts.
print(
    f"‚úÖ Dataset generated successfully with {df.shape[0]} rows and {df.shape[1]} columns.",
    f"üìÇ Saved as: {output_file}",
    "\nüéì Grade distribution:",
    df["Grade"].value_counts(),
    sep="\n",
)
# Rich preview of the first ten rows.
display(df.iloc[:10])


‚úÖ Dataset generated successfully with 300 rows and 15 columns.
üìÇ Saved as: PEP_Student_Performance.xlsx

üéì Grade distribution:
Grade
F    89
C    74
D    73
B    51
A    13
Name: count, dtype: int64


Unnamed: 0,Student_ID,Age,Gender,Height_cm,Weight_kg,BMI,Run_3km_Min,Pushups,Situps,Beep_Test,Attendance_%,Speed_Score,Strength_Score,Overall_Score,Grade
0,1,21,F,161,67,25.8,17.8,28,15,6.1,87.2,6.78,6.14,6.3,C
1,2,22,F,176,58,18.7,22.7,26,27,6.4,87.6,4.06,7.57,6.0,C
2,3,20,M,179,84,26.2,18.6,23,15,8.1,86.5,6.33,5.43,6.6,C
3,4,22,M,183,55,16.4,20.2,27,17,4.5,87.9,5.44,6.29,5.4,D
4,5,22,M,171,71,24.3,21.9,35,30,6.1,82.2,4.5,9.29,6.6,C
5,6,19,M,177,62,19.8,21.9,30,16,7.0,83.1,4.5,6.57,6.0,C
6,7,20,F,162,56,21.3,15.6,19,26,6.4,84.7,8.0,6.43,6.9,C
7,8,20,F,160,82,32.0,24.3,32,22,7.1,84.3,3.17,7.71,6.0,C
8,9,20,F,160,62,24.2,13.6,28,20,7.2,83.3,9.11,6.86,7.7,B
9,10,22,F,178,61,19.3,14.5,34,28,4.0,82.7,8.61,8.86,7.2,B


In [3]:
# ============================================================
# 📊 Visualization Code for Normalized PEP Dataset
# ============================================================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load your normalized dataset
# NOTE(review): the generator cell saves "PEP_Student_Performance.xlsx", but this
# cell reads a differently named file -- presumably produced by a separate
# normalization step. Confirm it exists before a Restart & Run All.
df = pd.read_excel("PEP_Student_Performance_Final.xlsx")   # adjust name if needed

sns.set(style="whitegrid", palette="viridis")

# Columns shown in the distribution, outlier, pairwise and correlation plots.
numeric_columns = [
    "Run_3km_Min", "Pushups", "Situps", "Beep_Test",
    "Attendance_%", "Speed_Score", "Strength_Score", "Overall_Score"
]

# Fixed, sorted category order for every grade-axis plot. Without an explicit
# order seaborn takes the grades in order of first appearance in the data, so
# the axis comes out shuffled and can differ between datasets.
grade_order = sorted(df["Grade"].dropna().unique())

# ------------------------------------------------------------
# 1. Distribution Plots (Histograms)
# ------------------------------------------------------------
df[numeric_columns].hist(figsize=(14, 12), bins=15, edgecolor='black')
plt.suptitle("üìä Distribution of Normalized Features", fontsize=16)
plt.show()

# ------------------------------------------------------------
# 2. Boxplots (Check Outliers)
# ------------------------------------------------------------
plt.figure(figsize=(14, 8))
sns.boxplot(data=df[numeric_columns])
plt.title("üì¶ Boxplot of Normalized Features (Outlier Detection)")
plt.xticks(rotation=45)
plt.show()

# ------------------------------------------------------------
# 3. Pairplot (Relationships Between Features)
# ------------------------------------------------------------
sns.pairplot(df[numeric_columns], diag_kind="kde")
plt.suptitle("üîó Pairplot of Performance Features", y=1.02)
plt.show()

# ------------------------------------------------------------
# 4. Correlation Heatmap
# ------------------------------------------------------------
plt.figure(figsize=(12, 8))
corr = df[numeric_columns].corr()
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("üî• Correlation Heatmap of Normalized Features")
plt.show()

# ------------------------------------------------------------
# 5. Grade Distribution
# ------------------------------------------------------------
plt.figure(figsize=(8, 5))
sns.countplot(x="Grade", data=df, palette="coolwarm", order=grade_order)
plt.title("üéì Grade Distribution")
plt.xlabel("Grade")
plt.ylabel("Count")
plt.show()

# ------------------------------------------------------------
# 6. Attendance vs Grade
# ------------------------------------------------------------
plt.figure(figsize=(8, 6))
# Both layers share the same order so the swarm points sit on their own box.
sns.boxplot(x="Grade", y="Attendance_%", data=df, order=grade_order)
sns.swarmplot(x="Grade", y="Attendance_%", data=df, color="black", alpha=0.6,
              order=grade_order)
plt.title("üìÖ Attendance vs Grade")
plt.show()

# ------------------------------------------------------------
# 7-8. Per-grade comparison of each performance metric
# ------------------------------------------------------------
# One row per figure: (y column, figure size, title, extra boxplot kwargs).
grade_boxplots = [
    ("Overall_Score", (12, 6), "üí™ Overall Score by Grade", {"palette": "viridis"}),
    ("Speed_Score", (12, 6), "üèÉ Speed Score by Grade", {}),
    ("Strength_Score", (12, 6), "üèã Strength Score by Grade", {}),
    ("Run_3km_Min", (10, 6), "‚è±Ô∏è 3 km Run Time vs Grade", {}),
]
for column, figsize, title, extra_kwargs in grade_boxplots:
    plt.figure(figsize=figsize)
    sns.boxplot(x="Grade", y=column, data=df, order=grade_order, **extra_kwargs)
    plt.title(title)
    plt.show()

print("‚úÖ All visualizations generated successfully!")



KeyboardInterrupt

