In [2]:
# Install required packages
!pip install pyspark
!pip install plotly

# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, min, max, count, stddev, when, sum as spark_sum, round
import random
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Start (or reuse) the Spark session for this notebook, giving the driver
# 4 GB of memory.
spark = (
    SparkSession.builder
    .appName("ResultManagementSystem")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Per-subject settings for the random mark generator, keyed by subject name:
#   "pass_rate" - probability that a generated mark is a passing one
#   "max"       - upper bound for a passing mark in that subject
subject_params = {
    subject_name: {"pass_rate": pass_rate, "max": top_mark}
    for subject_name, pass_rate, top_mark in (
        ("Electronics", 0.7, 90),
        ("Programming", 0.8, 95),
        ("Database", 0.6, 85),
        ("Data Science", 0.55, 80),
        ("Mathematics", 0.5, 75),
        ("DSA", 0.75, 92),
    )
}

def generate_student_profile(student_id=None, subject_config=None, pass_mark=40):
    """Generate a single random student profile with per-subject marks.

    Args:
        student_id: ID to give the student. When None (the default) a random
            ID in 1..10000 is drawn, which can repeat across calls; pass an
            explicit value when IDs must be unique.
        subject_config: Mapping of subject name to a dict holding
            "pass_rate" (probability of drawing a passing mark) and "max"
            (highest passing mark). Defaults to the module-level
            ``subject_params``.
        pass_mark: Lowest mark that counts as a pass. Passing marks are drawn
            from pass_mark..max, failing marks from 0..pass_mark - 1.

    Returns:
        A ``(student_id, name, marks)`` tuple where ``name`` is
        ``"Student_<student_id>"`` and ``marks`` maps each subject name to an
        integer mark.
    """
    if subject_config is None:
        subject_config = subject_params
    if student_id is None:
        student_id = random.randint(1, 10000)
    name = f"Student_{student_id}"

    # Iterate the configuration itself so the subject list cannot drift away
    # from the parameters defined for it.
    marks = {}
    for subject, params in subject_config.items():
        if random.random() < params["pass_rate"]:
            mark = random.randint(pass_mark, params["max"])
        else:
            mark = random.randint(0, pass_mark - 1)
        marks[subject] = mark

    return (student_id, name, marks)

# Generate student data
print("Generating student data...")
NUM_STUDENTS = 10000
# generate_student_profile() draws its ID at random from 1..10000, so with
# this many students a large share of the IDs would repeat. Only the marks
# are taken from it here; IDs are assigned sequentially so that Student_ID
# identifies exactly one row.
students_data = [
    (student_id, f"Student_{student_id}", generate_student_profile()[2])
    for student_id in range(1, NUM_STUDENTS + 1)
]

# Create DataFrame (the per-student marks dict becomes a single "marks" column)
print("Creating DataFrame...")
df = spark.createDataFrame(students_data, ["Student_ID", "Name", "marks"])

# Pull every subject out of the nested "marks" column into its own top-level
# column. "Data Science" contains a space, so it is back-quoted in the path
# and exposed as "Data_Science".
df = df.select(
    "Student_ID",
    "Name",
    col("marks.Electronics").alias("Electronics"),
    col("marks.Programming").alias("Programming"),
    col("marks.Database").alias("Database"),
    col("marks.`Data Science`").alias("Data_Science"),
    col("marks.Mathematics").alias("Mathematics"),
    col("marks.DSA").alias("DSA"),
)

# Average of the six subject marks, rounded to two decimals.
# `round` here is pyspark.sql.functions.round, not the Python builtin.
marks_total = col("Electronics")
for column_name in ("Programming", "Database", "Data_Science", "Mathematics", "DSA"):
    marks_total = marks_total + col(column_name)
df = df.withColumn("Average_Marks", round(marks_total / 6, 2))

# Define subjects list (flattened column names, so "Data_Science" not "Data Science")
subjects = ["Electronics", "Programming", "Database", "Data_Science", "Mathematics", "DSA"]

# Calculate statistics for each subject.
# All aggregates for all subjects are gathered into ONE df.agg() call so the
# data is scanned once instead of once per subject. Each aggregate is aliased
# "<subject>__<stat>" so it can be looked up in the single result row.
# NOTE: avg/min/max here are the pyspark.sql.functions versions imported at
# the top of the file, not the Python builtins.
agg_exprs = []
for subject in subjects:
    agg_exprs.extend([
        avg(subject).alias(f"{subject}__avg"),
        min(subject).alias(f"{subject}__min"),
        max(subject).alias(f"{subject}__max"),
        stddev(subject).alias(f"{subject}__std"),
        # A mark of 40 or more counts as a pass, anything below as a fail.
        spark_sum(when(col(subject) >= 40, 1).otherwise(0)).alias(f"{subject}__pass_count"),
        spark_sum(when(col(subject) < 40, 1).otherwise(0)).alias(f"{subject}__fail_count"),
    ])
stats_row = df.agg(*agg_exprs).collect()[0]

# Re-shape the flat result row into {subject: {stat_name: value}} with plain
# Python numbers, ready for plotting and printing.
stats = {}
for subject in subjects:
    stats[subject] = {
        "avg": float(stats_row[f"{subject}__avg"]),
        "min": float(stats_row[f"{subject}__min"]),
        "max": float(stats_row[f"{subject}__max"]),
        "std": float(stats_row[f"{subject}__std"]),
        "pass_count": int(stats_row[f"{subject}__pass_count"]),
        "fail_count": int(stats_row[f"{subject}__fail_count"]),
    }

# Create Dashboard: a 3x2 grid of charts built from the per-subject `stats`
# and the per-student Average_Marks column.
print("\nCreating dashboard...")
panel_titles = (
    "Subject-wise Average Marks", "Mark Distribution",
    "Pass/Fail Distribution", "Subject-wise Performance",
    "Maximum vs Minimum Marks", "Standard Deviation",
)
fig = make_subplots(rows=3, cols=2, subplot_titles=panel_titles)

# Per-subject series, each ordered like `subjects`.
avg_marks = [stats[subject]["avg"] for subject in subjects]
pass_counts = [stats[subject]["pass_count"] for subject in subjects]
fail_counts = [stats[subject]["fail_count"] for subject in subjects]
max_marks = [stats[subject]["max"] for subject in subjects]
min_marks = [stats[subject]["min"] for subject in subjects]
std_marks = [stats[subject]["std"] for subject in subjects]

# Every student's average mark, pulled to the driver for the box plot.
average_values = [row["Average_Marks"] for row in df.select("Average_Marks").collect()]

# (trace, grid row, grid column) in the order the traces are added; traces
# that share a cell are drawn in the same subplot.
panels = [
    # Subject-wise Average Marks
    (go.Bar(x=subjects, y=avg_marks, name="Average Marks"), 1, 1),
    # Mark Distribution
    (go.Box(y=average_values, name="Mark Distribution"), 1, 2),
    # Pass/Fail Distribution
    (go.Bar(x=subjects, y=pass_counts, name="Passed", marker_color='green'), 2, 1),
    (go.Bar(x=subjects, y=fail_counts, name="Failed", marker_color='red'), 2, 1),
    # Subject-wise Performance
    (go.Scatter(x=subjects, y=avg_marks, mode='lines+markers', name="Average Performance"), 2, 2),
    # Maximum vs Minimum Marks
    (go.Bar(x=subjects, y=max_marks, name="Maximum Marks", marker_color='gold'), 3, 1),
    (go.Bar(x=subjects, y=min_marks, name="Minimum Marks", marker_color='orange'), 3, 1),
    # Standard Deviation
    (go.Bar(x=subjects, y=std_marks, name="Standard Deviation", marker_color='purple'), 3, 2),
]
for trace, panel_row, panel_col in panels:
    fig.add_trace(trace, row=panel_row, col=panel_col)

# Update layout
fig.update_layout(
    height=1200,
    width=1000,
    showlegend=True,
    title_text=" Result Management System Dashboard",
)

# Show the dashboard
fig.show()

# Print Statistics
print("\nBasic Statistics:")
print("=================")
for subject in subjects:
    subject_stats = stats[subject]
    # Every non-null mark is counted as either a pass or a fail, so the two
    # counts together give the number of graded students. Using that instead
    # of a hard-coded 10000 keeps the rates right if the dataset size changes.
    graded = subject_stats['pass_count'] + subject_stats['fail_count']
    pass_rate = (subject_stats['pass_count'] / graded) * 100 if graded else 0.0
    fail_rate = (subject_stats['fail_count'] / graded) * 100 if graded else 0.0

    print(f"\n{subject} Statistics:")
    print(f"Average: {subject_stats['avg']:.2f}")
    print(f"Minimum: {subject_stats['min']:.2f}")
    print(f"Maximum: {subject_stats['max']:.2f}")
    print(f"Std Dev: {subject_stats['std']:.2f}")
    print(f"Pass Rate: {pass_rate:.2f}%")
    print(f"Fail Rate: {fail_rate:.2f}%")

# Display Top Performers: the ten highest Average_Marks, best first
print("\nTop 10 Performers:")
df.orderBy("Average_Marks", ascending=False) \
  .select("Student_ID", "Name", "Average_Marks") \
  .show(10)

# Stop Spark session (df can no longer be used after this)
spark.stop()

Generating student data...
Creating DataFrame...

Creating dashboard...



Basic Statistics:

Electronics Statistics:
Average: 50.95
Minimum: 0.00
Maximum: 90.00
Std Dev: 24.74
Pass Rate: 69.73%
Fail Rate: 30.27%

Programming Statistics:
Average: 57.81
Minimum: 0.00
Maximum: 95.00
Std Dev: 24.75
Pass Rate: 79.88%
Fail Rate: 20.12%

Database Statistics:
Average: 45.06
Minimum: 0.00
Maximum: 85.00
Std Dev: 24.71
Pass Rate: 59.86%
Fail Rate: 40.14%

Data_Science Statistics:
Average: 41.99
Minimum: 0.00
Maximum: 80.00
Std Dev: 23.27
Pass Rate: 55.27%
Fail Rate: 44.73%

Mathematics Statistics:
Average: 38.54
Minimum: 0.00
Maximum: 75.00
Std Dev: 22.06
Pass Rate: 50.08%
Fail Rate: 49.92%

DSA Statistics:
Average: 54.15
Minimum: 0.00
Maximum: 92.00
Std Dev: 24.81
Pass Rate: 74.85%
Fail Rate: 25.15%

Top 10 Performers:
+----------+------------+-------------+
|Student_ID|        Name|Average_Marks|
+----------+------------+-------------+
|      4912|Student_4912|        77.17|
|      7077|Student_7077|        76.83|
|      5039|Student_5039|         76.5|
|      3018