In [None]:
import pandas as pd

# Read the student-performance CSV into the notebook-wide DataFrame `df`.
# The path is relative, so the file is resolved against the kernel's
# current working directory.
file_path = "student_performance.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to sanity-check the columns that were parsed.
display(df.head(n=5))

In [None]:
# Report, per column, how many entries are null (NaN/None) before cleaning.
print(df.isna().sum())

In [None]:
# Convert Distance_from_Home from its text labels to ordinal numbers.
# Labels that are not in the mapping (and existing NaNs) become NaN and are
# imputed below.
distance_mapping = {"Near": 1, "Moderate": 2, "Far": 3}
# Only encode while raw labels are still present: Series.map turns every value
# that is not a key of the mapping into NaN, so re-running this cell on an
# already-encoded column would otherwise wipe it out.
if df["Distance_from_Home"].isin(list(distance_mapping)).any():
    df["Distance_from_Home"] = df["Distance_from_Home"].map(distance_mapping)

# Fill missing values for numerical columns with the column median.
# Assign the result back instead of calling fillna(inplace=True) on
# df[col]: that form is chained assignment, which warns on pandas 2.x and
# does not modify df at all once Copy-on-Write is enabled.
numerical_cols = ["Distance_from_Home"]
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill missing values for categorical columns with the most frequent value.
categorical_cols = ["Teacher_Quality", "Parental_Education_Level"]
for col in categorical_cols:
    modes = df[col].mode()
    # mode() is empty when the column has no non-null values; in that case
    # there is nothing sensible to impute, so leave the column untouched.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Check if all missing values are handled
print(df.isnull().sum())


In [None]:
import plotly.express as px
import numpy as np
import plotly.graph_objects as go

# Scatter plot of exam score against hours studied, coloured by exam score.
fig = px.scatter(
    df,
    x="Hours_Studied",
    y="Exam_Score",
    color="Exam_Score",
    color_continuous_scale="Viridis",
    title="Relationship Between Hours Studied and Exam Score",
    # `labels` drives the axis titles and hover text as well as the colorbar
    # title, so give both columns readable names here; the colorbar title is
    # overridden separately in update_layout below.
    labels={"Hours_Studied": "Hours Studied", "Exam_Score": "Exam Score"},
    hover_data=["Hours_Studied", "Exam_Score"]
)

# Add best-fit regression line (degree-1 least-squares fit).
# Fit only on rows where both values are present, since NaNs break polyfit.
fit_points = df[["Hours_Studied", "Exam_Score"]].dropna()
m, b = np.polyfit(fit_points["Hours_Studied"], fit_points["Exam_Score"], 1)
# Evaluate the line once per distinct x, in ascending order (np.unique sorts).
# This draws the dashed line in a single left-to-right pass instead of
# retracing it in row order, while still giving the unified hover a
# regression value at every observed x.
line_x = np.unique(fit_points["Hours_Studied"])
fig.add_trace(go.Scatter(
    x=line_x,
    y=m * line_x + b,
    mode="lines",
    name="Regression Line",
    line=dict(color="red", width=2, dash="dash")
))

# Customize layout
fig.update_layout(
    font=dict(size=14, family="Arial"),
    plot_bgcolor="rgba(240, 240, 240, 0.8)",
    title_x=0.5,  # centre the title horizontally
    title_font_size=20,
    xaxis=dict(showgrid=True, zeroline=False, linecolor="black"),
    yaxis=dict(showgrid=True, zeroline=False, linecolor="black"),
    hovermode="x unified",  # one hover box listing every trace at the hovered x
    coloraxis_colorbar=dict(title="..")  # placeholder colorbar title kept from the original
)

# Show plot
fig.show()