In [None]:
import pandas as pd

# Read the student performance records into a DataFrame
file_path = 'student_performance.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows as a rich table
preview = df.head()
display(preview)

In [None]:
# Count the missing entries in every column and print the per-column totals
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Encode Distance_from_Home as an ordinal number (Near < Moderate < Far).
# Values that are not keys of the mapping (NaN, or numbers left by a previous
# run of this cell) are passed through untouched, which makes the cell safe to
# re-run; pd.to_numeric then turns any unrecognised label into NaN, matching
# what a plain dictionary .map() would have produced.
distance_mapping = {"Near": 1, "Moderate": 2, "Far": 3}
df["Distance_from_Home"] = pd.to_numeric(
    df["Distance_from_Home"].map(lambda value: distance_mapping.get(value, value)),
    errors="coerce",
)

# Fill missing values for numerical columns with the column median.
# Assign the result back instead of calling fillna(..., inplace=True) on
# df[col]: the in-place form operates on a temporary selection, triggers a
# chained-assignment warning, and does not update df under Copy-on-Write.
numerical_cols = ["Distance_from_Home"]
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill missing values for categorical columns with the most frequent value.
categorical_cols = ["Teacher_Quality", "Parental_Education_Level"]
for col in categorical_cols:
    modes = df[col].mode()
    # mode() is empty when the column holds no non-missing value at all;
    # in that case there is nothing sensible to fill with, so leave it as is.
    if not modes.empty:
        df[col] = df[col].fillna(modes[0])

# Check if all missing values are handled
print(df.isnull().sum())


In [None]:
import plotly.express as px
import numpy as np
import plotly.graph_objects as go

# Scatter plot of exam score against hours studied, coloured by exam score.
# Both columns get a readable label so the y-axis keeps its title.
fig = px.scatter(
    df,
    x="Hours_Studied",
    y="Exam_Score",
    color="Exam_Score",
    color_continuous_scale="Viridis",
    title="Relationship Between Hours Studied and Exam Score",
    labels={"Hours_Studied": "Hours Studied", "Exam_Score": "Exam Score"},
    hover_data=["Hours_Studied", "Exam_Score"]
)

# Add best-fit regression line (degree-1 least-squares fit).
m, b = np.polyfit(df["Hours_Studied"], df["Exam_Score"], 1)  # Linear fit

# Evaluate the line once per distinct x value, in ascending order. Feeding the
# raw, unsorted column to a "lines" trace makes the path run back and forth
# over itself, which can smear the dash pattern and adds one vertex per row.
line_x = np.sort(df["Hours_Studied"].unique())
fig.add_trace(go.Scatter(
    x=line_x,
    y=m * line_x + b,
    mode="lines",
    name="Regression Line",
    line=dict(color="red", width=2, dash="dash")
))

# Customize layout
fig.update_layout(
    font=dict(size=14, family="Arial"),
    plot_bgcolor="rgba(240, 240, 240, 0.8)",
    title_x=0.5,
    title_font_size=20,
    xaxis=dict(showgrid=True, zeroline=False, linecolor="black"),
    yaxis=dict(showgrid=True, zeroline=False, linecolor="black"),
    hovermode="x unified",
    # Keep the legend inside the plot area (top-left) so it does not sit on
    # top of the colour bar, which occupies the right-hand margin.
    legend=dict(
        x=0.01,
        y=0.99,
        xanchor="left",
        yanchor="top",
        bgcolor="rgba(255, 255, 255, 0.7)"
    ),
    coloraxis_colorbar=dict(title="Exam Score")
)

# Show plot
fig.show()

In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression

# Load dataset
# NOTE(review): re-reading the CSV replaces the `df` prepared by the earlier
# cells with the raw file contents, so the encoding / imputation done there is
# not reflected from this point on — confirm this is intended.
file_path = 'student_performance.csv'
df = pd.read_csv(file_path)

# Perform Linear Regression (ordinary least squares, one predictor)
X = df[["Hours_Studied"]]  # Independent variable (2-D, as scikit-learn expects)
y = df["Exam_Score"]  # Dependent variable
model = LinearRegression()
model.fit(X, y)

# Get Regression Line Values
slope = model.coef_[0]  # Slope
intercept = model.intercept_  # Intercept
r_squared = model.score(X, y)  # R² value
df["Predicted_Score"] = model.predict(X)

# Compute Pearson correlation coefficient and its two-sided p-value
correlation_coefficient, p_value = stats.pearsonr(df["Hours_Studied"], df["Exam_Score"])

# Display correlation analysis results
print("🔍 **Correlation Analysis Between Hours Studied & Exam Score**")
print(f"✅ **Pearson Correlation Coefficient (r):** {correlation_coefficient:.3f}")
# General format keeps very small p-values readable (e.g. 1.2e-40) instead of
# rounding them to 0.00000.
print(f"📊 **P-value:** {p_value:.3g}")

# Interpretation of correlation strength
if abs(correlation_coefficient) >= 0.7:
    strength = "strong"
elif abs(correlation_coefficient) >= 0.4:
    strength = "moderate"
else:
    strength = "weak"

# Significance is decided by the p-value, not by r being exactly zero.
# `not (p_value < alpha)` is used rather than `p_value >= alpha` so that a NaN
# result (e.g. a constant column) also lands in the "no correlation" branch.
alpha = 0.05
if not (p_value < alpha) or correlation_coefficient == 0:
    print("❌ No significant correlation found between Hours Studied and Exam Score.")
elif correlation_coefficient > 0:
    print(f"🔹 There is a **{strength} positive correlation** between Hours Studied and Exam Score.")
    print("📈 This suggests that as students study more, their exam scores tend to improve.")
else:
    print(f"🔹 There is a **{strength} negative correlation** between Hours Studied and Exam Score.")
    print("📉 This unusual finding suggests that more study hours may not always lead to better performance.")

# Create the Scatter Plot
fig = go.Figure()

# Scatter plot of actual data points
fig.add_trace(go.Scatter(
    x=df["Hours_Studied"],
    y=df["Exam_Score"],
    mode="markers",
    name="Actual Scores",
    marker=dict(size=8, color=df["Exam_Score"], colorscale="Viridis", showscale=False)
))

# Add regression line, evaluated once per distinct x value in ascending order.
# Passing the raw, unsorted column to a "lines" trace makes the path retrace
# itself, which can smear the dash pattern and adds one vertex per row.
line_x = np.sort(df["Hours_Studied"].unique())
fig.add_trace(go.Scatter(
    x=line_x,
    y=slope * line_x + intercept,
    mode="lines",
    line=dict(color="red", width=2, dash="dash"),
    name="Regression Line"
))

# Add regression analysis text to the plot, pinned to the bottom-right corner
# of the data range (largest x, smallest y).
fig.add_annotation(
    x=df["Hours_Studied"].max(),
    y=df["Exam_Score"].min(),
    text=f"Regression Line: y = {slope:.2f}x + {intercept:.2f}<br>R² = {r_squared:.2f}",
    showarrow=False,
    font=dict(size=12, color="black"),
    align="left",
    xanchor="right",
    yanchor="bottom",
    bgcolor="white",
    bordercolor="black",
    borderwidth=1
)

# Update layout for better readability
fig.update_layout(
    title="📊 Relationship Between Hours Studied and Exam Score",
    xaxis_title="Hours Studied",
    yaxis_title="Exam Score",
    template="plotly_white",
    width=800,
    height=500
)

# Show the figure
fig.show()