# Data Exploration

## Load the datasets

In [None]:
import pandas as pd

# Read the two raw course files; both are parsed as ';'-separated CSVs.
mat_df = pd.read_csv('../data/raw/student-mat.csv', sep=';')
por_df = pd.read_csv('../data/raw/student-por.csv', sep=';')

# Preview each dataset: a heading, its leading rows, then its shape.
for heading, frame in (
    ("Student-Math Dataset:", mat_df),
    ("Student-Portuguese Dataset:", por_df),
):
    print(heading)
    print(frame.head())
    print(f"Shape: {frame.shape}\n")

## Explore the data

In [None]:
# Per-column count of missing values for each dataset, largest count first.
missing_reports = (
    ("Missing Values (Student-Math):", mat_df),
    ("\nMissing Values (Student-Portuguese):", por_df),
)
for heading, frame in missing_reports:
    print(heading)
    print(frame.isnull().sum().sort_values(ascending=False))

# describe() summary for each dataset (covers the numeric columns by default).
summary_reports = (
    ("\nSummary Statistics (Student-Math):", mat_df),
    ("\nSummary Statistics (Student-Portuguese):", por_df),
)
for heading, frame in summary_reports:
    print(heading)
    print(frame.describe())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Overlay the final-grade (G3) histograms of both courses on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))

# One entry per layer: the grade series plus the styling passed to histplot.
# Only the Portuguese layer sets an explicit alpha.
grade_layers = (
    (mat_df['G3'], {'color': 'blue', 'label': 'Math Final Grade'}),
    (por_df['G3'], {'color': 'orange', 'label': 'Portuguese Final Grade', 'alpha': 0.6}),
)
for grades, layer_style in grade_layers:
    sns.histplot(grades, kde=True, ax=ax, **layer_style)

ax.set_title("Distribution of Final Grades (G3)")
ax.set_xlabel("Grade")
ax.set_ylabel("Frequency")
ax.legend()
plt.show()

In [None]:
# Names of the numeric columns in the math dataset.
numeric_columns = mat_df.select_dtypes(include='number').columns

# Pairwise correlations between those columns, drawn as an unannotated heatmap.
corr_matrix = mat_df.loc[:, numeric_columns].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", cbar=True, ax=ax)
ax.set_title("Correlation Matrix (Student-Math)")
plt.show()

In [None]:
# Names of the numeric columns in the Portuguese dataset.
numeric_columns = por_df.select_dtypes(include=['number']).columns

# Pairwise correlations between those columns, drawn as an unannotated heatmap.
plt.figure(figsize=(12, 10))
corr_matrix = por_df[numeric_columns].corr()
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", cbar=True)
# Title spelling corrected: "Portuguese" (was "Portugeuse").
plt.title("Correlation Matrix (Student-Portuguese)")
plt.show()

## Merge the Datasets

In [None]:
# Join key: a math row and a Portuguese row are paired only when every
# column listed here has the same value in both.
# NOTE(review): the merge uses the default inner join, so rows without an exact
# match on all of these columns are dropped. Some keys (e.g. 'paid',
# 'failures', 'absences') look like they could differ per course for the same
# student -- confirm that matching on them is intended.
common_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
    'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
    'traveltime', 'studytime', 'failures', 'schoolsup',
    'famsup', 'paid', 'activities', 'nursery', 'higher',
    'internet', 'romantic', 'famrel', 'freetime', 'goout',
    'Dalc', 'Walc', 'health', 'absences',
]

# Columns present in both frames but not in the key get a '_mat' / '_por' suffix.
merged_df = mat_df.merge(por_df, on=common_columns, suffixes=('_mat', '_por'))

print("\nMerged Dataset:")
print(merged_df.head())
print(f"Shape: {merged_df.shape}\n")

## Feature Engineering

In [None]:
# Binary pass/fail targets: 1 when the final grade (G3) is at least 10, else 0.
# The comparison is vectorized rather than applied row by row with a lambda;
# it yields the same 0/1 int64 values.
merged_df['pass_mat'] = (merged_df['G3_mat'] >= 10).astype('int64')
merged_df['pass_por'] = (merged_df['G3_por'] >= 10).astype('int64')

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each so the indicator columns are not redundant.
merged_df = pd.get_dummies(merged_df, drop_first=True)

# Last expression of the cell: the notebook renders the first rows of the result.
merged_df.head()

In [None]:
# Share of pass (1) vs. fail (0) for each binary target, as proportions.
class_reports = (
    ("Class Distribution for pass_mat (Math):", 'pass_mat'),
    ("\nClass Distribution for pass_por (Portuguese):", 'pass_por'),
)
for heading, target in class_reports:
    print(heading)
    print(merged_df[target].value_counts(normalize=True))

In [None]:
# Bar chart of how many merged rows fall into each pass_mat class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='pass_mat', data=merged_df, palette='Set2', ax=ax)
ax.set_title("Pass Rate Distribution (Math)")
ax.set_xlabel("Pass (1) / Fail (0)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Scatter of the math final grade (G3_mat) against the number of absences.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='absences', y='G3_mat', data=merged_df, color='purple', alpha=0.7, ax=ax)
ax.set_title("Relationship Between Absences and Final Grade (Math)")
ax.set_xlabel("Absences")
ax.set_ylabel("Final Grade (G3)")
plt.show()

## Modeling

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Features: everything except the G1/G2/G3 grade columns of both courses and
# the two pass flags (the flags are derived from G3, so keeping any of these
# would leak the target into the inputs).
grade_and_target_columns = [
    'G1_mat', 'G2_mat', 'G3_mat',
    'G1_por', 'G2_por', 'G3_por',
    'pass_mat', 'pass_por',
]
X = merged_df.drop(columns=grade_and_target_columns)
y = merged_df['pass_mat']  # use 'pass_por' instead to model the Portuguese course

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The three classifiers being compared, all seeded for reproducibility.
tree_model = DecisionTreeClassifier(random_state=42)
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
forest_model = RandomForestClassifier(random_state=42)

# Fit each model, predict on the held-out rows, and print accuracy plus the
# per-class report. Entries are (printed heading, estimator) in run order.
model_runs = (
    ("Decision Tree:", tree_model),
    ("\nLogistic Regression:", logistic_model),
    ("\nRandom Forest:", forest_model),
)
test_predictions = []
for heading, estimator in model_runs:
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)
    test_predictions.append(predicted)
    print(heading)
    print(f"Accuracy: {accuracy_score(y_test, predicted)}")
    print(f"Classification Report:\n{classification_report(y_test, predicted)}")

# Keep the per-model prediction arrays available under their own names.
y_pred_tree, y_pred_logistic, y_pred_forest = test_predictions

# Optional: ROC-AUC from the predicted probability in column 1 of predict_proba
# (presumably the pass = 1 class, assuming both classes occur in y_train).
roc_candidates = {
    "Decision Tree": tree_model,
    "Logistic Regression": logistic_model,
    "Random Forest": forest_model,
}
for name, model in roc_candidates.items():
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"\n{name} ROC-AUC Score: {roc_auc:.4f}")

# Random-forest feature importances, sorted descending; plot the ten largest.
feature_importances = pd.Series(forest_model.feature_importances_, index=X.columns).sort_values(ascending=False)
top_features = feature_importances.head(10)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_features, y=top_features.index)
plt.title("Top 10 Feature Importances (Random Forest)")
plt.show()
