# In [None]:
# ==================================================
# TASK 1: Import Required Python Libraries
# ==================================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# ==================================================
# TASK 2: Dataset Description and Source
# ==================================================
"""
Dataset: Students Performance in Exams
Source: https://www.kaggle.com/spscientist/students-performance-in-exams
Description: Contains 1000 student records with:
- Demographic features (gender, race/ethnicity)
- Educational background (parental education, lunch type)
- Test preparation status
- Math, Reading, and Writing scores (0-100)
"""

# ==================================================
# TASK 3: Load Dataset into Pandas DataFrame
# ==================================================
# Parse the CSV into the working DataFrame used by every later task.
# The path is relative, so the file is looked up in the current working
# directory.
df = pd.read_csv(filepath_or_buffer="StudentsPerformance.csv")

# ==================================================
# TASK 4: Data Preprocessing
# ==================================================
print("\n" + "="*40 + " DATA PREPROCESSING " + "="*40)

# 4.1 Check Dimensions
# df.shape is a (rows, columns) tuple.
print("\n1. DataFrame Dimensions:")
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

# 4.2 Check Missing Values
# isnull().sum() yields one count of missing entries per column.
print("\n2. Missing Values:")
print(df.isnull().sum())

# 4.3 Initial Statistics
# describe() summarises the numeric columns (count, mean, std, quartiles).
print("\n3. Numerical Statistics:")
print(df.describe())

# 4.4 Variable Descriptions
print("\n4. Variable Types:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# ==================================================
# TASK 5: Data Formatting and Normalization
# ==================================================
print("\n" + "="*40 + " DATA FORMATTING " + "="*40)

# 5.1 Convert Object Types to Category
# Cast the five descriptive (non-score) columns to pandas' category dtype.
cat_cols = ['gender', 'race/ethnicity', 'parental level of education',
            'lunch', 'test preparation course']
df[cat_cols] = df[cat_cols].astype('category')

# 5.2 Handle Missing Values (Writing Score)
# Assign the filled Series back to the column rather than calling
# fillna(..., inplace=True) on df['writing score']: the in-place form operates
# on an intermediate Series, which raises a FutureWarning on pandas 2.x and
# leaves df unchanged once Copy-on-Write is enabled.
df['writing score'] = df['writing score'].fillna(df['writing score'].median())

# 5.3 Normalize Numerical Scores (0-1 range)
# MinMaxScaler rescales each column independently using that column's own
# observed minimum and maximum. The raw scores are overwritten in place.
scaler = MinMaxScaler()
num_cols = ['math score', 'reading score', 'writing score']
df[num_cols] = scaler.fit_transform(df[num_cols])

print("\n5. Updated Data Types:")
print(df.dtypes)

# ==================================================
# TASK 6: Categorical to Quantitative Conversion
# ==================================================
print("\n" + "="*40 + " CATEGORICAL ENCODING " + "="*40)

# 6.1 Label Encoding (for ordinal data)
# Calling `.cat.codes` on a default category column numbers the levels by
# sorted label text, not by educational attainment, so the resulting integers
# carry no ordinal meaning. Encode through an explicitly ordered categorical
# instead, lowest level first.
# NOTE(review): the level names below are the ones expected in the Kaggle
# "Students Performance in Exams" file -- confirm against
# df['parental level of education'].unique().
edu_order = [
    "some high school",
    "high school",
    "some college",
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
edu_column = df['parental level of education']
# Levels that are present in the data but missing from edu_order are appended
# after the known ones, so they still receive a non-negative code instead of
# being turned into missing values (code -1).
unlisted_levels = sorted(set(edu_column.dropna().unique()) - set(edu_order))
edu_dtype = pd.CategoricalDtype(categories=edu_order + unlisted_levels,
                                ordered=True)
df['parental_edu_encoded'] = edu_column.astype(edu_dtype).cat.codes

# 6.2 One-Hot Encoding (for nominal data)
# Each listed column is replaced by one indicator column per distinct value.
nominal_cols = ['gender', 'race/ethnicity', 'lunch', 'test preparation course']
df = pd.get_dummies(df, columns=nominal_cols)

# ==================================================
# FINAL VERIFICATION
# ==================================================
# Closing report: section banner, the resulting (rows, columns) shape, and a
# three-row preview of the transformed frame.
rule = "=" * 40
print(f"\n{rule} FINAL DATAFRAME {rule}")

final_shape = df.shape
print(f"\nFinal Dimensions: {final_shape}")

print("\nFirst 3 Rows:")
preview = df.head(3)
print(preview)