In [26]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('/content/students.csv')
print("Number of duplicate rows:", df.duplicated().sum())

df = df.drop_duplicates()
print("After removing duplicates:", df.duplicated().sum())

print("\nMissing values before cleaning:")
print(df.isnull().sum())

# Fill missing numeric values with mean, and categorical with mode
for col in df.columns:
    if df[col].dtype == 'object':  # Categorical column
        df[col].fillna(df[col].mode()[0])
    else:  # Numeric column
        df[col].fillna(df[col].mean(), inplace=True)

print("\nMissing values after filling:")
print(df.isnull().sum())

#  Detecting outliers in numeric columns using boxplot
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    plt.figure(figsize=(6,3))
    sns.boxplot(x=df[col])
    plt.title(f'Outlier check for {col}')
    plt.show()

# Remove outliers using IQR method for each numeric column
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    df = df[(df[col] >= Q1 - 1.5 * IQR) & (df[col] <= Q3 + 1.5 * IQR)]

print("\nData types:")
print(df.dtypes)


# Reload original dataset again for comparison
original_df = pd.read_csv('/content/students.csv')

print("BEFORE CLEANING:")
print("Rows, Columns:", original_df.shape)
print("Missing values:\n", original_df.isnull().sum())
print("Duplicates:", original_df.duplicated().sum())

print("\nAFTER CLEANING:")
print("Rows, Columns:", df.shape)
print("Missing values:\n", df.isnull().sum())
print("Duplicates:", df.duplicated().sum())

print("\nChanges Summary:")
print(f"Rows removed: {original_df.shape[0] - df.shape[0]}")

df.to_csv('StudentsPerformance_Cleaned.csv', index=False)



Number of duplicate rows: 0
After removing duplicates: 0

Missing values before cleaning:
school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3    0
dtype: int64

Missing values after filling:
school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3    0
dtype: int64

Data types:
school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3    object
dtype: object
BEFORE CLEANING:
Rows, Columns: (649, 1)
Missing values:
 school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reaso