In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kstest, norm

# Load Dataset

In [17]:
df = pd.read_csv('Student_Performance_Data.csv')
df.head(20)

Unnamed: 0,sex,age,mother_edu,father_edu,mother_job,father_job,traveltime,studytime,school_support,family_support,internet,romantic,freetime,health,absences,grade
0,F,18,4,4,at_home,teacher,2,2,yes,no,no,no,3,3,6,30
1,F,17,1,1,at_home,other,1,2,no,yes,yes,no,3,3,4,30
2,F,15,1,1,at_home,other,1,2,yes,no,yes,no,3,3,10,50
3,F,15,4,2,health,services,1,3,no,yes,yes,yes,2,5,2,75
4,F,16,3,3,other,other,1,2,no,yes,no,no,3,5,4,50
5,M,16,4,3,services,other,1,2,no,yes,yes,no,4,5,10,75
6,M,16,2,2,other,other,1,2,no,no,yes,no,4,3,0,55
7,F,17,4,4,other,teacher,2,2,yes,yes,no,no,1,1,6,30
8,M,15,3,2,services,other,1,2,no,yes,yes,no,2,1,0,95
9,M,15,3,4,other,other,1,2,no,yes,yes,no,5,5,0,75


# Data Preparation

In [4]:
# ringkasan dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   sex             395 non-null    object
 1   age             395 non-null    int64 
 2   mother_edu      395 non-null    int64 
 3   father_edu      395 non-null    int64 
 4   mother_job      395 non-null    object
 5   father_job      395 non-null    object
 6   traveltime      395 non-null    int64 
 7   studytime       395 non-null    int64 
 8   school_support  395 non-null    object
 9   family_support  395 non-null    object
 10  internet        395 non-null    object
 11  romantic        395 non-null    object
 12  freetime        395 non-null    int64 
 13  health          395 non-null    int64 
 14  absences        395 non-null    int64 
 15  grade           395 non-null    int64 
dtypes: int64(9), object(7)
memory usage: 49.5+ KB


In [5]:
# ringkasan statistik deskriptif
df.describe()

Unnamed: 0,age,mother_edu,father_edu,traveltime,studytime,freetime,health,absences,grade
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,3.235443,3.55443,5.708861,52.075949
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.998862,1.390303,8.003096,22.907213
min,15.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,3.0,3.0,0.0,40.0
50%,17.0,3.0,2.0,1.0,2.0,3.0,4.0,4.0,55.0
75%,18.0,4.0,3.0,2.0,2.0,4.0,5.0,8.0,70.0
max,22.0,4.0,4.0,4.0,4.0,5.0,5.0,75.0,100.0


In [6]:
# cek missing value
df.isnull().sum()

sex               0
age               0
mother_edu        0
father_edu        0
mother_job        0
father_job        0
traveltime        0
studytime         0
school_support    0
family_support    0
internet          0
romantic          0
freetime          0
health            0
absences          0
grade             0
dtype: int64

In [7]:
# cek nilai duplikat
df.duplicated().sum()

np.int64(0)

In [8]:
# cek nilai unik
unique_summary = pd.DataFrame({
    'n_unique': df.nunique(),
    'unique_values': df.apply(lambda x: x.unique())
})

unique_summary

Unnamed: 0,n_unique,unique_values
sex,2,"[F, M]"
age,8,"[18, 17, 15, 16, 19, 22, 20, 21]"
mother_edu,5,"[4, 1, 3, 2, 0]"
father_edu,5,"[4, 1, 2, 3, 0]"
mother_job,5,"[at_home, health, other, services, teacher]"
father_job,5,"[teacher, other, services, health, at_home]"
traveltime,4,"[2, 1, 3, 4]"
studytime,4,"[2, 3, 1, 4]"
school_support,2,"[yes, no]"
family_support,2,"[no, yes]"


In [9]:
# cek outliers
numeric_cols = [
    'age',
    'mother_edu',
    'father_edu',
    'traveltime',
    'studytime',
    'freetime',
    'health',
    'absences',
    'grade'
]

def detect_outliers_iqr(df, columns):
    outlier_summary = {}

    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]

        outlier_summary[col] = {
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'outlier_count': outliers.shape[0]
        }

    return pd.DataFrame(outlier_summary).T

outlier_summary = detect_outliers_iqr(df, numeric_cols)
outlier_summary

Unnamed: 0,lower_bound,upper_bound,outlier_count
age,13.0,21.0,1.0
mother_edu,-1.0,7.0,0.0
father_edu,0.5,4.5,2.0
traveltime,-0.5,3.5,8.0
studytime,-0.5,3.5,27.0
freetime,1.5,5.5,19.0
health,0.0,8.0,0.0
absences,-12.0,20.0,15.0
grade,-5.0,115.0,0.0


In [10]:
# cek kesesuaian input data numerik
num_cols = df.select_dtypes(include=['int64','float64']).columns
num_cols

outlier_summary = {}

# mencari outlier menggunakan metode IQR
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    
# Filter data outlier
    outliers = df[(df[col] < lower) | (df[col] > upper)]

    outlier_summary[col] = {
        'lower_bound': lower,
        'upper_bound': upper,
        'n_outliers': outliers.shape[0],
        'percentage': (outliers.shape[0])/len(df[col])
    }
pd.DataFrame(outlier_summary)

Unnamed: 0,age,mother_edu,father_edu,traveltime,studytime,freetime,health,absences,grade
lower_bound,13.0,-1.0,0.5,-0.5,-0.5,1.5,0.0,-12.0,-5.0
upper_bound,21.0,7.0,4.5,3.5,3.5,5.5,8.0,20.0,115.0
n_outliers,1.0,0.0,2.0,8.0,27.0,19.0,0.0,15.0,0.0
percentage,0.002532,0.0,0.005063,0.020253,0.068354,0.048101,0.0,0.037975,0.0


In [23]:
# DATA DISTRIBUSI
#Variable Age
stat,p = kstest(df['age'], 'norm', args=(np.mean(df['age']),np.std(df['age'])))
print("Statistik uji:",stat)
print("p-value:",p)

if p > 0.05:
    print("Data berdistribusi normal")
else:
    print("Data tidak berdistribusi normal\n")

#Variable Hours per week
stat,p = kstest(df['absences'], 'norm', args=(np.mean(df['absences']),np.std(df['absences'])))
print("Statistik uji:",stat)
print("p-value:",p)

if p > 0.05:
    print("Data berdistribusi normal")
else:
    print("Data tidak berdistribusi normal\n")

#Variable Work_Life_Balance_Score
stat,p = kstest(df['studytime'], 'norm', args=(np.mean(df['studytime']),np.std(df['studytime'])))
print("Statistik uji:",stat)
print("p-value:",p)

if p > 0.05:
    print("Data berdistribusi normal")
else:
    print("Data tidak berdistribusi normal\n")

#Variable Social_Isolation_Score
stat,p = kstest(df['freetime'], 'norm', args=(np.mean(df['freetime']),np.std(df['freetime'])))
print("Statistik uji:",stat)
print("p-value:",p)

if p > 0.05:
    print("\nData berdistribusi normal")
else:
    print("Data tidak berdistribusi normal")

Statistik uji: 0.1784517204522323
p-value: 1.7810133109902215e-11
Data tidak berdistribusi normal

Statistik uji: 0.23754055773854704
p-value: 4.2931563213892315e-20
Data tidak berdistribusi normal

Statistik uji: 0.2839531933224267
p-value: 1.1257676173978144e-28
Data tidak berdistribusi normal

Statistik uji: 0.20088256080338324
p-value: 1.8972292550397793e-14
Data tidak berdistribusi normal
