<a href="https://colab.research.google.com/github/internship-kaard/zenteiq-kaard/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Develop a machine learning model that can predict student dropout rates or academic success based on a variety of factors, such as attendance, grades, and demographic data. The model should be able to identify students who are at risk of dropping out or falling behind and provide targeted interventions and support.

In [61]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import math
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier


In [62]:
# Read the student records from the CSV file in the working directory
df = pd.read_csv('dataset.csv')
# Preview the first five rows
df.head(5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.67,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [63]:
# Missing values and duplicate rows.
# A notebook cell renders only its final expression, so the per-column null
# counts are printed explicitly; left as a bare expression in the middle of the
# cell they would be computed and silently discarded.
print(df.isna().sum())
# Number of fully duplicated rows (shown as the cell's output value).
df.duplicated().sum()

0

In [64]:
# EDA (exploratory data analysis): dataset dimensions as (rows, columns)
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(4424, 37)


In [65]:
## Basic information about the data: prints the frame's class, its index range,
## and for every column the non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualification                   

In [66]:
## Summary statistics (count, mean, std, min, quartiles, max) of the data
# Set the display precision through the public pandas options API. Importing
# set_option from pandas.io.formats.format reaches into a private module whose
# contents are not a stable interface, and scatters an import mid-notebook.
pd.set_option('display.precision', 2)
df.describe()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
count,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,...,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4420.0
mean,1.18,18.67,1.73,8856.64,0.89,4.58,132.61,1.87,19.56,22.28,...,0.14,0.54,6.23,8.06,4.44,10.23,0.15,11.57,1.23,0.00197
std,0.61,17.48,1.31,2063.57,0.31,10.22,13.19,6.91,15.6,15.34,...,0.69,1.92,2.2,3.95,3.01,5.21,0.75,2.66,1.38,2.27
min,1.0,1.0,0.0,33.0,0.0,1.0,95.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06
25%,1.0,1.0,1.0,9085.0,1.0,1.0,125.0,1.0,2.0,3.0,...,0.0,0.0,5.0,6.0,2.0,10.75,0.0,9.4,0.3,-1.7
50%,1.0,17.0,1.0,9238.0,1.0,1.0,133.1,1.0,19.0,19.0,...,0.0,0.0,6.0,8.0,5.0,12.2,0.0,11.1,1.4,0.32
75%,1.0,39.0,2.0,9556.0,1.0,1.0,140.0,1.0,37.0,37.0,...,0.0,0.0,7.0,10.0,6.0,13.33,0.0,13.9,2.6,1.79
max,6.0,57.0,9.0,9991.0,1.0,43.0,190.0,109.0,44.0,44.0,...,12.0,19.0,23.0,33.0,20.0,18.57,12.0,16.2,3.7,3.51


In [None]:
## Distinct values per column: print each column name, then its unique values
for col, values in df.items():
    print(col)
    print(values.unique())

In [68]:
# Distinct labels present in the Target column
pd.unique(df['Target'])

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [70]:
# Number of rows per Target label
df['Target'].value_counts()

Graduate    2209
Dropout     1421
Enrolled     794
Name: Target, dtype: int64

In [69]:
# Distinct codes present in the 'Marital status' column
pd.unique(df['Marital status'])

array([1, 2, 4, 3, 5, 6])

In [71]:
# Number of rows per 'Marital status' code
df['Marital status'].value_counts()

1    3919
2     379
4      91
5      25
6       6
3       4
Name: Marital status, dtype: int64

In [72]:
# Number of rows per Gender code
df['Gender'].value_counts()

0    2868
1    1556
Name: Gender, dtype: int64

In [88]:
# Column labels of the loaded frame
df.columns

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance', 'Previous qualification', 'Previous qualification (grade)',
       'Nacionality', 'Mother's qualification', 'Father's qualification', 'Mother's occupation',
       'Father's occupation', 'Admission grade', 'Displaced', 'Educational special needs',
       'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment',
       'International', 'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without e

In [73]:
# TODO: stacked bar chart (placeholder cell; no plot is produced yet)


In [None]:
## Pairwise Pearson correlations between the numeric attributes
# df also holds the string-valued 'Target' column. Calling corr() on the whole
# frame fails on recent pandas versions (non-numeric columns are no longer
# dropped implicitly), so restrict to numeric columns explicitly.
df.select_dtypes(include='number').corr(method='pearson')

In [None]:
## Correlation heatmap of the numeric attributes
# Restrict to numeric columns first: df contains the string-valued 'Target'
# column, and corr() on a mixed-dtype frame fails on recent pandas versions.
# corr() uses the Pearson method by default.
corr_matrix = df.select_dtypes(include='number').corr()
ax = sns.heatmap(corr_matrix)
# Title the figure so it stands alone; the trailing ';' hides the text repr.
ax.set_title('Pearson correlation between numeric attributes');

In [None]:
## Skewness of each numeric attribute
# skew() is only defined for numeric data; df also contains the string-valued
# 'Target' column, which makes df.skew() fail on recent pandas versions, so
# select the numeric columns explicitly.
df.select_dtypes(include='number').skew()

In [None]:
## Histograms: one 10-bin panel per plotted column on a single large figure
sns.set_theme(style='darkgrid')
df.hist(
    bins=10,
    figsize=(50, 35),
    grid=True,
    legend=None,
);

In [None]:
# #kernel density estimate
# sns.set_theme(style = 'darkgrid')
# df['Marital status'].value_counts().plot(kind = 'kde');