In [32]:
import numpy as np
import pandas as pd 
import seaborn as sea 
import matplotlib.pyplot as plt
from sklearn.metrics import *

In [33]:
data = pd.read_csv('datasets/StudentPrediction/StudentsPerformance.csv')

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [35]:
data.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93


In [36]:
conj = data.isna().nunique().reset_index()
conj.loc[conj[0]>1]

Unnamed: 0,index,0


Não há valores nulos

In [37]:
subdata = data.nunique().reset_index()
subdata = subdata.loc[subdata[0]==2]
subdata.set_index('index').T.columns

Index(['gender', 'lunch', 'test preparation course'], dtype='object', name='index')

In [38]:
subdata = data.nunique().reset_index()
subdata = subdata.loc[subdata[0]>2]
subdata.set_index('index').T.columns

Index(['race/ethnicity', 'parental level of education', 'math score',
       'reading score', 'writing score'],
      dtype='object', name='index')

### Correlação

In [39]:
from scipy.stats import chi2_contingency, pointbiserialr

class Correlation:
    M_ = None
    
    def __init__(self, column1:list, column2:list, data:pd.DataFrame):
        self.columnNames1 = column1
        self.columnNames2 = column2
        self.data         = data
        self.tupla        = ( len(self.columnNames1) , len(self.columnNames2) ) 

    def quiQuadrada(self):
        self.M_ = np.zeros(shape=self.tupla)
        
        for enumI, i in enumerate(self.columnNames1):
            for enumJ, j in enumerate(self.columnNames2):
                chi, p, _, _ = chi2_contingency( pd.crosstab(self.data[i].values, self.data[j].values) )
                self.M_[enumI,enumJ] = round(p,10)
        
        return pd.DataFrame(self.M_, columns=self.columnNames1, index=self.columnNames2)

    def pointBisserl(self):
        self.M_ = np.zeros(shape=self.tupla)
        for enumI, i in enumerate(self.columnNames1):
            for enumJ, j in enumerate(self.columnNames2):
                corr, p_value = pointbiserialr(self.data[i].values, self.data[j].values)
              
                self.M_[enumI, enumJ] = corr
        return pd.DataFrame(self.M_.T, columns=self.columnNames1, index=self.columnNames2)

&nbsp;

## Pré processamento

In [40]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

Transformação de atributos

In [41]:
class TransformandoAtributosBin(BaseEstimator,TransformerMixin):
    def __init__(self,columns:list|str):
        self.columns = columns
        
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        if (type(self.columns) == list):
            for column in self.columns:
                X[column] = LabelBinarizer().fit_transform(X[column])
        else:
            X[self.columns] = LabelBinarizer().fit_transform(X[self.columns].values)
        return X

In [42]:
class TransformandoAtributosEnc(BaseEstimator,TransformerMixin):
    def __init__(self,columns:list|str):
        self.columns = columns

    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        if (type(self.columns) == list):
            for column in self.columns:
                X[column] = LabelBinarizer().fit_transform(X[column])
        else:
            X[self.columns] = LabelEncoder().fit_transform(X[self.columns].values)
        return X

In [43]:
class Dummies(BaseEstimator,TransformerMixin):
    def __init__(self,columns:list | str) -> None:
        self.columns = columns 

    def fit(self,X,y=None):
        return self
    
    def transform(self, X:pd.DataFrame):
        return pd.get_dummies(X,columns=self.columns,dtype=int)

#### Pipeline

In [44]:
pipeline = Pipeline(
    steps=[
        ('Binarizer',TransformandoAtributosBin(columns=['gender', 'lunch', 'test preparation course'])),
        ('Encoder'  ,TransformandoAtributosEnc(columns=['race/ethnicity','parental level of education'])),
        ('Dummies'  ,Dummies(columns=['race/ethnicity','parental level of education']))
    ]
)
dataset = pipeline.fit_transform(data)

In [45]:
num = ['math score','reading score', 'writing score']
data[num].corr()

Unnamed: 0,math score,reading score,writing score
math score,1.0,0.81758,0.802642
reading score,0.81758,1.0,0.954598
writing score,0.802642,0.954598,1.0


In [58]:
data[num].describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


## Rede Neural

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
X = dataset.drop(columns=['math score','reading score','writing score'])
y = dataset[['math score','reading score','writing score']]

In [48]:
data.nunique()

gender                          2
race/ethnicity                  2
parental level of education     2
lunch                           2
test preparation course         2
math score                     81
reading score                  72
writing score                  77
dtype: int64

In [49]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=42, test_size=0.25)

In [50]:
ytrainM ,ytrainR, ytrainW = (
    ytrain['math score'].values,
    ytrain['reading score'].values,
    ytrain['writing score'].values
)

ytestM ,ytestR, ytestW = (
    ytest['math score'].values,
    ytest['reading score'].values,
    ytest['writing score'].values
)

In [51]:
xtrain.shape

(750, 7)

In [52]:
import keras
import tensorflow as tf

In [53]:
input_ = keras.layers.Input(shape=xtrain.shape[1:])

oculta1 = keras.layers.Dense(30, activation='relu', name='camadaOculta_1')(input_)
oculta2 = keras.layers.Dense(15, activation='relu', name='camadaOculta_2')(oculta1)

concat  = keras.layers.concatenate([input_,oculta2], name='concat')

output1 = keras.layers.Dense(1)(concat)
output2 = keras.layers.Dense(1)(oculta2)
output3 = keras.layers.Dense(1)(oculta2)

model = keras.Model(inputs=[input_],outputs=[output1, output2, output3])

In [65]:
tf.keras.utils.plot_model(
    model,
    to_file='NeuralNetworkPlot/modelStudents.png',
    show_shapes=False,
    show_dtype=False,
    show_layer_names=True,
    rankdir='TB',
    expand_nested=False,
    dpi=96,
    layer_range=None,
    show_layer_activations=True,
    show_trainable=False
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 7)]                  0         []                            
                                                                                                  
 camadaOculta_1 (Dense)      (None, 30)                   240       ['input_2[0][0]']             
                                                                                                  
 camadaOculta_2 (Dense)      (None, 15)                   465       ['camadaOculta_1[0][0]']      
                                                                                                  
 concat (Concatenate)        (None, 22)                   0         ['input_2[0][0]',             
                                                                     'camadaOculta_2[0][0]']

In [62]:
model.compile(
    loss = [
        keras.losses.mean_absolute_error,
        keras.losses.mean_absolute_error,
        keras.losses.mean_absolute_error
    ],
    optimizer = keras.optimizers.SGD()
)
history = model.fit(
    xtrain, (ytrainM ,ytrainR ,ytrainW ), epochs=30, batch_size=10
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [63]:
p1, p2, p3 = model.predict(xtest)



In [64]:
mean_absolute_error(p1,ytestM), mean_absolute_error(p2,ytestR)

(11.668461151123047, 11.934859130859374)