In [1]:
import numpy as np
import pandas as pd 
import seaborn as sea 
import matplotlib.pyplot as plt
from sklearn.metrics import *

In [2]:
data = pd.read_csv('datasets/StudentPrediction/StudentsPerformance.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [4]:
data.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93


In [5]:
conj = data.isna().nunique().reset_index()
conj.loc[conj[0]>1]

Unnamed: 0,index,0


Não há valores nulos

In [6]:
subdata = data.nunique().reset_index()
subdata = subdata.loc[subdata[0]==2]
subdata.set_index('index').T.columns

Index(['gender', 'lunch', 'test preparation course'], dtype='object', name='index')

In [7]:
subdata = data.nunique().reset_index()
subdata = subdata.loc[subdata[0]>2]
subdata.set_index('index').T.columns

Index(['race/ethnicity', 'parental level of education', 'math score',
       'reading score', 'writing score'],
      dtype='object', name='index')

### Correlação

In [8]:
from scipy.stats import chi2_contingency, pointbiserialr

class Correlation:
    M_ = None
    
    def __init__(self, column1:list, column2:list, data:pd.DataFrame):
        self.columnNames1 = column1
        self.columnNames2 = column2
        self.data         = data
        self.tupla        = ( len(self.columnNames1) , len(self.columnNames2) ) 

    def quiQuadrada(self):
        self.M_ = np.zeros(shape=self.tupla)
        
        for enumI, i in enumerate(self.columnNames1):
            for enumJ, j in enumerate(self.columnNames2):
                chi, p, _, _ = chi2_contingency( pd.crosstab(self.data[i].values, self.data[j].values) )
                self.M_[enumI,enumJ] = round(p,10)
        
        return pd.DataFrame(self.M_, columns=self.columnNames1, index=self.columnNames2)

    def pointBisserl(self):
        self.M_ = np.zeros(shape=self.tupla)
        for enumI, i in enumerate(self.columnNames1):
            for enumJ, j in enumerate(self.columnNames2):
                corr, p_value = pointbiserialr(self.data[i].values, self.data[j].values)
              
                self.M_[enumI, enumJ] = corr
        return pd.DataFrame(self.M_.T, columns=self.columnNames1, index=self.columnNames2)

&nbsp;

## Pré processamento

In [9]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

Transformação de atributos

In [10]:
class TransformandoAtributosBin(BaseEstimator,TransformerMixin):
    def __init__(self,columns:list|str):
        self.columns = columns
        
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        if (type(self.columns) == list):
            for column in self.columns:
                X[column] = LabelBinarizer().fit_transform(X[column])
        else:
            X[self.columns] = LabelBinarizer().fit_transform(X[self.columns].values)
        return X

In [11]:
class TransformandoAtributosEnc(BaseEstimator,TransformerMixin):
    def __init__(self,columns:list|str):
        self.columns = columns

    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        if (type(self.columns) == list):
            for column in self.columns:
                X[column] = LabelBinarizer().fit_transform(X[column])
        else:
            X[self.columns] = LabelEncoder().fit_transform(X[self.columns].values)
        return X

In [12]:
class Dummies(BaseEstimator,TransformerMixin):
    def __init__(self,columns:list | str) -> None:
        self.columns = columns 

    def fit(self,X,y=None):
        return self
    
    def transform(self, X:pd.DataFrame):
        return pd.get_dummies(X,columns=self.columns,dtype=int)

#### Pipeline

In [13]:
pipeline = Pipeline(
    steps=[
        ('Binarizer',TransformandoAtributosBin(columns=['gender', 'lunch', 'test preparation course'])),
        ('Encoder'  ,TransformandoAtributosEnc(columns=['race/ethnicity','parental level of education'])),
        ('Dummies'  ,Dummies(columns=['race/ethnicity','parental level of education']))
    ]
)
dataset = pipeline.fit_transform(data)

In [14]:
num = ['math score','reading score', 'writing score']
data[num].corr()

Unnamed: 0,math score,reading score,writing score
math score,1.0,0.81758,0.802642
reading score,0.81758,1.0,0.954598
writing score,0.802642,0.954598,1.0


In [15]:
data[num].describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


## Rede Neural

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
X = dataset.drop(columns=['reading score','writing score'])
y = dataset[['reading score','writing score']]

In [50]:
data.nunique()

gender                          2
race/ethnicity                  2
parental level of education     2
lunch                           2
test preparation course         2
math score                     81
reading score                  72
writing score                  77
dtype: int64

In [51]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=42, test_size=0.25)

In [53]:
ytrainR, ytrainW = (
    ytrain['reading score'].values,
    ytrain['writing score'].values
)

ytestR, ytestW = (
    ytest['reading score'].values,
    ytest['writing score'].values
)

In [54]:
xtrain.shape

(750, 8)

In [55]:
import keras
import tensorflow as tf
from functools import partial

In [56]:
input_ = keras.layers.Input(shape=xtrain.shape[1:])

Dense = partial(
    keras.layers.Dense,
    activation=keras.activations.elu,
    kernel_initializer=keras.initializers.he_normal
)
oculta0 = Dense(60, name='camadaOculta_0')(input_)
oculta1 = Dense(30, name='camadaOculta_1')(oculta0)
oculta2 = Dense(15, name='camadaOculta_2')(oculta1)

oculta3 = Dense(7, name='camadaOculta_3')(oculta2)
oculta4 = Dense(3, name='camadaOculta_4')(oculta3)

output2 = keras.layers.Dense(1)(oculta4)
output3 = keras.layers.Dense(1)(oculta4)

model = keras.Model(inputs=[input_],outputs=[output2, output3])

In [57]:
tf.keras.utils.plot_model(
    model,
    to_file='NeuralNetworkPlot/modelStudents.png',
    show_shapes=False,
    show_dtype=False,
    show_layer_names=True,
    rankdir='TB',
    expand_nested=False,
    dpi=96,
    layer_range=None,
    show_layer_activations=True,
    show_trainable=False
)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 8)]                  0         []                            
                                                                                                  
 camadaOculta_0 (Dense)      (None, 60)                   540       ['input_4[0][0]']             
                                                                                                  
 camadaOculta_1 (Dense)      (None, 30)                   1830      ['camadaOculta_0[0][0]']      
                                                                                                  
 camadaOculta_2 (Dense)      (None, 15)                   465       ['camadaOculta_1[0][0]']      
                                                                                            

In [64]:
model.compile(
    loss = [
        keras.losses.mean_squared_error,
        keras.losses.mean_squared_error
    ],
    optimizer = keras.optimizers.Adam()
)
history = model.fit(
    xtrain, ( ytrainR ,ytrainW ), epochs=20, batch_size=32, verbose=0
)

In [60]:
p2, p3 = model.predict(xtest)



In [61]:
f'''
    {mean_absolute_error(p2,ytestR)}
    {mean_absolute_error(p3,ytestW)}
'''

'\n    64.58101810216904\n    62.6221444067955\n'

In [63]:
r2_score(p2,ytestR)

-844644.7089191539