# Regresión Lineal Multiple

## Paso 1: Importar librerías

In [56]:
##Warnings
import warnings
warnings.filterwarnings('ignore')

##test de normalidad
from scipy import stats

##Librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

##Normalización
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

##Función para el modelo 
import statsmodels.api as sm

## Paso 2: Funciones


In [57]:
# Función para importar data .xlsx o .csv
def importDatos(ruta, archivo, ext, sep):
    if ext == '.csv':
        data = pd.read_csv(ruta+archivo+ext, sep)
        return data
    elif ext == '.xlsx':
        data = pd.read_excel(ruta+archivo+ext)
        return data
    else:
        print('Extensión diferente')
        
def diagCaja(df, coluno, coldos, coltres):
    fig, ax = plt.subplots(3, figsize=(5,5))
    plt1 = sns.boxplot(df[coluno], ax=ax[0])
    plt2 = sns.boxplot(df[coldos], ax=ax[1])
    plt3 = sns.boxplot(df[coltres], ax=ax[2])
    plt.tight_layout()

##Función para cruce de variables
def grafT(df, vect, vary):
    sns.pairplot(df, x_vars=vect, y_vars=vary, height=4, aspect=1, kind='reg')

# Función para el test de Shapiro - wilk - resistente al ruido en los datos

def testShapiroWilk(df):
    valoresP = []
    concepto = []
    variable = []
    for column in df:
        k2, p_value = stats.shapiro(df[column].values)
        valoresP.append(p_value)
        variable.append(column)
        if (p_value < 0.05):
            concepto.append('No es una variable Normal')
        else:
            concepto.append('Es una variable Normal')
    dfShapiro = pd.DataFrame(
        {'Variable': variable, 'Valores P': valoresP, 'Concepto': concepto})
    return dfShapiro

# Función para normalizar la data min - max sklearn


def normaData(df):
    valores = df.values  
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler = scaler.fit(valores)

    pd.DataFrame(np.vstack((scaler.data_min_, scaler.data_max_)),
             index=['Min', 'Max'],
             columns=df.columns)

    normalizados = scaler.transform(valores)
    df_norm = pd.DataFrame(normalizados,
                       index=df.index,
                       columns=df.columns)
    return df_norm

## Función para hacer la matriz de correlación de manera Tidy
def tidy_corr_matrix(corr_mat):
    '''
    Función para convertir una matriz de correlación de pandas en formato tidy.
    '''
    corr_mat = corr_mat.stack().reset_index()
    corr_mat.columns = ['variable_1','variable_2','r']
    corr_mat = corr_mat.loc[corr_mat['variable_1'] != corr_mat['variable_2'], :]
    corr_mat['abs_r'] = np.abs(corr_mat['r'])
    corr_mat = corr_mat.sort_values('abs_r', ascending=False)
    
    return(corr_mat)

##MultiGraficas
def multigraf(df, coluno, coldos):
    g = sns.FacetGrid(df, col=coluno)
    g.map(sns.histplot, coldos)
    return g

def multigrafDos(df, coluno, coldos, coltres):
  h = sns.FacetGrid(df, col=coluno, height=4, aspect=.5)
  h.map(sns.barplot, coldos, coltres, order=['M','F'])
  return h

def barrasM(df, colx, coly, cruce):
  sns.catplot(x=colx, y=coly, hue=cruce, kind='bar', data=df)

def disper(df, cruce, num, numdos):
  g = sns.FacetGrid(df, hue=cruce, palette='flare', height=5)
  g.map(sns.scatterplot, num,numdos, s=100, alpha=.5)
  g.add_legend()
  return g

def multigrafT(df, var1, var2, var3, var4, var5):
  sns.pairplot(df, x_vars=[var1, var2, var3, var4], y_vars=var5, height=6, aspect=1, kind='scatter')

def corr(df):
  f, ax = plt.subplots(figsize=(18,10))
  sns.heatmap(df.corr(), annot=True, linewidths=.5, ax=ax)

    

## Paso 3: Importar Data

In [58]:
data = importDatos('../Datasets/','student-mat','.csv',';')
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [59]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,395.0,16.696203,1.276043,15.0,16.0,17.0,18.0,22.0
Medu,395.0,2.749367,1.094735,0.0,2.0,3.0,4.0,4.0
Fedu,395.0,2.521519,1.088201,0.0,2.0,2.0,3.0,4.0
traveltime,395.0,1.448101,0.697505,1.0,1.0,1.0,2.0,4.0
studytime,395.0,2.035443,0.83924,1.0,1.0,2.0,2.0,4.0
failures,395.0,0.334177,0.743651,0.0,0.0,0.0,0.0,3.0
famrel,395.0,3.944304,0.896659,1.0,4.0,4.0,5.0,5.0
freetime,395.0,3.235443,0.998862,1.0,3.0,3.0,4.0,5.0
goout,395.0,3.108861,1.113278,1.0,2.0,3.0,4.0,5.0
Dalc,395.0,1.481013,0.890741,1.0,1.0,1.0,2.0,5.0


## Paso 3: Multigráficas 

In [82]:
multigraf(data, 'traveltime','age')

<seaborn.axisgrid.FacetGrid at 0x1e93d074df0>

In [83]:
multigraf(data, 'studytime','age')

<seaborn.axisgrid.FacetGrid at 0x1e942561000>

In [62]:
multigraf(data, 'traveltime','sex')

<seaborn.axisgrid.FacetGrid at 0x1e940ada200>

In [63]:
multigraf(data, 'Medu','age')

<seaborn.axisgrid.FacetGrid at 0x1e93e398d60>

In [64]:
multigraf(data, 'Fedu','age')

<seaborn.axisgrid.FacetGrid at 0x1e9413372e0>

In [65]:
multigrafDos(data, 'Walc', 'sex','G3')

<seaborn.axisgrid.FacetGrid at 0x1e9416fa7a0>

In [66]:
multigrafDos(data, 'Walc', 'sex','G2')

<seaborn.axisgrid.FacetGrid at 0x1e941889660>

In [67]:
multigrafDos(data, 'Walc', 'sex','G1')

<seaborn.axisgrid.FacetGrid at 0x1e941be0b80>

In [68]:
barrasM(data, 'sex','G3', 'Dalc')

In [69]:
disper(data, 'sex', 'G3','absences')

<seaborn.axisgrid.FacetGrid at 0x1e9410993c0>

In [70]:
multigrafT(data, 'G1','G2','Walc','Medu','G3')

In [71]:
numerico = [i for i in data.columns if data[i].dtype!='O']
numerico

['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'Dalc',
 'Walc',
 'health',
 'absences',
 'G1',
 'G2',
 'G3']

In [72]:
datos = data[numerico]
datos.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,18,4,4,2,2,0,4,3,4,1,1,3,6,5,6,6
1,17,1,1,1,2,0,5,3,3,1,1,3,4,5,5,6
2,15,1,1,1,2,3,4,3,2,2,3,3,10,7,8,10
3,15,4,2,1,3,0,3,2,2,1,1,5,2,15,14,15
4,16,3,3,1,2,0,4,3,2,1,2,5,4,6,10,10


In [73]:
##Normalización
dataNorm = normaData(datos)

dataNorm.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,395.0,0.242315,0.182292,0.0,0.142857,0.285714,0.428571,1.0
Medu,395.0,0.687342,0.273684,0.0,0.5,0.75,1.0,1.0
Fedu,395.0,0.63038,0.27205,0.0,0.5,0.5,0.75,1.0
traveltime,395.0,0.149367,0.232502,0.0,0.0,0.0,0.333333,1.0
studytime,395.0,0.345148,0.279747,0.0,0.0,0.333333,0.333333,1.0
failures,395.0,0.111392,0.247884,0.0,0.0,0.0,0.0,1.0
famrel,395.0,0.736076,0.224165,0.0,0.75,0.75,1.0,1.0
freetime,395.0,0.558861,0.249716,0.0,0.5,0.5,0.75,1.0
goout,395.0,0.527215,0.27832,0.0,0.25,0.5,0.75,1.0
Dalc,395.0,0.120253,0.222685,0.0,0.0,0.0,0.25,1.0


In [74]:
##Test shapiro - wilk
testShapiroWilk(dataNorm)

Unnamed: 0,Variable,Valores P,Concepto
0,age,1.587755e-14,No es una variable Normal
1,Medu,2.815527e-18,No es una variable Normal
2,Fedu,2.7068350000000003e-17,No es una variable Normal
3,traveltime,2.310144e-27,No es una variable Normal
4,studytime,6.547699e-20,No es una variable Normal
5,failures,1.138359e-31,No es una variable Normal
6,famrel,3.9062609999999997e-20,No es una variable Normal
7,freetime,6.425508e-15,No es una variable Normal
8,goout,1.412838e-14,No es una variable Normal
9,Dalc,2.9680920000000004e-29,No es una variable Normal


In [75]:
corr_matrix = dataNorm.corr(method='spearman')

tidy_corr_matrix(corr_matrix)

Unnamed: 0,variable_1,variable_2,r,abs_r
254,G3,G2,0.957125,0.957125
239,G2,G3,0.957125,0.957125
237,G2,G1,0.894792,0.894792
222,G1,G2,0.894792,0.894792
223,G1,G3,0.878001,0.878001
...,...,...,...,...
44,Fedu,absences,0.003568,0.003568
131,goout,traveltime,-0.001430,0.001430
56,traveltime,goout,-0.001430,0.001430
112,freetime,age,0.000302,0.000302


In [76]:
corr(dataNorm)

## Paso 4: Modelado

In [77]:
X = dataNorm.drop(['G3'], axis=1)
y = dataNorm['G3']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.85, random_state=100)

ml = LinearRegression()

ml.fit(X_train, y_train)

y_pred1 = ml.predict(X_test)

round(r2_score(y_test, y_pred1),3)



0.788

In [78]:
from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(y_test, y_pred1)

round(rmse, 3)

0.008

In [79]:
ml2 = LinearRegression()

ml2.fit(X_test, y_test)

y_pred2 = ml2.predict(X_test)

round(r2_score(y_test, y_pred2),3)

0.847

In [80]:
rmse2 = mean_squared_error(y_test, y_pred2)

round(rmse2, 3)

0.006

In [81]:
ax1 = sns.distplot(y_test, hist=False, color='r', label='Valores actuales')
sns.distplot(y_pred2, hist=False, color='b', label='Valores predecidos')

<AxesSubplot:xlabel='G3', ylabel='Density'>