# 0. Importación de librerías 

En las siguientes líneas de código se importan las librerías y herramientas necesarias para desarrollar el caso de uso.

In [1]:
seed = 161
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Composicion de pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer

# Regresion lineal
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

# Importar/ Exportar modelos
from joblib import dump, load

# 1. Carga de los datos
A través de la librería **pandas** podemos realizar la carga de datos desde diferentes fuentes de información, en este caso se realizará la carga de un archivo plano csv, el separador del archivo es ; y se remplazaron los valores "-" por valores nulos.

In [2]:
# Se cargan los datos. 
df_datos=pd.read_csv('202210_Laboratorio3_data_datos_recientes.csv', sep=',', encoding = 'utf-8', header=0, na_values=["NA-VALUE"])

In [3]:
# Cantidad de datos y número de variables
df_datos.shape

(294, 19)

In [4]:
# Mostrar los datos
df_datos.head()

Unnamed: 0.1,Unnamed: 0,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 10-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,2644,151.0,0,1.8,423.295351,9.0,0,68.6,0,91.0,4.87,9.0,0.1,2284.37858,146.0,0.1,0.1,0.693,14.6
1,2645,153.0,0,1.79,45.851058,85.0,0,67.8,0,91.0,5.9,9.0,0.1,229.714718,99789.0,0.1,0.1,0.683,13.7
2,2646,155.0,0,1.51,310.820338,88.0,0,67.0,0,85.0,5.3,84.0,0.1,1842.44421,99184.0,0.1,0.1,0.679,13.5
3,2647,157.0,0,1.35,330.100739,91.0,4,66.2,0,91.0,5.66,89.0,0.1,1837.977391,98611.0,0.1,0.1,0.674,13.2
4,2648,158.0,0,1.24,40.491289,93.0,0,65.5,0,91.0,4.75,91.0,0.1,263.27236,9882.0,0.1,0.1,0.676,13.7


In [5]:
# Podemos ver los tipos de todas la variables.
df_datos.dtypes

Unnamed: 0                           int64
Adult Mortality                    float64
infant deaths                        int64
Alcohol                            float64
percentage expenditure             float64
Hepatitis B                        float64
Measles                              int64
BMI                                float64
under-five deaths                    int64
Polio                              float64
Total expenditure                  float64
Diphtheria                         float64
HIV/AIDS                           float64
GDP                                float64
Population                         float64
thinness  10-19 years              float64
thinness 5-9 years                 float64
Income composition of resources    float64
Schooling                          float64
dtype: object

In [6]:
# Y hacer una decsripción de los datos
df_datos.describe()

Unnamed: 0.1,Unnamed: 0,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 10-19 years,thinness 5-9 years,Income composition of resources,Schooling
count,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0
mean,2790.5,180.156463,22.748299,4.031327,250.691789,67.258503,2299.707483,39.811565,31.921769,82.459184,5.934014,80.578231,2.866327,2888.804225,4541904.0,5.141497,5.135374,0.492966,9.931293
std,85.014705,149.969676,28.065706,3.411991,636.324313,35.669719,6887.681389,20.32378,43.125549,21.932024,3.285364,24.922111,6.876873,7269.383426,12934990.0,4.007686,4.123723,0.289985,4.827973
min,2644.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.0,0.0,5.0,0.1,0.0,0.0,0.1,0.1,0.0,0.0
25%,2717.25,82.0,3.0,1.375,0.0,53.0,0.0,19.3,3.0,75.0,4.4,75.0,0.1,0.0,0.0,1.625,1.6,0.415,9.725
50%,2790.5,153.0,10.0,2.72,27.137321,83.0,55.5,43.0,12.0,92.0,5.405,92.0,0.1,430.82407,105519.0,5.05,4.9,0.6005,11.1
75%,2863.75,231.0,29.0,6.6325,194.536691,94.0,816.5,57.475,42.0,96.0,7.075,96.0,0.775,2244.678564,2482152.0,6.675,6.7,0.71575,12.975
max,2937.0,723.0,116.0,12.22,4003.908598,99.0,49871.0,79.3,191.0,99.0,17.6,99.0,43.5,45758.9554,78271470.0,15.8,16.4,0.836,15.7


# 2. Limpieza y preparación de los datos

Primero vamos a ejecutar los pasos de limpieza de los datos, relacionados el tratamiento de ausencias y registros duplicados.

In [7]:
# Es recomendable que todos los pasos de limpieza y preparación se realicen sobre otro archivo.
df_datos_t = df_datos

In [8]:
# Se observa que hay ausencias, sin embargo no son una cantidad significativa:
df_datos_t.isnull().sum()

Unnamed: 0                         0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
BMI                                0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
HIV/AIDS                           0
GDP                                0
Population                         0
thinness  10-19 years              0
thinness 5-9 years                 0
Income composition of resources    0
Schooling                          0
dtype: int64

In [9]:
# Eliminación registros con ausencias
df_datos_t = df_datos_t.dropna()
# Eliminación de registros duplicados.
df_datos_t = df_datos_t.drop_duplicates()

In [10]:
(df_datos_t==0).sum()

Unnamed: 0                           0
Adult Mortality                      1
infant deaths                       45
Alcohol                             17
percentage expenditure             113
Hepatitis B                         34
Measles                             84
BMI                                  0
under-five deaths                   38
Polio                                0
Total expenditure                   18
Diphtheria                           0
HIV/AIDS                             0
GDP                                 96
Population                         112
thinness  10-19 years                0
thinness 5-9 years                   0
Income composition of resources     67
Schooling                           50
dtype: int64

In [11]:
#Si juega 16 horas al dia puede lograr maximo 112 horas a la semana
df_datos_t = df_datos_t[df_datos_t.HoursPerWeek<112]
# El juego tiene 10 años por lo que el maximo de horas posibles son 80640
df_datos_t = df_datos_t[df_datos_t.TotalHours<80640]
# Nos interesa la información de los jugadores, si la persona no juega no va a tener una liga asignada
df_datos_t = df_datos_t[df_datos_t.HoursPerWeek>0]
# De acuerdo al diccionario el maximo para la variable LeagueIndex es 10
df_datos_t = df_datos_t[df_datos_t.LeagueIndex<11]

AttributeError: 'DataFrame' object has no attribute 'HoursPerWeek'

In [None]:
# Cantidad de datos y número de variables
df_datos_t.shape

# 3. Perfilamiento y entendimiento de los datos

## 3.1 Búsqueda de relaciones entre variables (diagramas de dispersión)

In [None]:
sns.pairplot(df_datos_t, height=3, y_vars = 'LeagueIndex', x_vars = df_datos_t.columns[0:5], kind='scatter')
sns.pairplot(df_datos_t, height=3, y_vars = 'LeagueIndex', x_vars = df_datos_t.columns[5:10], kind='scatter')
sns.pairplot(df_datos_t, height=3, y_vars = 'LeagueIndex', x_vars = df_datos_t.columns[10:15], kind='scatter')
sns.pairplot(df_datos_t, height=3, y_vars = 'LeagueIndex', x_vars = df_datos_t.columns[15:], kind='scatter')

##  3.2 Búsqueda de relaciones entre variables (Matriz de correlaciones)

In [None]:
f = plt.figure(figsize=(10, 10))
plt.matshow(df_datos_t.corr(), fignum=f.number, cmap = 'seismic')
plt.xticks(range(df_datos_t.select_dtypes(['number']).shape[1]), df_datos_t.select_dtypes(['number']).columns, fontsize=14, rotation=45)
plt.yticks(range(df_datos_t.select_dtypes(['number']).shape[1]), df_datos_t.select_dtypes(['number']).columns, fontsize=14)
cb = plt.colorbar()
_ = cb.ax.tick_params(labelsize=14)

# Selección de variables del modelo
Para seleccionar las variables del modelo se prueba el desempeño individual de cada una de las variables , midiendo su error cuadratico medio (MSE) y su coeficiente de determinación ajustado (R2).

In [None]:
# Se selecciona la variable objetivo, en este caso "Outcome".
Y = df_datos_t['LeagueIndex']
# Del conjunto de datos se elimina la variable "Outcome"
X = df_datos_t.drop(['LeagueIndex'], axis=1)

In [None]:
# Dividimos los datos en conjunto de prueba y conjunto de entrenamiento
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

print('El conjunto de entrenamiento tiene ' + str(len(X_train.index)) + ' muestras' )
print('El conjunto de prueba tiene ' + str(len(X_test.index)) + ' muestras' )

### Grafica MSE y R2
Resultado del MSE y R2 realizando un entrenamiento y prediccion para cada variable individual 

In [None]:
mse_list=[]
score=[]
variables = ['Age', 'HoursPerWeek', 'TotalHours', 'APM',
       'SelectByHotkeys', 'AssignToHotkeys', 'UniqueHotkeys', 'MinimapAttacks',
       'MinimapRightClicks', 'NumberOfPACs', 'GapBetweenPACs', 'ActionLatency',
       'ActionsInPAC', 'TotalMapExplored', 'WorkersMade', 'UniqueUnitsMade',
       'ComplexUnitsMade', 'ComplexAbilitiesUsed']
for i in variables:
  X_temp = X_train[[i]]
  X_temp_test = X_test[[i]]
  
  model = LinearRegression()
  model.fit(X_temp, Y_train)

  y_pred = model.predict(X_temp_test)
  score.append(model.score(X_temp_test, Y_test))
  mse_list.append(mse(Y_test, y_pred))


fig = plt.figure(figsize=(15,20))
fig.subplots_adjust(top=0.8)

ax1 = fig.add_subplot(211)
ax1.set_ylabel('MSE', fontsize=14)
ax1.set_xlabel('Variable', fontsize=14)
ax1.set_title('MSE', fontsize=18)
ax1.bar(variables,mse_list, align='center')
plt.xticks(rotation=90)

ax2 = fig.add_subplot(212)
ax2.set_ylabel('R2', fontsize=14)
ax2.set_xlabel('Variable', fontsize=14)
ax2.set_title('R2', fontsize=18)
ax2.bar(variables,score, align='center')
plt.xticks(rotation=90)

### Linealidad
Se plotea el modelo lineal que se obtiene al usar solo una variable del conjunto de datos

In [None]:
i = 0
for var in variables:
    i += 1
    X_temp = X_train[[var]]
    X_test_temp = X_test[[var]]

    model = LinearRegression()
    model.fit(X_temp, Y_train)

    y_pred = model.predict(X_test_temp)
    plt.rcParams['figure.figsize'] = [20, 20]
    plt.subplot(4, 5, i)
    plt.xlabel(var, fontsize=14)
    plt.ylabel("LeagueIndex", fontsize=14)
    plt.plot(X_test_temp, y_pred, X_test_temp , Y_test, 'k.')

    
plt.rcParams['figure.figsize'] = [6.4, 4.8]

# Modelos

Para modelar el comportamiento de los datos y poder sacar predicciones se usa un modelo de regresión lineal. La regresión Lineal busca una aproximación con coeficientes $\beta$ = [ $\beta_1$, $\dots$, $\beta_n$] que minimizen el error cuadratico medio entre los datos observados y la predicción modelada por una aproximación lineal.Para modelar el comportamiento de los datos y poder sacar predicciones se usa un modelo de regresión lineal. La regresión Lineal busca una aproximación con coeficientes $\beta$ = [ $\beta_1$, $\dots$, $\beta_n$] que minimizen el error cuadratico medio entre los datos observados y la predicción modelada por una aproximación lineal.

In [None]:
def transformacion(x):
    return x**0.5

In [None]:
def create_pipe(selected_cols_p, transform=False):
    pre = [('initial',ColumnTransformer([("selector", 'passthrough',selected_cols_p)])),]

    na_cleanup = [('na_cleanup', SimpleImputer(strategy="median")),]

    std = [('std_scaler', StandardScaler()),]

    m = [('model', LinearRegression()),]
    
    transformer = [("transformer",FunctionTransformer(transformacion)),]
    
    if transform:
        pipeline = Pipeline(pre+na_cleanup+transformer+std+m)
    else:
        pipeline = Pipeline(pre+na_cleanup+std+m)
    pipeline = pipeline.fit(X_train, Y_train)
    return pipeline

### Modelo 1.1
Los parametros se seleccionan de acuerdo a la seccion anterior, para el primer modelo se toman las 11 variables que tienen menor error cuadratico medio

In [None]:
selected_cols=[]

for i in range(0, 6):
  minimo=10000
  act=""
  for j in range (0,len(mse_list)):
    if mse_list[j]<minimo and variables[j] not in selected_cols:
      minimo=mse_list[j]
      act=variables[j]
  selected_cols.append(act)
print('Las variables seleccionadas son: ')
for v in selected_cols:
  print('', f'* {v.strip()}')

In [None]:
pipe = create_pipe(selected_cols)

In [None]:
print("R2: ",pipe.score(X_test, Y_test))
print("ECM: " , mse(Y_test, pipe.predict(X_test)))

### Modelo 1.2
Para el modelo 1.2 se tomaron las variables con mayor coeficiente de determinación ajustado (R2)

In [None]:
selected_cols=[]

for i in range(0, 8):
  maxi=0
  act=""
  for j in range (0,len(score)):
    if score[j]>maxi and variables[j] not in selected_cols:
      maxi=score[j]
      act=variables[j]
  selected_cols.append(act)

print('Las variables seleccionadas son: ')
for v in selected_cols:
  print('', f'* {v.strip()}')

In [None]:
pipe = create_pipe(selected_cols)

In [None]:
print("R2: ",pipe.score(X_test, Y_test))
print("ECM: " , mse(Y_test, pipe.predict(X_test)))

### Modelo 1.3
Para el modelo 1.3 se realizan transformaciones sobre los datos de entrada

In [None]:
variables = ['Age', 'HoursPerWeek', 'TotalHours', 'APM',
       'SelectByHotkeys', 'AssignToHotkeys', 'UniqueHotkeys', 'MinimapAttacks',
       'MinimapRightClicks', 'NumberOfPACs', 'GapBetweenPACs', 'ActionLatency',
       'ActionsInPAC', 'TotalMapExplored', 'WorkersMade', 'UniqueUnitsMade',
       'ComplexUnitsMade', 'ComplexAbilitiesUsed']
i = 0
for var in variables:
    i += 1
    X_temp = X_train[[var]]**0.5
    X_test_temp = X_test[[var]]**0.5
  
    
    model = LinearRegression()
    model.fit(X_temp, Y_train)

    y_pred = model.predict(X_test_temp)
    
   
    
    plt.rcParams['figure.figsize'] = [20, 40]
    plt.subplot(10, 4, i)
    plt.xlabel(var, fontsize=14)
    plt.ylabel("LeagueIndex", fontsize=14)
    plt.plot(X_test_temp, y_pred, X_test_temp , Y_test, 'k.')
    
    
    
    
    i += 1
    
    X_temp =X_train[[var]]
    X_test_temp =X_test[[var]]
    model = LinearRegression()
    model.fit(X_temp, Y_train)

    y_pred = model.predict(X_test_temp)
    plt.rcParams['figure.figsize'] = [20, 20]
    plt.subplot(10, 4, i)
    plt.xlabel(var, fontsize=14)
    plt.ylabel("LeagueIndex", fontsize=14)
    plt.plot(X_test_temp, y_pred, X_test_temp , Y_test, 'k.')
    
plt.rcParams['figure.figsize'] = [6.4, 4.8]

In [None]:
mse_list=[]
score=[]
variables = ["ActionLatency", "APM", "NumberOfPACs", "GapBetweenPACs", "AssignToHotkeys", "SelectByHotkeys", "MinimapAttacks"]
variables=['TotalHours', 'APM',
       'SelectByHotkeys', 'AssignToHotkeys', 'MinimapAttacks',
       'MinimapRightClicks', 'NumberOfPACs', 'GapBetweenPACs', 'ActionLatency',
       'ActionsInPAC', 'TotalMapExplored', 'WorkersMade']
pipe = create_pipe(variables, transform=True)
print("R2: ",pipe.score(X_test, Y_test))
print("ECM: " , mse(Y_test, pipe.predict(X_test)))



# Exportar e Importar el Modelo

In [None]:
df_recent=pd.read_csv('recent.csv', sep=',', encoding = 'utf-8', header=0, na_values=["NA-VALUE"])

In [None]:
df_recent.shape

In [None]:
 # Usamos la lbreria joblib
filename = 'pipeline.joblib'
# Se guarda
dump(pipeline, filename)   # Se lee
p2 = load(filename)

p2  # Clasificamos los datos recientes
df_recent['LeagueIndex'] = p2.predict(df_recent)  
sns.histplot(df_recent['LeagueIndex'])

# Coeficientes

In [None]:
pipeline['model'].coef_  # En DataFrame
pd.DataFrame({'columns':selected_cols, 'coef':pipeline['model'].coef_})