# Flujo Completo CLASIFICACIÓN

In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Importamos datos en un Dataframe

In [3]:
df_data = pd.read_excel('Ingresos_por_persona.xlsx')

In [4]:
df_data.head() # Variable output --> "Ingresos"

Unnamed: 0,edad,nivel_educ,raza,sexo,ganancias_capital,perdidas_capital,Horas_semana,Ingresos
0,39,13,White,Male,2174,0,40,<=50K
1,50,13,White,Male,0,0,13,<=50K
2,38,9,White,Male,0,0,40,<=50K
3,53,7,Black,Male,0,0,40,<=50K
4,28,13,Black,Female,0,0,40,<=50K


In [5]:
df_data = df_data[0:1000]

In [6]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   edad               1000 non-null   int64 
 1   nivel_educ         1000 non-null   int64 
 2   raza               1000 non-null   object
 3   sexo               1000 non-null   object
 4   ganancias_capital  1000 non-null   int64 
 5   perdidas_capital   1000 non-null   int64 
 6   Horas_semana       1000 non-null   int64 
 7   Ingresos           1000 non-null   object
dtypes: int64(5), object(3)
memory usage: 62.6+ KB


# Convertir Variables categóricas de entrada (X)

In [7]:
df_X = df_data.drop('Ingresos', axis=1)

In [8]:
df_X = pd.get_dummies(df_X)

In [9]:
df_X.head()

Unnamed: 0,edad,nivel_educ,ganancias_capital,perdidas_capital,Horas_semana,raza_ Amer-Indian-Eskimo,raza_ Asian-Pac-Islander,raza_ Black,raza_ Other,raza_ White,sexo_ Female,sexo_ Male
0,39,13,2174,0,40,False,False,False,False,True,False,True
1,50,13,0,0,13,False,False,False,False,True,False,True
2,38,9,0,0,40,False,False,False,False,True,False,True
3,53,7,0,0,40,False,False,True,False,False,False,True
4,28,13,0,0,40,False,False,True,False,False,True,False


# Crear un array (dataframe) con las variables de entrada (X) y otro para la variable de salida (y)

In [10]:
X = df_X.values
y = df_data['Ingresos'].values

# Dividir datos en conjunto de "Training" (ej: 80%) y conjunto de "Test" (ej:20%)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y) #stratify --> datos_etiquetados
#Escalar los datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X es tu matriz de características

# Construir Modelos en base a los diferentes algoritmos

In [12]:
models = [('LR', LogisticRegression()),
          ('LDA', LinearDiscriminantAnalysis()),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()),
          ('NB', GaussianNB()),
          ('SVM', SVC())]

# Evaluar cada modelo

In [13]:
results = []
names = []
for name, model in models:
 kf = KFold(n_splits=10, shuffle=True, random_state=42)
 cv_results = model_selection.cross_val_score(model, X_scaled, y, cv=kf)
 results.append(cv_results)
 names.append(name)
 msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
 print(msg)

LR: 0.813000 (0.028653)
LDA: 0.812000 (0.025219)
KNN: 0.807000 (0.027946)
CART: 0.769000 (0.036180)
NB: 0.307000 (0.032879)
SVM: 0.820000 (0.024900)


# Seleccionar mejor modelo tras benchmarking

In [48]:
svc = SVC()

# Optimizar y entrenar el modelo

In [66]:
import numpy as np
from sklearn.model_selection import GridSearchCV

Cs = [0.1, 1] # Menores combinaciones para mayor rapidez [0.001, 0.01, 0.1, 1, 10]
gammas = [0.1, 1] # Menores combinaciones para mayor rapidez [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
svc_cv = GridSearchCV(svc, param_grid, cv=5)
svc_cv.fit(X, y)
svc_cv.best_params_
svc_cv.best_score_

np.float64(0.774)

# Predecir Resultados de salida (y_prediction) a partir de nuevos datos de entrada (X_new)

In [67]:
df_new = pd.read_excel('Ingresos_nuevos_datos.xlsx')
df_X_new = pd.get_dummies(df_new)
X_new = df_X_new.values

In [68]:
X_new

array([[52, 9, 0, 0, 45, False, False, True, False, False, False, True],
       [43, 13, 0, 1500, 25, False, True, False, False, False, True,
        False],
       [40, 9, 2000, 0, 40, False, False, False, True, False, False,
        True],
       [30, 12, 0, 0, 40, True, False, False, False, False, False, True],
       [22, 13, 0, 0, 50, False, False, False, False, True, True, False],
       [37, 14, 0, 550, 40, False, False, True, False, False, True,
        False],
       [47, 15, 0, 1902, 60, False, False, True, False, False, True,
        False],
       [50, 13, 0, 0, 55, False, False, True, False, False, False, True],
       [43, 10, 0, 0, 40, False, False, True, False, False, False, True],
       [42, 16, 0, 0, 45, False, False, True, False, False, False, True],
       [53, 9, 0, 0, 40, False, False, True, False, False, True, False],
       [49, 10, 0, 0, 50, False, False, True, False, False, False, True],
       [29, 13, 0, 0, 70, False, False, True, False, False, False, True]

In [69]:
y_prediction = svc_cv.predict(X_new)
print("Prediccion: {}".format(y_prediction))

Prediccion: [' >50K' ' <=50K' ' <=50K' ' <=50K' ' <=50K' ' <=50K' ' >50K' ' >50K'
 ' <=50K' ' >50K' ' <=50K' ' >50K' ' <=50K' ' >50K']
