Luego de verificar los datos se procedio a implementar un modelo de arbol de decisión clasificador

In [63]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [64]:
# Preprocesado y modelado
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

In [65]:
# Lectura de datos
# ------------------------------------------------------------------------------

data = pd.read_excel(r'E-Commerce_train.xlsx')
data_test = pd.read_excel(r'E-Commerce_test.xlsx')

df = data
df_test = data_test

Al tener variables categoricas en ambos datasets se procedio a reemplazar sus valores numericos

In [66]:
df['Warehouse_block']=df['Warehouse_block'].map({'A':0,'B':1,'C':2,'D':3,'F':4})
df['Mode_of_Shipment']=df['Mode_of_Shipment'].map({'Flight':0,'Ship':1,'Road':2})
df['Product_importance']=df['Product_importance'].map({'low':0,'medium':1,'high':2})
df=pd.get_dummies(df,columns=['Gender'],drop_first=True)

In [67]:
df_test['Warehouse_block']=df_test['Warehouse_block'].map({'A':0,'B':1,'C':2,'D':3,'F':4})
df_test['Mode_of_Shipment']=df_test['Mode_of_Shipment'].map({'Flight':0,'Ship':1,'Road':2})
df_test['Product_importance']=df_test['Product_importance'].map({'low':0,'medium':1,'high':2})
df_test=pd.get_dummies(df_test,columns=['Gender'],drop_first=True)

In [68]:
# Se dropea las columnas ID de ambos datasets

df.drop(columns=['ID'])
df_test.drop(columns=['ID'])

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Discount_offered,Weight_in_gms,Gender_M
0,4,1,5,1,189,2,1,10,5248,1
1,3,1,5,3,150,2,2,9,4446,0
2,4,1,3,3,200,3,0,3,5739,1
3,0,1,5,1,133,2,1,1,4643,1
4,1,1,4,2,260,3,0,4,5504,1
...,...,...,...,...,...,...,...,...,...,...
1995,0,1,4,1,252,5,1,1,1538,0
1996,1,1,4,1,232,5,1,6,1247,0
1997,2,1,5,4,242,5,0,4,1155,0
1998,4,1,5,2,223,6,1,2,1210,1


In [69]:
# División de los datos en train y test
# ------------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
                                        df.drop(columns ='Reached.on.Time_Y.N'),
                                        df['Reached.on.Time_Y.N'], test_size=0.3,
                                        random_state = 42
                                    )

A continuación, se crearon tres pipelines, con tres modelos a evaluar, los modelos elegidos fueron: regresión logística, Decision Tree Classifier y Random Forest Classifier. Respecto al preprocesamiento, aplicaremos un escalado de datos con `Standard Scaler` y haremos reducción de dimensionalidad con `PCA`.

In [70]:
# Creamos el primer pipeline

pipe_lr = Pipeline([('scl', StandardScaler()),
			('pca', PCA(n_components = 2)),
			('clf', LogisticRegression(random_state = 42))])

In [71]:
# Creamos el segundo pipeline

pipe_dt = Pipeline([('scl', StandardScaler()),
			('pca', PCA(n_components = 2)),
			('clf', DecisionTreeClassifier( max_depth = 10,
    										random_state = 42))])

In [72]:
# Creamos el tercer pipeline

pipe_rf = Pipeline([('scl', StandardScaler()),
			('pca', PCA(n_components = 2)),
			('clf', RandomForestClassifier( max_depth = 8,
    										random_state = 42))])

In [73]:
# Los guardamos en una lista y en un diccionario

pipelines = [pipe_lr, pipe_dt, pipe_rf]
pipe_dict = {0: 'Regresión Logística', 1: 'Árbol de decisión', 2: 'Random Forest'}

In [74]:
# Entrenamos

for pipe in pipelines:
	pipe.fit(X_train, y_train)

In [75]:
# Evaluamos

for idx, val in enumerate(pipelines):
	print('%s pipeline accuracy en test: %.3f' % (pipe_dict[idx], val.score(X_test, y_test)))

Regresión Logística pipeline accuracy en test: 0.676
Árbol de decisión pipeline accuracy en test: 0.696
Random Forest pipeline accuracy en test: 0.704


In [76]:
# Identificamos el mejor modelo para el set de testeo

best_acc = 0.0
best_clf = 0
best_pipe = ''
for idx, val in enumerate(pipelines):
	if val.score(X_test, y_test) > best_acc:
		best_acc = val.score(X_test, y_test)
		best_pipe = val
		best_clf = idx
print('Modelo con el mejor accuracy: %s' % pipe_dict[best_clf])

Modelo con el mejor accuracy: Random Forest


In [77]:
# Guardamos el mejor modelo en una nueva variable
modelo = pipelines[best_clf].fit(X_train, y_train)

In [78]:
# Error de test del modelo
#-------------------------------------------------------------------------------
predicciones = modelo.predict(X = X_test,)

print("Matriz de confusión")
print("-------------------")
confusion_matrix(
    y_true    = y_test,
    y_pred    = predicciones
)

Matriz de confusión
-------------------


array([[ 871,   73],
       [ 726, 1030]], dtype=int64)

In [79]:
accuracy = accuracy_score(
            y_true    = y_test,
            y_pred    = predicciones,
            normalize = True
            )
print(f"El accuracy de test es: {accuracy}")

El accuracy de test es: 0.7040740740740741


In [80]:
print(f'El score del modelo es:, {recall_score(y_test, predicciones)}')

El score del modelo es:, 0.5865603644646925


In [81]:
# Se realizo la prediccion y se creo el dataframe que lo contega
# ------------------------------------------------------------------------------

predicciones2 = pipe_rf.predict(X = df_test,)
j4carlos = pd.DataFrame(predicciones2, columns=['pred'])

In [82]:
# Se genera el .csv para exportar
# ------------------------------------------------------------------------------
j4carlos.to_csv("j4carlos.csv",index=False)