In [1]:
import pandas as pd
import numpy as np
# For compatibility across multiple platforms


In [2]:
# Load the raw training table; the path is relative to the notebook's working directory.
train_path = 'train.csv'
base = pd.read_csv(train_path)
base.head()

FileNotFoundError: File b'train.csv' does not exist

In [None]:
# Print column dtypes and non-null counts of the raw table.
base.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the raw table's columns.
base.describe()

In [None]:
# Every column except the target 'Purchase' is a candidate feature.
features = base.columns.drop('Purchase').tolist()

# Report the number of rows and the percentage of nulls per column.
n_rows = len(base)
null_counts = base.isnull().sum()
print ('Cantidad de datos: ', n_rows)
print ('% Nulos:')
print (null_counts * 100 / n_rows)

#### Imputación de nulos

In [None]:
# Product_Category_2/3 contain nulls, but they look constant per product:
# show every row of the first Product_ID to check this by eye.
first_product_id = base['Product_ID'].unique()[0]
base[base['Product_ID'] == first_product_id]

In [None]:
# NaN could be treated as one more category, so list the distinct values per column.
for category_col in ('Product_Category_1', 'Product_Category_2'):
    print (base[category_col].unique())

In [None]:
# Impute missing product categories with 0, i.e. treat "no category" as its own level.
# The result is assigned back to the frame instead of calling fillna(inplace=True)
# on an attribute-accessed column: that chained form warns in pandas 2.x and does
# not modify `base` at all once Copy-on-Write is enabled.
category_cols = ['Product_Category_2', 'Product_Category_3']
base[category_cols] = base[category_cols].fillna(0)

### Clasificación de variables

In [None]:
# describe() summarises only the numeric columns here, so its column index
# gives the numeric features; whatever is left is treated as categorical.
feature_frame = base[features]
num_features = feature_frame.describe().columns.tolist()
cat_features = feature_frame.drop(columns=num_features).columns.tolist()

for title, names in (('Variables Numericas', num_features),
                     ('\nVariables Categoricas', cat_features)):
    print (title)
    print (names)

In [None]:
import matplotlib.pyplot as plt

# One 20-bin histogram per numeric feature, each shown as its own figure.
for feature in num_features:
    ax = base[feature].hist(bins=20)
    ax.set_xlabel(feature)
    ax.set_ylabel('Frecuencia')
    plt.show()

In [None]:
# Check selected percentiles of every numeric feature (NaNs are ignored).
vector_percentiles = [0, 10, 50, 90, 95, 99, 100]
for feature, column in base[num_features].items():
    per = np.nanpercentile(column, vector_percentiles)
    print(feature, per)

In [None]:
# Distribution of the target variable.
# The x-axis is labelled with the target's own name; relying on a loop variable
# left over from an earlier cell would label this plot with an unrelated feature.
target_col = 'Purchase'
base[target_col].hist(bins=20)
plt.xlabel(target_col)
plt.ylabel('Frecuencia')
plt.show()

In [None]:
# Same percentile grid as for the features, now applied to the target.
target_name = 'Purchase'
per = np.nanpercentile(base[target_name], vector_percentiles)
print(target_name, per)

In [None]:
# Re-check dtypes and non-null counts of `base` at this point of the notebook.
base.info()

In [None]:
# Work on an independent (deep) copy so the raw table `base` stays untouched.
data = base.copy(deep=True)

### Conversión de categóricos

In [None]:
# Inspect the raw levels of these columns before encoding them.
for raw_col in ('City_Category', 'Stay_In_Current_City_Years'):
    print (data[raw_col].unique())

In [None]:
# Encode Stay_In_Current_City_Years as an integer; the open-ended '4+' bucket becomes 4.
# The mapping reads the untouched raw column from `base` (same index as `data`),
# so re-running this cell gives the same result instead of silently mapping the
# already-converted integers to NaN.
stay_years_codes = {'0': 0, '1': 1, '2': 2, '3': 3, '4+': 4}
data['Stay_In_Current_City_Years'] = base['Stay_In_Current_City_Years'].map(stay_years_codes)

In [None]:
# One-hot encode City_Category; drop_first=True omits the first level,
# which becomes the implicit baseline.
city_dummies = pd.get_dummies(data['City_Category'], prefix='ccat', drop_first=True)
data = pd.concat([data, city_dummies], axis=1).drop(columns='City_Category')
data.head()

In [None]:
# Inspect the raw levels of Age and Gender before encoding them.
for raw_col in ('Age', 'Gender'):
    print (data[raw_col].unique())

In [None]:
# Ordinal code for the age brackets (youngest = 1 ... oldest = 7) and a binary code for gender.
# Both mappings read the untouched raw columns from `base` (same index as `data`),
# so re-running this cell does not turn already-encoded values into NaN.
age_codes = {'0-17': 1, '18-25': 2, '26-35': 3, '36-45': 4, '46-50': 5, '51-55': 6, '55+': 7}
gender_codes = {'M': 1, 'F': 0}
data['Age'] = base['Age'].map(age_codes)
data['Gender'] = base['Gender'].map(gender_codes)
data.head()

In [None]:
# Check dtypes and non-null counts of `data` after the categorical encodings.
data.info()

In [None]:
# Cast the two product-category columns from float to int64.
data = data.astype({'Product_Category_2': 'int64',
                    'Product_Category_3': 'int64'})

## Modelamiento

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
split_frames = train_test_split(data, test_size=0.3, random_state=99)
data_train, data_test = split_frames

In [None]:
# Column names of the encoded table, displayed for reference.
features = data.columns.tolist()
features

In [None]:
# Predictors grouped by origin; the concatenation order is the column order
# of the design matrix used by every model.
customer_vars = ['Gender', 'Age', 'Occupation', 'Stay_In_Current_City_Years', 'Marital_Status']
product_vars = ['Product_Category_1', 'Product_Category_2', 'Product_Category_3']
city_vars = ['ccat_B', 'ccat_C']
vars_to_model = customer_vars + product_vars + city_vars

# Column to predict.
target = 'Purchase'

In [None]:
# Design matrix and target vector for each side of the hold-out split.
X_train, y_train = data_train[vars_to_model], data_train[target]
X_test, y_test = data_test[vars_to_model], data_test[target]

In [None]:
# Parallel lists that become the columns of the results table:
# model name, test MSE and test R^2, one entry per fitted model.
models, mse, r2 = [], [], []

X_train.head()

### Regresión Lineal

In [None]:
%%time 
models.append('Regresion Lineal')
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
lregr = linear_model.LinearRegression()

# Train the model using the training sets
lregr.fit(X_train,y_train)

# Make predictions using the testing set
y_pred = lregr.predict(X_test)

# The coefficients
print (vars_to_model)
print('Coefficients: \n', lregr.coef_)
# The mean squared error
mse.append(mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
r2.append(r2_score(y_test, y_pred))


### Regresión Ridge

In [None]:
%%time 
from sklearn.linear_model import Ridge
clf = Ridge(alpha=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# The coefficients
print (vars_to_model)
print('Coefficients: \n', clf.coef_)
models.append('Regresion Ridge')
# The mean squared error
mse.append(mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
r2.append(r2_score(y_test, y_pred))

### Regresión de grado superior

In [None]:
%%time 
name ='Regresion x^'
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
for k in range(2,5):
    poly_reg = PolynomialFeatures(degree = k)
    X_poly = poly_reg.fit_transform(X_train)
    lin_reg = LinearRegression()
    lin_reg.fit(X_poly, y_train)
    ylin = lin_reg.predict(poly_reg.fit_transform(X_test))
    # Fill table
    models.append(name+str(k))
    mse.append(mean_squared_error(y_test, ylin))
    r2.append(r2_score(y_true=y_test, y_pred=ylin))

### Multi Layer Perceptron

In [None]:
# Standardise predictors and target for the scale-sensitive models (MLP, SVR).
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw values first, with the same test size and seed as the earlier split.
X_raw = data[vars_to_model]
y_raw = data[target].values.reshape(-1, 1)  # scalers expect a 2-D array
X1_train_raw, X1_test_raw, y1_train_raw, y1_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.3, random_state=99)

# Fit the scalers on the training rows only, so no statistic of the hold-out
# rows leaks into training.
scx = StandardScaler().fit(X1_train_raw)
scy = StandardScaler().fit(y1_train_raw)

X1_train, X1_test = scx.transform(X1_train_raw), scx.transform(X1_test_raw)
y1_train, y1_test = scy.transform(y1_train_raw), scy.transform(y1_test_raw)

# Full standardised arrays, kept under their previous names for any later use.
X = scx.transform(X_raw)
y = scy.transform(y_raw)

In [None]:
%%time 
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor( hidden_layer_sizes=(6,4), activation='relu', solver='lbfgs', max_iter = 5000)
mlp.fit(X1_train,y1_train)
y1_pred = mlp.predict(X1_test)

models.append('MLP')
# The mean squared error
mse.append(mean_squared_error(y1_test, y1_pred))
# Explained variance score: 1 is perfect prediction
r2.append(r2_score(y1_test, y1_pred))


### Support Vector Machine

In [None]:
%%time
batch_size = 50000 # El proceso es muy lento asi que tomamos una muestra.
from sklearn.svm import SVR
svr_regressor = SVR(kernel = 'rbf') #Gaussian kernel
svr_regressor.fit(X1_train[:batch_size], y1_train[:batch_size])
models.append('SVM - rbf')
y1_pred = svr_regressor.predict(X1_test[:batch_size])

# The mean squared error
mse.append(mean_squared_error(y1_test[:batch_size], y1_pred))
# Explained variance score: 1 is perfect prediction
r2.append(r2_score(y1_test[:batch_size], y1_pred))

### Métricas por modelo

In [None]:
# Assemble the comparison table: one row per model with its test R^2 and MSE.
results = pd.DataFrame(models, columns=['Modelo']).assign(R2=r2, MSE=mse)
results