In [1]:
import pandas as pd
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.pipeline import make_pipeline
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
from sklearn.preprocessing import StandardScaler

In [2]:
#path1 = "https://raw.githubusercontent.com/jaarciah/Proyecto-Integrador-2020-periodo-2/master/Data/Preprocessing/hurto_motos_categorias_ingCaracteristicas.csv"
path2 = "https://raw.githubusercontent.com/jaarciah/Proyecto-Integrador-2020-periodo-2/master/Data/Modeling/hurto_preprocesado_C.csv"

#hurtos_moto = pd.read_csv(path1, sep=";", low_memory=False)
hurto_preprocesado = pd.read_csv(path2, sep=",", low_memory=False)

In [3]:
hurto_preprocesado.modalidad.value_counts()

0    33100
1    25455
Name: modalidad, dtype: int64

In [4]:
hurto_preprocesado.head()

Unnamed: 0,latitud,longitud,estado_civil,medio_transporte,modalidad,nombre_barrio,codigo_comuna,lugar,sede_receptora,modelo,...,anho,mes,dia,dia_semana,hora,festivos,quincena,week_number,ferias_fiestas,franja_horaria
0,6.26804,-75.549892,0,0,0,0,0,0,0,0,...,2017,1,1,6,23,1,1,52,0,2
1,6.268129,-75.557882,0,0,0,4,3,2,3,4,...,2017,1,1,6,16,1,1,52,0,1
2,6.268399,-75.549576,0,0,0,0,0,0,0,4,...,2017,1,1,6,10,1,1,52,0,0
3,6.263576,-75.558601,3,0,0,4,3,0,3,4,...,2017,1,3,1,8,0,0,1,0,0
4,6.263299,-75.556534,0,1,1,4,3,0,3,0,...,2017,1,7,5,3,0,0,1,0,2


In [5]:
hurto_preprocesado.columns

Index(['latitud', 'longitud', 'estado_civil', 'medio_transporte', 'modalidad',
       'nombre_barrio', 'codigo_comuna', 'lugar', 'sede_receptora', 'modelo',
       'fecha', 'anho', 'mes', 'dia', 'dia_semana', 'hora', 'festivos',
       'quincena', 'week_number', 'ferias_fiestas', 'franja_horaria'],
      dtype='object')

In [6]:
# variables escogidas despues del proceso de ingenieria de caracteristica.
variables = ["latitud","longitud","estado_civil","medio_transporte","modalidad","nombre_barrio","codigo_comuna","lugar","sede_receptora","modelo", "fecha"]

hurto_preprocesado_new = hurto_preprocesado[variables]

# Proceso de cross-validation.

In [8]:
y = hurto_preprocesado_new.modalidad
x = hurto_preprocesado_new.drop(['modalidad'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=8)

# 3. Stochastic Gradient Descent.
https://scikit-learn.org/stable/modules/sgd.html#classification

In [33]:
modelSGD = Pipeline([('SGD', SGDClassifier())
                    ])

alpha = np.logspace(-9, 3, 4)

params = {'SGD__penalty':('l2', 'l1', 'elasticnet'),
          'SGD__alpha':(alpha),
          'SGD__average':(True, False)
          }

In [34]:
grid_modelSGD = GridSearchCV(modelSGD, params, cv=5)

In [35]:
grid_modelSGD.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('SGD',
                                        SGDClassifier(alpha=0.0001,
                                                      average=False,
                                                      class_weight=None,
                                                      early_stopping=False,
                                                      epsilon=0.1, eta0=0.0,
                                                      fit_intercept=True,
                                                      l1_ratio=0.15,
                                                      learning_rate='optimal',
                                                      loss='hinge',
                                                      max_iter=1000,
                                                      n_iter_no_change=5,
                                                      n_jobs=None, penalty='

In [36]:
ypred = grid_modelSGD.predict(X_test)

In [37]:
print(classification_report(y_test, ypred))

              precision    recall  f1-score   support

           0       0.57      1.00      0.72      6657
           1       0.00      0.00      0.00      5054

    accuracy                           0.57     11711
   macro avg       0.28      0.50      0.36     11711
weighted avg       0.32      0.57      0.41     11711



In [38]:
grid_modelSGD.best_estimator_

Pipeline(memory=None,
         steps=[('SGD',
                 SGDClassifier(alpha=1e-09, average=True, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000, n_iter_no_change=5, n_jobs=None,
                               penalty='l2', power_t=0.5, random_state=None,
                               shuffle=True, tol=0.001, validation_fraction=0.1,
                               verbose=0, warm_start=False))],
         verbose=False)

In [39]:
# Save to file in the current working directory
pkl_filename = "/home/jaarciah/PI/Clasificación /modelos/iteración_3/grid_modelSGD.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(grid_modelSGD, file)