In [1]:
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [4]:
path1 = "https://raw.githubusercontent.com/jaarciah/Proyecto-Integrador-2020-periodo-2/master/Data/Preprocessing/hurto_motos_categorica_exploratorio.csv"
path2 = "https://raw.githubusercontent.com/jaarciah/Proyecto-Integrador-2020-periodo-2/master/Data/Preprocessing/hurto_motos_dummy_exploratorio.csv"

hurtos_moto = pd.read_csv(path1, sep=";", low_memory=False)
hurto_preprocesado = pd.read_csv(path2, sep=";", low_memory=False)

In [5]:
hurtos_moto.head()

Unnamed: 0,cantidad,latitud,longitud,sexo,edad,estado_civil,grupo_actor,actividad_delictiva,parentesco,ocupacion,...,categoria_bien,grupo_bien,modelo,color,permiso,unidad_medida,dia,mes,anho,dia_semana
0,1.0,6.26804,-75.549892,Hombre,29.0,Soltero(a),Sin dato,Sin dato,Sin dato,Sin dato,...,Vehículos de 2 o 4 ruedas,Vehículo,2014,Azul,Sin dato,Sin dato,1,1,2017,6
1,1.0,6.290076,-75.556893,Hombre,29.461792,Unión marital de hecho,Sin dato,Sin dato,Sin dato,Sin dato,...,Vehículos de 2 o 4 ruedas,Vehículo,2005,Rojo,Sin dato,Sin dato,1,1,2017,6
2,1.0,6.304187,-75.55176,Hombre,23.0,Unión marital de hecho,Sin dato,Sin dato,Sin dato,Sin dato,...,Vehículos de 2 o 4 ruedas,Vehículo,2011,Verde,Sin dato,Sin dato,1,1,2017,6
3,1.0,6.248002,-75.574849,Hombre,-1.0,Sin dato,Sin dato,Sin dato,Sin dato,Sin dato,...,Vehículos de 2 o 4 ruedas,Vehículo,-1,Negro,Sin dato,Sin dato,1,1,2017,6
4,1.0,6.268129,-75.557882,Hombre,34.0,Soltero(a),Sin dato,Sin dato,Sin dato,Sin dato,...,Vehículos de 2 o 4 ruedas,Vehículo,2015,Negro,Sin dato,Sin dato,1,1,2017,6


In [20]:
hurtos_moto.modalidad.value_counts()

Halado    33091
Atraco    25449
Otros      5554
Name: modalidad, dtype: int64

In [6]:
hurto_preprocesado.head()

Unnamed: 0,cantidad,latitud,longitud,sexo,edad,estado_civil,grupo_actor,actividad_delictiva,parentesco,ocupacion,...,categoria_bien,grupo_bien,modelo,color,permiso,unidad_medida,dia,mes,anho,dia_semana
0,1.0,6.26804,-75.549892,0,29.0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,2017,6
1,1.0,6.290076,-75.556893,0,29.461792,1,0,0,0,0,...,0,0,1,1,0,0,1,1,2017,6
2,1.0,6.304187,-75.55176,0,23.0,1,0,0,0,0,...,0,0,2,2,0,0,1,1,2017,6
3,1.0,6.248002,-75.574849,0,-1.0,2,0,0,0,0,...,0,0,3,3,0,0,1,1,2017,6
4,1.0,6.268129,-75.557882,0,34.0,0,0,0,0,0,...,0,0,4,3,0,0,1,1,2017,6


## Para el modelo de clasificación "Modalidad"

In [7]:
y = hurto_preprocesado.modalidad
x = hurto_preprocesado.drop(['modalidad'], axis=1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=8)

In [9]:
model1 = Pipeline([('randomF', RandomForestClassifier())
                  ])

params = {'randomF__n_estimators':(100,150,200),
         'randomF__criterion':(["gini", "entropy"]),
         'randomF__min_samples_split':(2, 5, 10),
         'randomF__max_features':(["auto", "sqrt", "log2"]),
          'randomF__max_depth':(5, 8)    
         }

In [10]:
grid1 = GridSearchCV(model1, params, cv=5)

In [11]:
grid1.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('randomF',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                          

In [12]:
ypred = grid1.predict(X_test)

In [13]:
print(classification_report(y_test, ypred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      6640
           1       0.97      0.98      0.97      5097
           2       1.00      0.69      0.82      1082

    accuracy                           0.96     12819
   macro avg       0.97      0.89      0.92     12819
weighted avg       0.96      0.96      0.96     12819



In [14]:
grid1.best_estimator_

Pipeline(memory=None,
         steps=[('randomF',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='entropy',
                                        max_depth=8, max_features='sqrt',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=5,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=200, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [15]:
import pickle
# Save to file in the current working directory
pkl_filename = "grid1.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(grid1, file)

In [None]:
# Cargar el modelo
pkl_filename = "grid1.pkl"
with open(pkl_filename, 'rb') as file:
    grid1 = pickle.load(file)

In [17]:
model1=sfs(RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='entropy',
                                        max_depth=8, max_features='sqrt',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=5,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=200, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False),
           k_features=10,
           forward=True,
           verbose=2,
           cv=10,n_jobs=-1,
           scoring='r2')
model1.fit(x,y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  37 | elapsed:   19.7s remaining:   20.8s
[Parallel(n_jobs=-1)]: Done  37 out of  37 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  37 out of  37 | elapsed:  2.6min finished

[2020-11-25 22:50:19] Features: 1/10 -- score: 0.31079482161574895[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 out of  36 | elapsed:   22.5s remaining:   25.2s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  1.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  1.9min finished

[2020-11-25 22:52:16] Features: 2/10 -- score: 0.6841879124509757[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  35 | elapsed:   31.3s remaining:   60.0s
[Parallel(n_jobs=-1)]: Done  30 out of  35 | elapsed:   37.2s remaining:    6.2s
[Paralle

SequentialFeatureSelector(clone_estimator=True, cv=10,
                          estimator=RandomForestClassifier(bootstrap=True,
                                                           ccp_alpha=0.0,
                                                           class_weight=None,
                                                           criterion='entropy',
                                                           max_depth=8,
                                                           max_features='sqrt',
                                                           max_leaf_nodes=None,
                                                           max_samples=None,
                                                           min_impurity_decrease=0.0,
                                                           min_impurity_split=None,
                                                           min_samples_leaf=1,
                                                           min_samples_split=5,


In [18]:
#Encontramos los índices de los atributos seleccionados.
model1.k_feature_idx_

(9, 12, 13, 14, 18, 19, 20, 25, 28, 32)

In [19]:
#Encontramos los nombres de las columnas de los atributos seleccionados.
model1.k_feature_names_

('ocupacion',
 'medio_transporte',
 'nivel_academico',
 'testigo',
 'arma_medio',
 'articulo_penal',
 'categoria_penal',
 'sede_receptora',
 'grupo_bien',
 'unidad_medida')

In [21]:
random = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='entropy',
                                        max_depth=8, max_features='sqrt',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=5,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=200, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False)

In [22]:
random.fit(x,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=8, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [30]:
import numpy as np

In [41]:
result = [hurto_preprocesado.columns.tolist(), random.feature_importances_.tolist()]

In [45]:
result = pd.DataFrame(result).T

In [61]:
result.sort_values(1, ascending=False)

Unnamed: 0,0,1
18,conducta_especial,0.45079
12,medio_transporte,0.220108
25,lugar,0.135892
35,mes,0.0714397
5,estado_civil,0.0324935
29,grupo_bien,0.0266736
24,codigo_comuna,0.0228027
17,caracterizacion,0.00965634
23,codigo_barrio,0.00682794
2,longitud,0.00546937
