# Guía de ciencia de datos y machine learning

Esta guía se centra en los principales pasos a seguir al desarrollar un proyecto de ciencia de datos o machine learning.

## Cargando librerías a usar para clasificación

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from scipy import stats # tratamiento de outliers
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score # metricas de clasificacion
from sklearn.metrics import jaccard_score, log_loss, confusion_matrix, roc_curve, roc_auc_score # metricas de clasificacion
from sklearn.model_selection import train_test_split, GridSearchCV # entrenamiento e hiperparametros
from sklearn.preprocessing import LabelEncoder # label enconding
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE # tratamiento de desbalanceo de las clases
import warnings

# Suppress every warning category so library notices do not clutter cell outputs.
warnings.simplefilter(action="ignore")
# Disable column truncation so every column is shown when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

## Cargando librerías a usar para regresión

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing # label encoding
from sklearn import metrics
from scipy import stats # tratamiento de outliers
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # metricas de regresion
from sklearn.model_selection import train_test_split # entrenamiento
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.linear_model import Ridge
import warnings

# Suppress every warning category so library notices do not clutter cell outputs.
warnings.simplefilter(action="ignore")
# Disable column truncation so every column is shown when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

## Cargando el dataset en un DataFrame (CSV)

In [37]:
# Load the training CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,id,FRAUDE,VALOR,HORA_AUX,Dist_max_NAL,Canal1,FECHA,COD_PAIS,CANAL,DIASEM,DIAMES,FECHA_VIN,OFICINA_VIN,SEXO,SEGMENTO,EDAD,INGRESOS,EGRESOS,NROPAISES,Dist_Sum_INTER,Dist_Mean_INTER,Dist_Max_INTER,NROCIUDADES,Dist_Mean_NAL,Dist_HOY,Dist_sum_NAL
0,9000000001,1,0.0,13,659.13,ATM_INT,20150501,US,ATM_INT,5,1,20120306.0,392.0,M,Personal Plus,29.0,1200000.0,1200000.0,1,,,,6,474.94,4552.41,5224.36
1,9000000002,1,0.0,17,594.77,ATM_INT,20150515,US,ATM_INT,5,15,20050415.0,716.0,M,Personal Plus,29.0,5643700.0,500000.0,1,,,,5,289.99,4552.41,2029.9
2,9000000003,1,0.0,13,659.13,ATM_INT,20150501,US,ATM_INT,5,1,20120306.0,392.0,M,Personal Plus,29.0,1200000.0,1200000.0,1,,,,6,474.94,4552.41,5224.36
3,9000000004,1,0.0,13,659.13,ATM_INT,20150501,US,ATM_INT,5,1,20120306.0,392.0,M,Personal Plus,29.0,1200000.0,1200000.0,1,,,,6,474.94,4552.41,5224.36
4,9000000005,1,0.0,0,1.0,ATM_INT,20150510,CR,ATM_INT,0,10,20141009.0,788.0,M,Personal,25.0,0.0,0.0,1,,,,1,,1482.35,1.0


## Cargando el dataset en un DataFrame (TXT)

In [38]:
# Load the text file one raw line per row, then split each line on commas.
# The file is read with a tab separator, no header row, and the first line skipped,
# so each remaining line lands in column 0 as a single string.
# NOTE(review): this assumes the lines contain no tab characters — confirm against the file.
df_txt = (
    pd.read_csv('OFEI1204.txt', sep='\t', header=None, skiprows=1)
    .dropna()  # drop empty rows
    [0]  # the single column holding each raw line
    # expand=True builds the DataFrame directly, replacing the much slower
    # Series.apply(pd.Series); lines with fewer fields are padded with missing values.
    .str.split(',', expand=True)
)
df_txt.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,AGENTE: AES CHIVOR,,,,,,,,,,,,,,,,,,,,,,,,,
1,CHIVOR1,D,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0
2,CHIVOR2,D,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0
3,CHIVOR3,D,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0
4,CHIVOR4,D,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0


## Visualizando información del dataframe

In [39]:
df.info() # general information about the DataFrame (columns, non-null counts, dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2965 entries, 0 to 2964
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2965 non-null   int64  
 1   FRAUDE           2965 non-null   int64  
 2   VALOR            2965 non-null   float64
 3   HORA_AUX         2965 non-null   int64  
 4   Dist_max_NAL     2965 non-null   float64
 5   Canal1           2965 non-null   object 
 6   FECHA            2965 non-null   int64  
 7   COD_PAIS         2965 non-null   object 
 8   CANAL            2965 non-null   object 
 9   DIASEM           2965 non-null   int64  
 10  DIAMES           2965 non-null   int64  
 11  FECHA_VIN        2941 non-null   float64
 12  OFICINA_VIN      2941 non-null   float64
 13  SEXO             2910 non-null   object 
 14  SEGMENTO         2941 non-null   object 
 15  EDAD             2941 non-null   float64
 16  INGRESOS         2941 non-null   float64
 17  EGRESOS       

In [43]:
# Report the DataFrame's dimensions: row count first, then column count.
n_rows, n_cols = df.shape
print(f'Este dataframe contiene {n_rows} filas y {n_cols} columna')

Este dataframe contiene 2965 filas y 26 columna


## Procesamiento de los datos

In [45]:
# Changing column data types (optional examples, intentionally left commented out).
# NOTE(review): FECHA is an int64 column with values like 20150501, so pd.to_datetime
#   probably needs the values cast to str and format='%Y%m%d' to parse them as
#   calendar dates — confirm before enabling.
# NOTE(review): after FECHA is replaced by .dt.date, the later .dt.time call on the
#   same column would presumably fail; derive HORA before that conversion — confirm.
#df['FRAUDE'] = df['FRAUDE'].astype(str) # cast the column to string
#df["FECHA"] = pd.to_datetime(df["FECHA"]) # cast the column to datetime
#df['FECHA'] = df['FECHA'].dt.date # keep only the date part
#df['HORA'] = df['FECHA'].dt.time # create a column holding the time part

In [55]:
# Percentage of missing values per column, keeping only the columns that have
# at least one missing value, ordered from most to least affected.
null = df.isna().sum().mul(100).div(len(df))
null.loc[null.gt(0)].sort_values(ascending=False)

Dist_Sum_INTER     52.175379
Dist_Mean_INTER    52.175379
Dist_Max_INTER     52.175379
Dist_Mean_NAL      15.413153
SEXO                1.854975
FECHA_VIN           0.809444
OFICINA_VIN         0.809444
SEGMENTO            0.809444
EDAD                0.809444
INGRESOS            0.809444
EGRESOS             0.809444
dtype: float64