### Bases de Datos para el Big Data
#### Desarrollado por: **Ing. Julian Quimbayo**
#### Limpieza de Datos - Actividad 1

#### **Paso 1: Importación de Librerías**

In [114]:
##Cargue de las librerías
import pandas as pd
import numpy as np
##Librerías para graficar
import matplotlib.pyplot as plt
import seaborn as sns

#### **Paso 2: Funciones de Limpieza**

In [169]:
# Load a .csv or .xlsx file into a DataFrame
def importDatos(ruta, archivo, ext, sep=','):
    """Read the file ``ruta + archivo + ext`` into a pandas DataFrame.

    Parameters
    ----------
    ruta : str
        Directory prefix, concatenated as-is (include the trailing
        separator, e.g. ``'./'``).
    archivo : str
        File name without its extension.
    ext : str
        ``'.csv'`` or ``'.xlsx'``; compared case-insensitively.
    sep : str, default ','
        Field delimiter; only used for CSV files.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``ext`` is neither ``'.csv'`` nor ``'.xlsx'``. Failing here is
        clearer than returning None and breaking later on the first
        DataFrame method call.
    """
    ruta_completa = ruta + archivo + ext
    tipo = ext.lower()
    if tipo == '.csv':
        return pd.read_csv(ruta_completa, sep=sep)
    if tipo == '.xlsx':
        return pd.read_excel(ruta_completa)
    raise ValueError("Extensión diferente: %r (se esperaba '.csv' o '.xlsx')" % (ext,))

# Print a per-column report of dtype, distinct values and missing values
def datos_NA(df):
    """Print a missing-value summary of ``df``.

    The report shows the frame dimensions, the total number of NA cells and,
    for every column, its name, dtype, number of distinct non-NA values and
    number of NA values. If ``df`` is not a DataFrame, a message with the
    received type is printed instead. Nothing is returned.
    """
    if not isinstance(df, pd.DataFrame):
        print("Se esperaba dataframe %15s" % (type(df)))
        return

    total_na = df.isna().sum().sum()
    print("Dimensiones : %d filas, %d columnas" %
          (df.shape[0], df.shape[1]))
    print("Total Valores NA : %d " % (total_na))
    print("%38s %10s     %10s %10s" %
          ("Nombre Columna", "Tipo de Dato", "#Distintos", "Valores NA"))
    # Walk the four per-column summaries in parallel (positionally) rather
    # than indexing the Series with integers: integer keys on a
    # label-indexed Series are label lookups, which is deprecated for
    # string labels and fails outright for integer column names.
    resumen = zip(df.columns, df.dtypes, df.nunique(), df.isna().sum())
    for nombre, tipo, distintos, faltantes in resumen:
        print("%38s %10s   %10s %10s" %
              (nombre, tipo, distintos, faltantes))

## Absolute frequencies for any column
def revCategoricos(df, nomCol):
    """Return how many times each distinct value occurs in ``df[nomCol]``."""
    return df[nomCol].value_counts()

## Relative frequencies (as percentages) for any column
def revCategoricosFreq(df, nomCol, dec):
    """Return the share of rows taken by each value of ``df[nomCol]``.

    Counts are divided by the total number of rows of ``df``, rounded to
    ``dec`` decimals as a fraction, and then scaled by 100.
    """
    conteos = df[nomCol].value_counts()
    total_filas = np.float64(len(df))
    proporciones = conteos / total_filas
    return proporciones.round(dec) * 100

## Check for fully duplicated rows
def duplicados(df):
    """Count the rows of ``df`` flagged / not flagged by ``DataFrame.duplicated``.

    Returns a Series indexed by the boolean flag (True = the row repeats an
    earlier row) holding the number of rows for each flag present.
    """
    return df.duplicated().value_counts()

## Remove the rows that contain a given inconsistent value in any column
def elimFilas(df,param):
    """Return only the rows of ``df`` in which no cell equals ``param``.

    No frequency threshold is checked here; deciding that the affected rows
    are few enough to discard is left to the caller.
    """
    fila_sin_param = (df != param).all(axis=1)
    return df[fila_sin_param]

# Replace one exact category value with another in a column
def imputacionCat(df, nomCol, busqueda, reempl):
    """Replace every cell of ``df[nomCol]`` equal to ``busqueda`` with ``reempl``.

    The column is updated in place on ``df`` and also returned, so both
    ``imputacionCat(df, ...)`` and ``df[col] = imputacionCat(df, ...)`` work.

    ``Series.mask`` is used instead of ``np.where`` because ``np.where``
    builds a single-dtype array: a string replacement on a numeric column
    would turn every untouched number into a string.
    """
    columna = df[nomCol]
    df[nomCol] = columna.mask(columna == busqueda, reempl)
    return df[nomCol]
# Fill missing values of a column with its most frequent value
def imputacionCatNa(df, nomCol):
    """Fill the NA cells of ``df[nomCol]`` with the column's mode.

    The column is updated in place on ``df`` and also returned. When several
    values tie, the first one reported by ``Series.mode`` is used. If the
    column has no non-NA value there is no mode, and the column is left
    unchanged instead of raising.
    """
    modas = df[nomCol].mode()
    if not modas.empty:
        df[nomCol] = df[nomCol].fillna(modas.iloc[0])
    return df[nomCol]

#### **Paso 3: Entendimiento de la data**

In [179]:
## Load the raw dataset: a ';'-separated CSV located in the working directory
data = importDatos('./','data_act_01_Nueva','.csv',';')
data.head()

data.shape

(10051, 12)

In [117]:
## Print the per-column report (dtype, distinct values, NA count) to locate missing data
datos_NA(data)

Dimensiones : 10051 filas, 12 columnas
Total Valores NA : 10375 
                        Nombre Columna Tipo de Dato     #Distintos Valores NA
                               CrimeId      int64        10047          0
                 OriginalCrimeTypeName     object          575          0
                           OffenseDate     object            9          0
                              CallTime     object         1416          0
                          CallDateTime     object         5116          0
                           Disposition     object           19          0
                               Address     object         5387          0
                                  City     object            8        321
                                 State     object            1          3
                              AgencyId     object            2          0
                                 Range    float64            0      10051
                           AddressType     

In [118]:
## Check for duplicated rows - the result shows none
duplicados(data)

False    10051
dtype: int64

#### **Paso 4: Casos puntuales por columna**

In [119]:
## Column 1 - CrimeId
revCategoricos(data, 'CrimeId')
## Author's assessment: the column is reported correctly, being a consecutive identifier of the crimes.

## Column 2 - OriginalCrimeTypeName
revCategoricos(data, 'OriginalCrimeTypeName')
## The column holds inconsistent values: it represents the original crime type,
## yet it also contains numbers that do not belong there.

## Working principle: categories whose frequency is below 5% are left out of
## later analyses; according to the business model they are not suitable either.

revCategoricosFreq(data, 'OriginalCrimeTypeName',3)

## Categories are reviewed one initial letter at a time,
## e.g. 'Att' is rewritten as 'Attempt' and 'Awol' rows are removed.

dataA = data[data['OriginalCrimeTypeName'].str.startswith("A")]

revCategoricos(dataA, 'OriginalCrimeTypeName')

## (value found, unified category) pairs, applied in this order
for _busqueda, _reempl in [
    ('Assault / Battery Dv', 'Assault / Battery'),
    ('Agg Assault / Adw', 'Assault / Battery'),
    ('Att', 'Attempt'),
    ('Attempt Report', 'Attempt'),
    ('Att Only', 'Attempt'),
    ('Att Jo', 'Attempt'),
    ('Att Susp', 'Attempt'),
    ('Att/240 Jo', 'Attempt'),
    ('Attemp', 'Attempt'),
    ('At The Atm', 'Attempt'),
    ('Agressive', 'Aggressive'),
    ('Aggr Soliciting', 'Aggressive'),
    ('Aggressive 601', 'Aggressive'),
    ('Alarm', 'Audible Alarm'),
]:
    data['OriginalCrimeTypeName'] = imputacionCat(data, 'OriginalCrimeTypeName', _busqueda, _reempl)


dataA = data[data['OriginalCrimeTypeName'].str.startswith("A")]

revCategoricos(dataA, 'OriginalCrimeTypeName')

## Drop the rows of the discarded low-frequency categories
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Awol', 'Adv', 'Adv To 0123', 'A', 'Atc', 'Ams',
    'Areport', 'Amplified', 'Adv To Co A',
])].index)


dataA = data[data['OriginalCrimeTypeName'].str.startswith("A")]

revCategoricos(dataA, 'OriginalCrimeTypeName')

Audible Alarm           271
Auto Boost / Strip      166
Assault / Battery       160
Aggressive               44
Arrest Made              22
Aided Case               19
Attempt                  18
At Risk                   3
Ambulance                 2
Agg Assault / Adw Dv      1
Name: OriginalCrimeTypeName, dtype: int64

In [120]:
## Inspect the categories that start with "B"
dataB = data[data['OriginalCrimeTypeName'].str.startswith("B")]

revCategoricos(dataB, 'OriginalCrimeTypeName')

## Entries such as Busn, Band, Bl, Busn/voip, Bart and Bat are to be removed

Burglary               80
Busn                    3
Barking                 2
Band                    2
Boombox                 2
Bl                      1
Bicylist                1
Brewing/pay Dispute     1
Broken Window           1
Busy                    1
Busn/voip               1
Bottles                 1
Bart                    1
Bat                     1
Bomb Threat             1
Bar Check               1
Brewing                 1
Name: OriginalCrimeTypeName, dtype: int64

In [121]:
## Drop the rows of the discarded "B" categories, then re-check what remains
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Busn', 'Band', 'Boombox', 'Bl', 'Busn/voip', 'Bart', 'Bat',
])].index)

dataB = data[data['OriginalCrimeTypeName'].str.startswith("B")]

revCategoricos(dataB, 'OriginalCrimeTypeName')

Burglary               80
Barking                 2
Bar Check               1
Bomb Threat             1
Bottles                 1
Busy                    1
Broken Window           1
Brewing/pay Dispute     1
Bicylist                1
Brewing                 1
Name: OriginalCrimeTypeName, dtype: int64

In [128]:
## Inspect the categories that start with "C"
dataC = data[data['OriginalCrimeTypeName'].str.startswith("C")]

revCategoricos(dataC, 'OriginalCrimeTypeName')

## Unify variants such as Cassing, Caser, Casers, Camp
for _busqueda, _reempl in [
    ('Complaint Unkn', 'Completely Unknown'),
    ('Caser', 'Casing'),
    ('Casers', 'Casing'),
    ('Cassing', 'Casing'),
    ('Casing/852', 'Casing'),
    ('Casing/917', 'Casing'),
    ('Child', 'Children'),
    ('Camper', 'Campers'),
    ('Camping', 'Campers'),
    ('Campers W/bikes', 'Campers'),
    ('Camp', 'Campers'),
]:
    data['OriginalCrimeTypeName'] = imputacionCat(data, 'OriginalCrimeTypeName', _busqueda, _reempl)

## Drop the rows of the discarded "C" categories
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Cz', 'Cw', 'Check For Ped', 'Cane',
    'Chp 1030 Veh', 'Chp', 'City Veh', 'Curb',
])].index)



In [129]:
## Re-check the "C" categories after the corrections
dataC = data[data['OriginalCrimeTypeName'].str.startswith("C")]

revCategoricos(dataC, 'OriginalCrimeTypeName')

Completely Unknown    98
Casing                39
Campers               22
Citizen Standby       22
Citizen Arrest        11
Children               7
Custody                4
Construction           3
Crosswalk              2
Chop Shop              2
Crack                  1
Car Campers            1
Name: OriginalCrimeTypeName, dtype: int64

In [134]:
## Inspect the categories that start with "D"
dataD = data[data['OriginalCrimeTypeName'].str.startswith("D")]

revCategoricos(dataD, 'OriginalCrimeTypeName')

## Unify spelling variants of the same category
for _busqueda, _reempl in [
    ('Drinkers', 'Drinking'),
    ('Drops//busn', 'Drop'),
    ('Drugs/915', 'Drugs'),
    ('Dirtbikes/586', 'Dirtbikes'),
    ('Drugs/dealing', 'Drugs'),
    ('Dog Barking', 'Dogs Barking'),
]:
    data['OriginalCrimeTypeName'] = imputacionCat(data, 'OriginalCrimeTypeName', _busqueda, _reempl)

## Drop the rows of the discarded "D" categories
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Dw', 'Dw/rz', 'Dp', 'Dog', 'Dw Tow', 'Doorway/917',
    'Drummer', 'Dropped Call', 'Drp', 'Dancer',
])].index)

dataD = data[data['OriginalCrimeTypeName'].str.startswith("D")]

revCategoricos(dataD, 'OriginalCrimeTypeName')


Drugs                  61
Drop                   27
Drunk Driver           10
Demo / Protest          9
Drinking                6
Dirtbikes               6
Death / Coroner         3
Dogs Barking            2
Drugs Needles           1
Driveway                1
Drugdealer              1
Dismember Body Part     1
Drug Use                1
Dog Bite                1
Death                   1
Name: OriginalCrimeTypeName, dtype: int64

In [136]:
## Inspect the categories that start with "E"
dataE = data[data['OriginalCrimeTypeName'].str.startswith("E")]

revCategoricos(dataE, 'OriginalCrimeTypeName')

## Unify the spelling variants of 'Encampment'
for _busqueda in ('Encampments', 'Encampent'):
    data['OriginalCrimeTypeName'] = imputacionCat(data, 'OriginalCrimeTypeName', _busqueda, 'Encampment')

## Drop the rows of the discarded category
data = data.drop(data[data.OriginalCrimeTypeName.isin(['Elderly X'])].index)

dataE = data[data['OriginalCrimeTypeName'].str.startswith("E")]

revCategoricos(dataE, 'OriginalCrimeTypeName')


Encampment         68
Escalating          1
Elevator            1
Explosive Found     1
Explosion           1
Name: OriginalCrimeTypeName, dtype: int64

In [139]:
## Inspect the categories that start with "F"
dataF = data[data['OriginalCrimeTypeName'].str.startswith("F")]

revCategoricos(dataF, 'OriginalCrimeTypeName')

## Merge the 'Dv' variant into the base category
data['OriginalCrimeTypeName'] = imputacionCat(
    data, 'OriginalCrimeTypeName', 'Fight No Weapon Dv', 'Fight No Weapon')

## Drop the rows of the discarded "F" categories
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Family', 'Female', 'Fi**expedite**', 'Fp', 'Follow Up',
])].index)

dataF = data[data['OriginalCrimeTypeName'].str.startswith("F")]

revCategoricos(dataF, 'OriginalCrimeTypeName')


Fight No Weapon    286
Fraud               54
Fire                 3
Fare                 1
Found Drugs          1
Fireworks            1
Name: OriginalCrimeTypeName, dtype: int64

In [143]:
## Inspect the categories that start with "G"
dataG = data[data['OriginalCrimeTypeName'].str.startswith("G")]

revCategoricos(dataG, 'OriginalCrimeTypeName')

## Unify spelling variants of the same category
for _busqueda, _reempl in [
    ('Gambling', 'Gamblers'),
    ('Get Inside The Bldg', 'Get Inside The Building'),
]:
    data['OriginalCrimeTypeName'] = imputacionCat(data, 'OriginalCrimeTypeName', _busqueda, _reempl)

## Drop the rows of the discarded "G" categories
data = data.drop(data[data.OriginalCrimeTypeName.isin(['Guitar', 'Gz'])].index)

dataG = data[data['OriginalCrimeTypeName'].str.startswith("G")]

revCategoricos(dataG, 'OriginalCrimeTypeName')

Grand Theft                5
Graffiti Vandalism         5
Gamblers                   2
Glass Bottles              1
Get Inside The Building    1
Name: OriginalCrimeTypeName, dtype: int64

In [145]:
## Inspect the categories that start with "H"
dataH = data[data['OriginalCrimeTypeName'].str.startswith("H")]

revCategoricos(dataH, 'OriginalCrimeTypeName')

## Unify spelling variants of the same category
for _busqueda, _reempl in [
    ('H&r Veh Accident', 'Vehicle Accident'),
    ('H&r Injury Accident', 'Vehicle Accident'),
    ('Hold-Up', 'Hold Up'),
]:
    data['OriginalCrimeTypeName'] = imputacionCat(data, 'OriginalCrimeTypeName', _busqueda, _reempl)

## Drop the rows of the discarded "H" categories
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Hu', 'Hot', 'H&r', 'H/r', 'Haz',
])].index)

dataH = data[data['OriginalCrimeTypeName'].str.startswith("H")]

revCategoricos(dataH, 'OriginalCrimeTypeName')

Homeless Complaint    585
Hold Up                 2
Hammer                  1
Heated                  1
Home Invasion           1
House                   1
Name: OriginalCrimeTypeName, dtype: int64

In [147]:
## Inspect the categories that start with "I"
dataI = data[data['OriginalCrimeTypeName'].str.startswith("I")]

revCategoricos(dataI, 'OriginalCrimeTypeName')

## Unify spelling variants of the same category
for _busqueda, _reempl in [
    ('Injury Veh Accident', 'Vehicle Accident'),
    ('Illegal Vendor', 'Illegal Vendors'),
]:
    data['OriginalCrimeTypeName'] = imputacionCat(data, 'OriginalCrimeTypeName', _busqueda, _reempl)

## Drop the rows of the discarded "I" categories
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Ip', 'I/p', 'Iph', 'In Svc/ On Foot', 'Ifo',
])].index)

dataI = data[data['OriginalCrimeTypeName'].str.startswith("I")]

revCategoricos(dataI, 'OriginalCrimeTypeName')

Intoxicated Person    30
Indecent Exposure     30
Illegal Vendors        3
Illegal Gambling       1
Name: OriginalCrimeTypeName, dtype: int64

In [149]:
## Inspect the categories that start with "J"
dataJ = data[data['OriginalCrimeTypeName'].str.startswith("J")]

revCategoricos(dataJ, 'OriginalCrimeTypeName')

## Drop the rows of the discarded "J" categories
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Jo', 'Jjo', 'J/o', 'Jo/bolo', "Juv's", 'Jump', "Jumper",
])].index)

dataJ = data[data['OriginalCrimeTypeName'].str.startswith("J")]

revCategoricos(dataJ, 'OriginalCrimeTypeName')

Juvenile Disturbance    15
Juve Beyond Control     11
Name: OriginalCrimeTypeName, dtype: int64

In [150]:
## Inspect the categories that start with "K" (no correction is applied in this cell)
dataK = data[data['OriginalCrimeTypeName'].str.startswith("K")]

revCategoricos(dataK, 'OriginalCrimeTypeName')

Keys          1
Kidnapping    1
Name: OriginalCrimeTypeName, dtype: int64

In [152]:
## Inspect the categories that start with "L"
dataL = data[data['OriginalCrimeTypeName'].str.startswith("L")]

revCategoricos(dataL, 'OriginalCrimeTypeName')

## Drop the rows of the discarded "L" categories
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Lltd', 'Lp', 'Lltn', 'Ll/td', "Ll/ll", 'Ld', "Loud Subj's", "Ll/ Tenant",
])].index)

dataL = data[data['OriginalCrimeTypeName'].str.startswith("L")]

revCategoricos(dataL, 'OriginalCrimeTypeName')

Loud Party      3
Loud Music      2
Live Band       2
Loitering       1
Living          1
Loading Dock    1
Loud Bass       1
Loud Talking    1
Name: OriginalCrimeTypeName, dtype: int64

In [155]:
## Inspect the categories that start with "M"
dataM = data[data['OriginalCrimeTypeName'].str.startswith("M")]

revCategoricos(dataM, 'OriginalCrimeTypeName')

## Drop the rows of the discarded "M" categories
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Male', 'Music', 'Mc', 'Mace', "Muni Alarm", "Mc's", "Muni",
    "Music/909", "Male/poss Hazard", "Mal", "Maced",
])].index)


## Unify spelling variants of the same category
for _busqueda, _reempl in [
    ('Meet W/citizen', 'Meet Citizen'),
    ('Meet W/officer', 'Meet Officer'),
]:
    data['OriginalCrimeTypeName'] = imputacionCat(data, 'OriginalCrimeTypeName', _busqueda, _reempl)

dataM = data[data['OriginalCrimeTypeName'].str.startswith("M")]

revCategoricos(dataM, 'OriginalCrimeTypeName')

Muni Inspection       294
Meet Citizen          160
Mentally Disturbed    124
Missing Adult          34
Missing Juvenile       16
Meet Officer           13
Making A Mess           1
Name: OriginalCrimeTypeName, dtype: int64

In [157]:
## Inspect the categories that start with "N"
dataN = data[data['OriginalCrimeTypeName'].str.startswith("N")]

revCategoricos(dataN, 'OriginalCrimeTypeName')

## Drop the rows of the discarded "N" categories
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Npat', 'Nabor', 'Nabors', 'Npa', "Naked", "Nabo",
])].index)

dataN = data[data['OriginalCrimeTypeName'].str.startswith("N")]

revCategoricos(dataN, 'OriginalCrimeTypeName')

Noise Nuisance    174
Neighbors           2
Name: OriginalCrimeTypeName, dtype: int64

In [159]:
## Inspect the categories that start with "O"
dataO = data[data['OriginalCrimeTypeName'].str.startswith("O")]

revCategoricos(dataO, 'OriginalCrimeTypeName')

## Drop the rows of the discarded "O" categories
data = data.drop(data[data.OriginalCrimeTypeName.isin([
    'Opp', 'Open', 'Open Line', 'Openline',
])].index)

dataO = data[data['OriginalCrimeTypeName'].str.startswith("O")]

revCategoricos(dataO, 'OriginalCrimeTypeName')

Opportunist    3
Name: OriginalCrimeTypeName, dtype: int64

In [170]:
## Export the partially cleaned frame so the remaining inconsistencies can be fixed
data.to_csv('dataLimpia.csv', index=False)
## After the export, the remaining categories are cleaned and empty rows removed;
## the result is loaded again to verify the final step.
## NOTE(review): 'DataLimpiaDos.csv' is not written by any code visible here -
## presumably it is a hand-edited copy of 'dataLimpia.csv'; confirm its origin.

dataFinal = importDatos('./','DataLimpiaDos','.csv',';')
datos_NA(dataFinal)

## City column: normalise the San Francisco spellings
revCategoricos(dataFinal, 'City')


for _busqueda, _reempl in [
    ('SAN FRANCISCO', 'San Francisco'),
    ('Treasure Isla', 'Treasure Island'),
]:
    dataFinal['City'] = imputacionCat(dataFinal, 'City', _busqueda, _reempl)

## Drop the rows that belong to the discarded city values
dataFinal = dataFinal.drop(dataFinal[dataFinal.City.isin([
    'Daly City', 'Yerba Buena', 'Presidio', ' S', 'Brisbane',
])].index)

revCategoricos(dataFinal, 'City')

## Fill the remaining missing cities with the most frequent one
dataFinal['City'] = imputacionCatNa(dataFinal,'City')



Dimensiones : 8416 filas, 11 columnas
Total Valores NA : 321 
                        Nombre Columna Tipo de Dato     #Distintos Valores NA
                               CrimeId      int64         8412          0
                 OriginalCrimeTypeName     object          291          0
                           OffenseDate     object            9          0
                              CallTime     object         1406          0
                          CallDateTime     object         4727          0
                           Disposition     object           19          0
                               Address     object         4601          0
                                  City     object            8        321
                                 State     object            1          0
                              AgencyId      int64            1          0
                           AddressType     object            6          0


In [171]:
## Re-run the NA report on the cleaned frame to confirm no missing values remain
datos_NA(dataFinal)

Dimensiones : 8405 filas, 11 columnas
Total Valores NA : 0 
                        Nombre Columna Tipo de Dato     #Distintos Valores NA
                               CrimeId      int64         8401          0
                 OriginalCrimeTypeName     object          291          0
                           OffenseDate     object            9          0
                              CallTime     object         1406          0
                          CallDateTime     object         4726          0
                           Disposition     object           19          0
                               Address     object         4590          0
                                  City     object            2          0
                                 State     object            1          0
                              AgencyId      int64            1          0
                           AddressType     object            6          0


In [174]:
## Disposition column: review the codes before the correction
revCategoricos(dataFinal, 'Disposition')

## Fold the two non-standard values into the 'HAN' code
for _busqueda in ('Not recorded', '22'):
    dataFinal['Disposition'] = imputacionCat(dataFinal, 'Disposition', _busqueda, 'HAN')

revCategoricos(dataFinal, 'Disposition')

HAN    3095
CIT    1044
ADV    1033
GOA     902
REP     724
ND      385
UTL     312
NOM     243
CAN     217
PAS     166
ABA      81
NCR      76
ARR      63
ADM      44
INC      17
CRT       2
SFD       1
Name: Disposition, dtype: int64

In [178]:
## Export the final frame as CSV and as JSON.
## The CSV reuses the name 'dataLimpia.csv' of the earlier intermediate export,
## so that file is overwritten.
dataFinal.to_csv('dataLimpia.csv', index=False)

# orient='index' writes one JSON entry per row, keyed by the row's index label
dataFinal.to_json("./dataFinal.json", orient='index')
