# Análisis Exploratorio

    Fuente de información: NUSE
    Archivos: 
    NUSE 934 611(M) 2017-2018.dsv
    NUSE 934-611-611M ENERO2019.csv

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import plotly.express as px
# Show full cell contents and every column when a DataFrame is displayed.
# `None` means "no limit"; the former `-1` sentinel for max_colwidth was
# deprecated in pandas 1.0 in favour of `None`.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
from wordcloud import WordCloud, STOPWORDS

In [None]:
#Create folder to save figures
import os

# Create the output folder for figures; exist_ok avoids the
# check-then-create race and makes re-running the cell harmless.
os.makedirs("figuras_nuse", exist_ok=True)

In [None]:
def print_Data(df):
    """Display *df* as a styled table with its index hidden.

    Args:
        df: DataFrame to render through IPython's ``display``.
    """
    styler = df.style
    # Styler.hide_index() was deprecated in pandas 1.4 in favour of
    # Styler.hide(axis="index"); use the new API when it exists and fall
    # back to the old one on older pandas releases.
    if hasattr(styler, "hide"):
        styler = styler.hide(axis="index")
    else:
        styler = styler.hide_index()
    display(styler)

In [None]:
# Load the 2017-2018 extract (pipe-separated values).
data_location = '/Users/anamaria/Downloads/NUSE 934 611(M) 2017-2018.dsv'
data2018 = pd.read_csv(data_location, sep="|")

In [None]:
# Load the January 2019 extract (semicolon-separated values).
data_location = '/Users/anamaria/Downloads/NUSE 934-611-611M ENERO2019.csv'
data2019 = pd.read_csv(data_location, sep=";")

## Join datasets

In [None]:
# Stack both extracts into a single table. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each file keeps its own row labels,
# so the combined index would contain repeated labels.
frames = [data2018, data2019]
data = pd.concat(frames, ignore_index=True)

In [None]:
# Preview the first five rows of the combined table.
data.head(n=5)

In [None]:
# Quick quality report of the combined table. The two "Porcentaje" values
# are fractions in [0, 1] (mean of a boolean mask), not values scaled to 100.
for label, value in (
    ("Tamaño:", data.shape),
    ("size: ", data.size),
    ("Porcentaje Celdas Vacias:", data.isna().mean().mean()),
    ("Porcentaje Celdas con valor '-':", data.eq("-").mean().mean()),
    ("Filas duplicadas", data.duplicated().sum()),
):
    print(label, value)

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
data = data.drop_duplicates()

In [None]:
# Same quality report as before, now on the de-duplicated table. The two
# "Porcentaje" values are fractions in [0, 1], not values scaled to 100.
empty_fraction = data.isna().mean().mean()
dash_fraction = data.eq("-").mean().mean()
duplicate_rows = data.duplicated().sum()

print("Tamaño:", data.shape)
print("size: ", data.size)
print("Porcentaje Celdas Vacias:", empty_fraction)
print("Porcentaje Celdas con valor '-':", dash_fraction)
print("Filas duplicadas", duplicate_rows)

In [None]:
# Per-column overview: dtype, number of '-' placeholders and number of
# missing cells, one row per column of `data`.
pd.DataFrame(
    {
        "Tipo de dato": data.dtypes.to_numpy(),
        "Celdas con valor '-'": data.eq('-').sum().to_numpy(),
        "Celdas vacías": data.isna().sum().to_numpy(),
    },
    index=data.columns,
)

### Análisis campo 'FECHA'

In [None]:
# Parse 'FECHA' as datetimes (format is inferred by pandas).
data['FECHA'] = pd.to_datetime(data['FECHA'])

# Number of reports per 'FECHA' value, ordered by date.
subdata = (
    data.groupby(["FECHA"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
# First ten rows after sorting by count, highest first.
topdata = subdata.sort_values(by="Cantidad", ascending=False).head(10)

In [None]:
# Summary statistics of the per-'FECHA' counts table
# (describe() already returns a DataFrame).
subdata.describe()

In [None]:
# Line chart of report counts per 'FECHA' value, with a range slider on x.
fig = px.line(
    subdata,
    x="FECHA",
    y="Cantidad",
    color_discrete_sequence=px.colors.qualitative.Prism[1:],
).update_layout(
    title_text="Incident reports time series",
    xaxis_rangeslider_visible=True,
)
# Export a static PNG, then display the figure.
fig.write_image("figuras_nuse/fecha_ts.png")
fig.show()

In [None]:
# Horizontal bar chart of the rows in `topdata` (highest counts).
fig = px.bar(
    topdata,
    x="Cantidad",
    y="FECHA",
    orientation="h",
    color_discrete_sequence=px.colors.qualitative.Prism[2:],
).update_layout(title_text="Top dates of incident reports")
# Export a static PNG, then display the figure.
fig.write_image("figuras_nuse/fecha_bar.png")
fig.show()

In [None]:
# Derive the weekday name of each report from its parsed 'FECHA'.
weekday_names = data["FECHA"].dt.day_name()
data['day_of_week'] = weekday_names

In [None]:
# Incident reports by week day: count rows per (weekday, year) pair.
subdata = (
    data.groupby(["day_of_week", "ANIO"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
# Grouped bars per year, one bar per weekday in calendar order.
fig = px.bar(
    subdata,
    x="ANIO",
    y="Cantidad",
    color="day_of_week",
    barmode="group",
    category_orders={
        "day_of_week": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    },
).update_layout(title_text="Incident reports by day of week")
fig.write_image("figuras_nuse/day_week_year_bar.png")
fig.show()

### Análisis campo 'HORA'

In [None]:
# Number of reports per distinct 'HORA' value, ordered by the group key.
subdata = (
    data.groupby(["HORA"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
# First ten rows after sorting by count, highest first.
topdata = subdata.sort_values(by="Cantidad", ascending=False).head(10)

In [None]:
# Summary statistics of the per-'HORA' counts table
# (describe() already returns a DataFrame).
subdata.describe()

In [None]:
# Turn 'HORA' into a 4-character, zero-padded string (left padding) so it
# can be parsed as HHMM, then keep only the time-of-day part for plotting.
data['HORA'] = data["HORA"].astype(str).str.pad(4, side="left", fillchar="0")
parsed_hours = pd.to_datetime(data['HORA'], format='%H%M')
data['time_stamp'] = parsed_hours.dt.time

In [None]:
# Count reports per time of day, then draw them as a line chart with a
# range slider on the x axis.
subdata = (
    data.groupby(["time_stamp"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
fig = px.line(
    subdata,
    x="time_stamp",
    y="Cantidad",
    color_discrete_sequence=px.colors.qualitative.Prism[1:],
).update_layout(
    title_text="Incident reports time series - time (24 hr format)",
    xaxis_rangeslider_visible=True,
)
fig.write_image("figuras_nuse/hora_ts.png")
fig.show()

In [None]:
# Incident reports by week day and hour: count rows per (time, weekday).
subdata = (
    data.groupby(["time_stamp", "day_of_week"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)

# One facet row per weekday (calendar order), each drawn in its own colour.
fig = px.line(
    subdata,
    x="time_stamp",
    y="Cantidad",
    facet_row="day_of_week",
    color="day_of_week",
    width=800,
    height=1400,
    color_discrete_sequence=px.colors.qualitative.Prism[1:],
    category_orders={
        "day_of_week": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    },
    labels={"day_of_week": "day"},
).update_layout(
    title_text="Incident reports by day of week and hour",
    showlegend=False,
)
fig.write_image("figuras_nuse/day_week_hour_ts.png")
fig.show()

### Análisis campo 'ANIO'

In [None]:
# Cast 'ANIO' to string so it is handled as a label rather than a number.
data = data.astype({"ANIO": str})

In [None]:
# Number of reports per 'ANIO' value, ordered by the group key.
subdata = (
    data.groupby(["ANIO"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
# First ten rows after sorting by count, highest first.
topdata = subdata.sort_values(by="Cantidad", ascending=False).head(10)

In [None]:
# Summary statistics of the per-'ANIO' counts table
# (describe() already returns a DataFrame).
subdata.describe()

In [None]:
# Horizontal bars, one per year, coloured by year; the y axis is forced to
# categorical and the legend is hidden.
fig = px.bar(
    subdata,
    x="Cantidad",
    y="ANIO",
    color="ANIO",
    orientation="h",
    color_discrete_sequence=px.colors.qualitative.Prism[1:],
)
fig.update_layout(
    title_text="Incident reports by year",
    yaxis_type="category",
    showlegend=False,
)
fig.write_image("figuras_nuse/año_bar.png")
fig.show()

### Análisis campo 'MES'

In [None]:
# Cast 'MES' to string so months are handled as labels.
data = data.astype({"MES": str})

In [None]:
# Number of reports per 'MES' value, ordered by the group key.
subdata = (
    data.groupby(["MES"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
# First ten rows after sorting by count, highest first.
topdata = subdata.sort_values(by="Cantidad", ascending=False).head(10)

In [None]:
# Summary statistics of the per-'MES' counts table
# (describe() already returns a DataFrame).
subdata.describe()

In [None]:
# Horizontal bars, one per month, listed in calendar order "1".."12".
fig = px.bar(
    subdata,
    x="Cantidad",
    y="MES",
    color="MES",
    orientation="h",
    color_discrete_sequence=px.colors.qualitative.Prism[:],
    category_orders={"MES": [str(month) for month in range(1, 13)]},
)
fig.update_layout(
    title_text="Incident reports by month",
    yaxis_type="category",
    showlegend=False,
)
fig.write_image("figuras_nuse/mes_bar.png")
fig.show()

In [None]:
# Graph incidents by month and day of week: count rows per (month, weekday).
subdata = (
    data.groupby(["MES", "day_of_week"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)

# Bars per month, coloured by weekday in calendar order.
fig = px.bar(
    subdata,
    x="MES",
    y="Cantidad",
    color="day_of_week",
    category_orders={
        "day_of_week": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    },
    labels={"day_of_week": "day"},
    color_discrete_sequence=px.colors.qualitative.Prism[1:],
).update_layout(title_text="Incident reports by day of week and month")
fig.write_image("figuras_nuse/day_week_month_bar.png")
fig.show()

### Análisis campo 'PERIODO_TS'

In [None]:
# Number of reports per 'PERIODO_TS' value, ordered by the group key.
subdata = (
    data.groupby(["PERIODO_TS"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
# First ten rows after sorting by count, highest first.
topdata = subdata.sort_values(by="Cantidad", ascending=False).head(10)

In [None]:
# Summary statistics of the per-'PERIODO_TS' counts table
# (describe() already returns a DataFrame).
subdata.describe()

In [None]:
# Horizontal bars, one per 'PERIODO_TS' value, coloured by that value.
fig = px.bar(
    subdata,
    x="Cantidad",
    y="PERIODO_TS",
    color="PERIODO_TS",
    orientation="h",
    color_discrete_sequence=px.colors.qualitative.Prism[1:],
)
fig.update_layout(showlegend=False, title_text="Incident reports by 'PERIODO_TS'")
fig.write_image("figuras_nuse/periodo_bar.png")
fig.show()

### Análisis campos 'COD_LOCALIDAD' y 'LOCALIDAD'

In [None]:
# Number of reports per (COD_LOCALIDAD, LOCALIDAD) pair, ordered by key.
subdata = (
    data.groupby(["COD_LOCALIDAD", "LOCALIDAD"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
# First twenty rows after sorting by count, highest first.
topdata = subdata.sort_values(by="Cantidad", ascending=False).head(20)

In [None]:
# Summary statistics of the per-localidad counts table
# (describe() already returns a DataFrame).
subdata.describe()

In [None]:
# Horizontal bars for the rows in `topdata`, one colour per localidad.
fig = px.bar(
    topdata,
    x="Cantidad",
    y="LOCALIDAD",
    color="LOCALIDAD",
    orientation="h",
    color_discrete_sequence=px.colors.qualitative.Prism[:],
)
fig.update_layout(
    title_text="Top incident reports by 'localidades'",
    yaxis_type="category",
    showlegend=False,
)
fig.write_image("figuras_nuse/localidad_bar.png")
fig.show()

### Análisis campos 'COD_UPZ' y 'UPZ'

In [None]:
# Number of reports per (COD_UPZ, UPZ) pair, ordered by key.
subdata = (
    data.groupby(["COD_UPZ", "UPZ"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
# First ten rows after sorting by count, highest first.
topdata = subdata.sort_values(by="Cantidad", ascending=False).head(10)

In [None]:
# Summary statistics of the per-UPZ counts table
# (describe() already returns a DataFrame).
subdata.describe()

In [None]:
# Horizontal bars for the rows in `topdata`, one colour per UPZ.
fig = px.bar(
    topdata,
    x="Cantidad",
    y="UPZ",
    color="UPZ",
    orientation="h",
    color_discrete_sequence=px.colors.qualitative.Prism[:],
)
fig.update_layout(
    title_text="Top incident reports by 'UPZ'",
    yaxis_type="category",
    showlegend=False,
)
fig.write_image("figuras_nuse/upz_bar.png")
fig.show()

### Análisis campos 'COD_SEC_CATAST' y 'SEC_CATASTRAL'

In [None]:
# Number of reports per (COD_SEC_CATAST, SEC_CATASTRAL) pair, ordered by key.
subdata = (
    data.groupby(["COD_SEC_CATAST", "SEC_CATASTRAL"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
# First ten rows after sorting by count, highest first.
topdata = subdata.sort_values(by="Cantidad", ascending=False).head(10)

In [None]:
# Summary statistics of the per-SEC_CATASTRAL counts table
# (describe() already returns a DataFrame).
subdata.describe()

In [None]:
# Horizontal bars for the rows in `topdata`, one colour per SEC_CATASTRAL.
fig = px.bar(topdata, x='Cantidad', y='SEC_CATASTRAL', orientation='h', color='SEC_CATASTRAL',
             color_discrete_sequence = px.colors.qualitative.Prism[0:])
# The title names the plotted field; it previously said 'UPZ', copied from
# the UPZ chart.
fig.update_layout(title_text="Top incident reports by 'SEC_CATASTRAL'",
                 yaxis={"type":"category"}, showlegend=False)
fig.write_image("figuras_nuse/sec_catastr_bar.png")
fig.show()

### Análisis campos 'COD_BARRIO' y 'BARRIO'

In [None]:
# Number of reports per (COD_BARRIO, BARRIO) pair, ordered by key.
subdata = (
    data.groupby(["COD_BARRIO", "BARRIO"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
# First ten rows after sorting by count, highest first.
topdata = subdata.sort_values(by="Cantidad", ascending=False).head(10)

In [None]:
# Summary statistics of the per-BARRIO counts table
# (describe() already returns a DataFrame).
subdata.describe()

In [None]:
# Horizontal bars for the rows in `topdata`, one colour per BARRIO.
fig = px.bar(
    topdata,
    x="Cantidad",
    y="BARRIO",
    color="BARRIO",
    orientation="h",
    color_discrete_sequence=px.colors.qualitative.Prism[:],
)
fig.update_layout(
    title_text="Top incident reports by 'BARRIO'",
    yaxis_type="category",
    showlegend=False,
)
fig.write_image("figuras_nuse/barrio_bar.png")
fig.show()

### Mapa con reporte de incidencias (LATITUD, LONGITUD)

In [None]:
# Number of reports per distinct (LONGITUD, LATITUD) coordinate pair.
position = (
    data.groupby(['LONGITUD', 'LATITUD'])
    .size()
    .reset_index(name="Cantidad")
)

In [None]:
# Summary statistics of the coordinates and their report counts
# (describe() already returns a DataFrame).
position.describe()

In [None]:
import os

# Take the Mapbox token from the environment instead of hard-coding it in
# the notebook; an unset variable yields the same empty token as before.
mapbox_token = os.environ.get("MAPBOX_ACCESS_TOKEN", "")
px.set_mapbox_access_token(mapbox_token)

# One marker per coordinate pair, sized and coloured by its report count.
fig = px.scatter_mapbox(position, lat="LATITUD", lon="LONGITUD",zoom=10,size='Cantidad',color='Cantidad')
if not mapbox_token:
    # Without a token, switch to the OpenStreetMap raster tiles, which
    # Plotly documents as not requiring a Mapbox access token.
    fig.update_layout(mapbox_style="open-street-map")
fig.write_image("figuras_nuse/mapa.png")
fig.show()

### Análisis campo 'STR_DIRECCION_INCIDENTE'

In [None]:
# Number of reports per 'STR_DIRECCION_INCIDENTE' value, ordered by key.
subdata = (
    data.groupby(["STR_DIRECCION_INCIDENTE"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
# First ten rows after sorting by count, highest first; show them as a table.
topdata = subdata.sort_values(by="Cantidad", ascending=False).head(10)
print_Data(topdata)

In [None]:
# Summary statistics of the per-address counts table
# (describe() already returns a DataFrame).
subdata.describe()

In [None]:
# Horizontal bars for the rows in `topdata`, one colour per address.
fig = px.bar(
    topdata,
    x="Cantidad",
    y="STR_DIRECCION_INCIDENTE",
    color="STR_DIRECCION_INCIDENTE",
    orientation="h",
    color_discrete_sequence=px.colors.qualitative.Prism[:],
)
fig.update_layout(
    title_text="Top incident reports by 'STR_DIRECCION_INCIDENTE'",
    yaxis_type="category",
    showlegend=False,
)
fig.write_image("figuras_nuse/direccion_bar.png")
fig.show()

### Análisis campo 'TIPO_DETALLE'

In [None]:
# Number of reports per 'TIPO_DETALLE' value, shown as a table.
subdata = (
    data.groupby(["TIPO_DETALLE"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
print_Data(subdata)


### Análisis campo 'ESTADO_INCIDENTE'

In [None]:
# Number of reports per 'ESTADO_INCIDENTE' value.
subdata = (
    data.groupby(["ESTADO_INCIDENTE"])
    .size()
    .sort_index()
    .reset_index(name="Cantidad")
)
print(subdata.shape)
print_Data(subdata)

# Mean, standard deviation, max and min of the counts, rounded to 2 decimals
# and shown as a one-column table with an empty column header.
counts = subdata["Cantidad"]
pd.DataFrame(
    np.around([counts.mean(), counts.std(), counts.max(), counts.min()], 2),
    index=['Media', "Desviación Estandar", "Máximo", "Mínimo"],
    columns=[""],
)

### Análisis campo 'LOG_TEXT'

In [None]:
# Build one lowercase corpus string from every non-null 'LOG_TEXT' entry.
# Joining explicitly matters: str() of a large NumPy array is abbreviated
# with "..." (only the first and last few items are printed), and missing
# values would otherwise show up as the word "nan".
text = " ".join(data["LOG_TEXT"].dropna().astype(str).str.lower())

# Start from wordcloud's built-in stop words and add common Spanish ones.
stopwords = set(STOPWORDS)
stopwords.update(['de','la','el','que','y','con','en'])

In [None]:
# `text` may be a single string or an array of entries. Join array entries
# explicitly (skipping non-string items such as NaN) instead of calling
# str() on the array, which abbreviates large arrays with "...".
corpus = text if isinstance(text, str) else " ".join(t for t in text if isinstance(t, str))

wordcloud = WordCloud(width = 3000,height = 2000,background_color = 'black',stopwords = stopwords, max_words=1000).generate(corpus)
fig = plt.figure(figsize = (20, 10),facecolor = 'k',edgecolor = 'k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
# Save into "figuras_nuse", the folder this notebook creates and uses for all
# other figures; the previous "figuras/" target is never created here.
plt.savefig("figuras_nuse/log_text.png",dpi=300,bbox_inches = "tight")
plt.show()