# Confidence Interval for Municipality Unemployment Correlation


In [23]:
import numpy as np
import pandas as pd

In [37]:
import unicodedata
from datetime import datetime

# Date bounds for the analysis; later cells compare the "Date" columns
# against these values when filtering rows.
START_DATE = datetime(year=2010, month=1, day=1)
END_DATE = datetime(year=2019, month=12, day=31)


def normalize_ascii(text):
    """Return ``text`` as upper-case ASCII.

    The string is NFKD-decomposed so accented letters split into a base
    letter plus combining marks; anything that cannot be encoded as ASCII
    (including those marks) is then dropped, and the result is upper-cased.

    Parameters
    ----------
    text : str
        The Unicode string to convert.

    Returns
    -------
    str
        The upper-cased, ASCII-only form of ``text``.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # "ignore" silently discards every code point with no ASCII encoding.
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("ascii").upper()



In [25]:
# Load the cleaned crime counts (first CSV column is used as the index) and
# parse "Date" as datetime in the same expression; values that do not match
# YYYY-MM-DD become NaT instead of raising.
crimes_df = pd.read_csv(
    "../data/clean/DelitosTipo1/DelitosTipo1-2010-2020.csv", index_col=0
).assign(
    Date=lambda frame: pd.to_datetime(
        frame["Date"], format="%Y-%m-%d", errors="coerce"
    )
)

crimes_df

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date
0,Adjuntas,34,0.0,0.0,0.0,1.0,16.0,16,1.0,2010-01-01
1,Ponce,262,11.0,0.0,28.0,27.0,50.0,125,21.0,2010-01-01
2,Peñuelas,12,0.0,1.0,2.0,1.0,2.0,5,1.0,2010-01-01
3,Patillas,17,0.0,0.0,1.0,3.0,8.0,5,0.0,2010-01-01
4,Orocovis,34,0.0,0.0,0.0,2.0,25.0,6,1.0,2010-01-01
...,...,...,...,...,...,...,...,...,...,...
10291,Orocovis,75,1.0,1.0,1.0,11.0,23.0,35,3.0,2020-12-01
10292,Patillas,66,2.0,2.0,2.0,21.0,17.0,21,1.0,2020-12-01
10293,Ponce,754,24.0,3.0,32.0,186.0,125.0,365,18.0,2020-12-01
10294,Mayagüez,241,15.0,4.0,8.0,58.0,47.0,93,16.0,2020-12-01


In [45]:
# Build the per-district December table used for the yearly comparison.
#
# Rows are grouped by district, in the order each district first appears in
# crimes_df, and only December rows dated on or before END_DATE are kept.
Districs = list(crimes_df["Distrito"].unique())

# Filter to December once, up front, instead of once per district.
december_crimes = crimes_df.loc[crimes_df["Date"].dt.month == 12]

# Concatenate all per-district slices in a single call: growing a frame with
# pd.concat inside a loop recopies the accumulated rows on every iteration,
# and seeding it with an empty DataFrame() triggers a FutureWarning on
# recent pandas versions.
distric_df_yearly = pd.concat(
    [
        december_crimes.loc[december_crimes["Distrito"] == distric]
        for distric in Districs
    ],
    ignore_index=True,
)

# .copy() makes the filtered frame independent of its parent so the column
# assignment below does not raise SettingWithCopyWarning.
distric_df_yearly = distric_df_yearly.loc[
    distric_df_yearly["Date"] <= END_DATE
].copy()

# Upper-case ASCII district names (e.g. "Peñuelas" -> "PENUELAS").
distric_df_yearly["Distrito"] = distric_df_yearly["Distrito"].apply(normalize_ascii)
distric_df_yearly



Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date
0,ADJUNTAS,294,0.0,0.0,16.0,12.0,137.0,121,8.0,2010-12-01
1,ADJUNTAS,267,0.0,0.0,5.0,9.0,114.0,136,3.0,2011-12-01
2,ADJUNTAS,314,1.0,0.0,7.0,23.0,108.0,169,6.0,2012-12-01
3,ADJUNTAS,198,0.0,0.0,9.0,12.0,62.0,109,6.0,2013-12-01
4,ADJUNTAS,117,0.0,0.0,6.0,5.0,32.0,69,5.0,2014-12-01
...,...,...,...,...,...,...,...,...,...,...
852,GUAYAMA,555,7.0,3.0,43.0,61.0,135.0,289,17.0,2015-12-01
853,GUAYAMA,518,12.0,3.0,27.0,70.0,145.0,253,8.0,2016-12-01
854,GUAYAMA,454,9.0,2.0,18.0,83.0,157.0,172,13.0,2017-12-01
855,GUAYAMA,326,7.0,4.0,16.0,57.0,62.0,175,5.0,2018-12-01


In [27]:
# Load the cleaned unemployment table: one row per date and municipality/area
# with the unemployment rate and the labour-force counts.
unemployment_df = pd.read_csv(
    filepath_or_buffer="../data/clean/Tasa de Desempleos/tasa_de_desempleos_y_mas_limpio.csv",
)
unemployment_df

Unnamed: 0,Date,Municipio o Area,Tasa de Desempleo,Num. Personas Desempleadas,Num. Personas Empleadas,Num. Personas Grupo Trabajador
0,2020-08-01,Puerto Rico SA,8.3,86123.0,951707.0,1037830.0
1,2020-07-01,Puerto Rico SA,8.5,87804.0,946949.0,1034753.0
2,2020-06-01,Puerto Rico SA,8.9,92311.0,942830.0,1035141.0
3,2020-05-01,Puerto Rico SA,9.0,93122.0,939024.0,1032146.0
4,2020-02-01,Puerto Rico SA,8.5,92713.0,975464.0,1048791.0
...,...,...,...,...,...,...
22700,2000-12-01,"San Juan-Carolina-Caguas, PR MSA",9.0,67176.0,758365.0,845088.0
22701,2000-11-01,"San Juan-Carolina-Caguas, PR MSA",8.6,66478.0,765566.0,838129.0
22702,2000-10-01,"San Juan-Carolina-Caguas, PR MSA",9.9,75071.0,756275.0,833436.0
22703,2000-09-01,"San Juan-Carolina-Caguas, PR MSA",8.8,71936.0,780914.0,837502.0


In [42]:
# Clean the unemployment table: normalise area names, parse dates, drop the
# blocklisted areas, and keep only December rows inside the analysis window.

# Column-wise apply (same form as the crime-data cell) instead of a row-wise
# DataFrame.apply(axis=1): it is faster and still returns a Series when the
# frame is empty.
unemployment_df["Municipio o Area"] = unemployment_df["Municipio o Area"].apply(
    normalize_ascii
)
# Values that do not match YYYY-MM-DD become NaT instead of raising.
unemployment_df["Date"] = pd.to_datetime(
    unemployment_df["Date"], format="%Y-%m-%d", errors="coerce"
)

# Area names to exclude outright, plus characters whose presence marks a name
# for exclusion (presumably multi-municipality areas -- TODO confirm).
region_blocklist = ["SAN JUAN-CAROLINA-CAGUAS, PR MSA", "PUERTO RICO SA", "PUERTO RICO"]
character_blocklist = ["-", ","]

# Match every blocklist entry as a literal substring (regex=False) so an
# entry containing a regex metacharacter such as "." or "(" cannot change
# which rows are matched.
area_names = unemployment_df["Municipio o Area"]
is_blocked_area = pd.Series(False, index=area_names.index)
for blocked_term in region_blocklist + character_blocklist:
    is_blocked_area |= area_names.str.contains(blocked_term, regex=False)
droppable_entries = unemployment_df[is_blocked_area]

# Get the data that's relevant to the dates chosen: December rows only,
# after START_DATE (exclusive) and up to END_DATE (inclusive).
is_december = unemployment_df["Date"].dt.month == 12
in_date_window = (unemployment_df["Date"] > START_DATE) & (
    unemployment_df["Date"] <= END_DATE
)

unemployment_df = unemployment_df.loc[
    ~is_blocked_area & is_december & in_date_window
].reset_index(drop=True)
# Optional check of the remaining areas and how many there are:
# (unemployment_df["Municipio o Area"].unique(), len(unemployment_df["Municipio o Area"].unique()))
unemployment_df

# TODO: create a function that computes the Pearson correlation (pearsonr) for each municipality and reports the outcome for the null hypothesis

Unnamed: 0,Date,Municipio o Area,Tasa de Desempleo,Num. Personas Desempleadas,Num. Personas Empleadas,Num. Personas Grupo Trabajador
0,2019-12-01,ADJUNTAS,14.8,755.0,3451.0,4154.0
1,2018-12-01,ADJUNTAS,11.3,654.0,3500.0,4214.0
2,2017-12-01,ADJUNTAS,14.1,647.0,3588.0,4399.0
3,2016-12-01,ADJUNTAS,16.2,832.0,3704.0,4663.0
4,2015-12-01,ADJUNTAS,19.0,721.0,3674.0,4572.0
...,...,...,...,...,...,...
775,2014-12-01,YAUCO,19.5,2025.0,8852.0,10841.0
776,2013-12-01,YAUCO,21.1,2706.0,8975.0,11438.0
777,2012-12-01,YAUCO,20.5,2314.0,9133.0,11423.0
778,2011-12-01,YAUCO,21.6,2169.0,9125.0,11692.0


In [29]:
# Select any remaining rows whose area name still contains one of the
# blocklisted characters.
filtered = unemployment_df.loc[
    unemployment_df["Municipio o Area"].str.contains(
        "|".join(character_blocklist), regex=True
    )
]
filtered

Unnamed: 0,Date,Municipio o Area,Tasa de Desempleo,Num. Personas Desempleadas,Num. Personas Empleadas,Num. Personas Grupo Trabajador
