# Import Libraries

In [None]:
import pandas as pd
import pathlib as pl
import plotly.express as px
import folium

In [None]:
def create_output_directories(base_path):
    """
    Ensure the top-level project folders exist under ``base_path``.

    The folders 'data', 'img' and 'notebooks' are created (including any
    missing parents); folders that already exist are left untouched.

    Parameters
    ----------
    base_path : pathlib.Path
        Directory under which the folders are created.

    Returns
    -------
    list
        The three folder paths, in the order data, img, notebooks.
    """
    targets = [base_path.joinpath(name) for name in ('data', 'img', 'notebooks')]

    for target in targets:
        target.mkdir(parents=True, exist_ok=True)

    return targets

In [None]:
def create_data_directories(base_path):
    """
    Ensure the 'raw' and 'processed' sub-folders exist under ``base_path``.

    Missing parents are created as well; folders that already exist are
    left untouched.

    Parameters
    ----------
    base_path : pathlib.Path
        Directory under which the two folders are created.

    Returns
    -------
    list
        The two folder paths, in the order raw, processed.
    """
    raw_dir = base_path.joinpath('raw')
    processed_dir = base_path.joinpath('processed')
    created = [raw_dir, processed_dir]

    for directory in created:
        directory.mkdir(parents=True, exist_ok=True)

    return created

In [None]:
# The parent of the current working directory is used as the project root.
current_path    = pl.Path.cwd().parent
# output_folders -> [data, img, notebooks]; data_folders -> [data/raw, data/processed]
output_folders  = create_output_directories(current_path)
data_folders    = create_data_directories(output_folders[0])
# Excel workbooks found in data/raw. glob() yields files in arbitrary order,
# so the result is sorted to make docs_list[0] the same file on every run.
# Names starting with '~$' are the lock files Excel leaves next to an open
# workbook; they match '*.xlsx' but are not readable workbooks, so skip them.
docs_list       = sorted(
    path for path in data_folders[0].glob('*.xlsx')
    if not path.name.startswith('~$')
)

# Import the dataset

In [None]:
# Load sheet 'Sheet0' of the first workbook found in the raw-data folder.
df          = pd.read_excel(docs_list[0], sheet_name='Sheet0')
# Drop the row labelled 0 (the first row below the header).
# NOTE(review): presumably a second header / question-text row - confirm.
df          = df.drop(0, axis=0)

# Clean the free-text Age answers. Non-missing cells are cast to str first:
# the .str accessor returns NaN for non-string cells, so an age stored as a
# number in the workbook would otherwise be lost in the dropna() below.
age_text    = df['Age'].map(lambda value: value if pd.isna(value) else str(value))
# Strip the textual decorations seen in the answers so only the number remains.
df['Age']   = age_text.str.replace(r'(años|\. Años|AÑOS|de|Y i)', '', regex=True)
df          = df.dropna(subset=['Age']) # Drop rows with NaN values in Age column
# Anything that still is not a number becomes NaN and is removed by the filter below.
df['Age']   = pd.to_numeric(df['Age'], errors='coerce')
# Set a filter for age between 30 and 64 (inclusive). The .copy() makes df an
# independent frame, so columns assigned in later cells are not written to a
# slice of the unfiltered data (avoids pandas' SettingWithCopyWarning).
df          = df[(df['Age'] >= 30) & (df['Age'] <= 64)].copy()
df.head()

In [None]:
# Show the dtype of every column of the filtered DataFrame.
df.dtypes

In [None]:
# Print every column label of the DataFrame.
columnas = df.columns # temporary variable, to be removed
print(columnas)

In [None]:
# Report the size of the filtered DataFrame.
print(f'Number of rows: {df.shape[0]}')
print(f'Number of columns: {df.shape[1]}')
# Percentage of rows removed relative to a hard-coded reference of 1151 rows.
# NOTE(review): 1151 is presumably the row count of the raw export - confirm,
# and update it whenever the input workbook changes.
print(f'Loss information: {100*(1-df.shape[0]/1151)}%')

In [None]:
# Normalise the city answers (column Q69): trim surrounding whitespace and
# upper-case them so that differently typed variants compare equal.
df['Q69'] = df['Q69'].str.strip().str.upper()

# Collapse the listed variants onto the labels CALI, MEDELLÍN and BOGOTÁ.
# Keys must be upper-case because the column was upper-cased above; answers
# that are not listed keep their normalised text.
# NOTE: an earlier version of this mapping also sent the following answers to
# CALI; they are currently left unmapped: JAMUNDI, JAMUNDÍ, 'JAMUNDI, VALLE',
# PALMIRA, YUMBO, VALLE, VALLE DEL CAUCA, EL CERRITO, BUENAVENTURA, BUGA,
# ZARZAL, CALIMA, TULUA.
replace_dict = {
    'SANTIAGO DE CALI': 'CALI',
    'CAÑASGORDAS': 'CALI',
    'CALI VALLE': 'CALI',
    'CALI VALLE DEL CAUCA': 'CALI',
    'EN SANTIAGO SDE CALL': 'CALI',
    'MEDELLIN': 'MEDELLÍN',
    'MEDELIN': 'MEDELLÍN',
    'ENVIGADO': 'MEDELLÍN',
    'RIONEGRO': 'MEDELLÍN',
    'SABANETA': 'MEDELLÍN',
    'BELLO': 'MEDELLÍN',
    'GIRARDOT': 'MEDELLÍN',
    'ITAGUI': 'MEDELLÍN',
    'ITAGÜÍ': 'MEDELLÍN',
    'BOGOTA': 'BOGOTÁ',
    'BOGOTÁ D.C': 'BOGOTÁ',
    'BOGOTÁ D.C.': 'BOGOTÁ'
}
# Series.replace with a dict only swaps cells that equal a key exactly.
df['Q69'] = df['Q69'].replace(replace_dict)


# Number of respondents per normalised city, most frequent first.
ciudades = df['Q69'].value_counts() # temporary variable, to be removed
print(ciudades)


In [None]:
# Save the per-city counts as ciudades.csv in the processed-data folder.
ciudades.to_csv(data_folders[1].joinpath('ciudades.csv'))

In [None]:
def create_bar_chart(df, column_name, file_name, plot_title, output_folder):
    """
    Plot how often each value of a column occurs, show it and save it as HTML.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``column_name``.
    column_name : str
        Column whose value frequencies are plotted.
    file_name : str
        Suffix of the output file, written as 'bar_chart_<file_name>.html'.
    plot_title : str
        Text appended to 'Bar Chart of ' to form the figure title.
    output_folder : pathlib.Path
        Folder in which the HTML file is written.
    """
    # One bar per distinct value; bar height is the number of occurrences.
    counts = df[column_name].value_counts()
    figure = px.bar(x=counts.index, y=counts.values)

    layout_options = {
        'title': 'Bar Chart of '+ plot_title,
        'xaxis_title': "Unique Values",
        'yaxis_title': "Count",
    }
    figure.update_layout(**layout_options)

    # Display the figure first, then persist it next to the other outputs.
    figure.show()

    html_name = 'bar_chart_'+file_name + '.html'
    figure.write_html(output_folder.joinpath(html_name))


In [None]:
# Bar chart of respondents per city (Q69), saved in the img folder.
create_bar_chart(df, 'Q69', 'cities', 'Cities', output_folders[1])

In [None]:
# Number of respondents per age, ordered by age rather than by frequency.
edades = df['Age'].value_counts().sort_index()
# df['Age'].describe()

In [None]:
# Keep only the respondents whose normalised city (Q69) is CALI. The explicit
# .copy() makes cali_df an independent DataFrame: later cells assign to its
# 'Q78' column, and doing that on a boolean-mask slice of df raises pandas'
# SettingWithCopyWarning.
cali_df = df[df['Q69'] == 'CALI'].copy()
# Export the subset to the processed-data folder as both CSV and Excel.
cali_df.to_csv(data_folders[1].joinpath('cali.csv'))
cali_df.to_excel(data_folders[1].joinpath('cali.xlsx'))
cali_df.head()

In [None]:
# Report the size of the Cali subset.
print(f'Number of rows in Cali: {cali_df.shape[0]}')
print(f'Number of columns in Cali: {cali_df.shape[1]}')

In [None]:
# Bar chart of the age distribution within the Cali subset, saved in the img folder.
create_bar_chart(cali_df, 'Age', 'age', 'Age', output_folders[1])

In [None]:
# Normalise the free-text codes in column Q78: trim surrounding whitespace
# and upper-case them before looking them up in the mapping below.
cali_df['Q78'] = cali_df['Q78'].str.strip().str.upper()

# Map every observed spelling of a code onto one canonical form:
# an 11-digit id followed by '-JAVCALI' or '-USBCALI'.
# Keys are matched against the upper-cased column, so they must be upper-case
# themselves; a mixed-case key could never match and each key appears once.
replace_dict = {
    '(00008909547) LESLIE PATRICIA POLANCO VELASCO': '00008909547-JAVCALI',
    'JAVCALI(00008909547)'            : '00008909547-JAVCALI',
    'JAVCALI(00008909547)XIOMARA AMU' : '00008909547-JAVCALI',
    'JAVCALI00008909547'              : '00008909547-JAVCALI',
    '00008909547'                     : '00008909547-JAVCALI',
    '8968160JAVECALI'   : '00008968160-JAVCALI',
    'JAVCALI 8959170'   : '00008959170-JAVCALI',
    'JAVCALI(8960388)'  : '00008960388-JAVCALI',
    'JAVCALI- (8959446)': '00008959446-JAVCALI',
    'JAVCALI8958506'    : '00008958506-JAVCALI',
    'JAVCALI8959955'    : '00008959955-JAVCALI',
    'JAVCALI8963141'    : '00008963141-JAVCALI',
    'JAVECALI8953122'   : '00008953122-JAVCALI',
    'JAVECALI8958190'   : '00008958190-JAVCALI',
    '8948180'           : '00008948180-JAVCALI',
    '8958113'           : '00008958113-JAVCALI',
    '300000121840'         : '30000121840-USBCALI',
    '30000085639'          : '30000085639-USBCALI',
    '30000116835'          : '30000116835-USBCALI',
    '30000116835 USB CALI' : '30000116835-USBCALI',
    '30000121119'          : '30000121119-USBCALI',
    '30000121119USBCALI'   : '30000121119-USBCALI',
    '30000121234 USBCALI'  : '30000121234-USBCALI',
    '30000121417USBCALI'   : '30000121417-USBCALI',
    '30000122711USB'       : '30000122711-USBCALI',
    '30000124761USBCALI'   : '30000124761-USBCALI',
    '3000122700USBCALI'    : '30000122700-USBCALI',
    '3000124761USBCALI'    : '30000124761-USBCALI',
    'USB-CALI-30000122709' : '30000122709-USBCALI',
    'USB-CALI-30000124779' : '30000124779-USBCALI',
    '3000000121840'        : '30000121840-USBCALI',
    'USB30000124779'       : '30000124779-USBCALI',
}
# Only cells equal to a key are replaced; any other answer is kept as typed.
cali_df['Q78'] = cali_df['Q78'].replace(replace_dict)
# Show how many respondents share each code, ordered by code.
cali_df['Q78'].value_counts().sort_index()

In [None]:
# Bar chart of respondents per normalised code (Q78) in the Cali subset, saved in the img folder.
create_bar_chart(cali_df, 'Q78', 'code', 'Code', output_folders[1])

# Georeferencing the dataset

In [None]:
def print_map(df, save_name, output_folder=None):
    """
    Save a folium map with one marker per geolocated row of ``df``.

    Marker coordinates and popup text are read by column position:
    13 (latitude), 14 (longitude) and 18 (popup).
    NOTE(review): positions 13/14 are presumably the 'LocationLatitude' /
    'LocationLongitude' columns of the export - confirm if the layout changes.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to plot. Rows lacking a latitude or a longitude are skipped.
    save_name : str
        Output file name without extension; '.html' is appended.
    output_folder : pathlib.Path, optional
        Folder in which the map is written. Defaults to the notebook-level
        ``output_folders[1]``, which is where the map was always written
        before this parameter existed.
    """
    lat_pos, lon_pos, popup_pos = 13, 14, 18

    if output_folder is None:
        output_folder = output_folders[1]

    # Remove the rows with missing values in the column 'LocationLatitude'
    df = df.dropna(subset=['LocationLatitude'])
    # Also require both positional coordinate columns to be present: a marker
    # cannot be placed when either coordinate is NaN.
    has_coordinates = df.iloc[:, [lat_pos, lon_pos]].notna().all(axis=1)
    df = df[has_coordinates]

    # Create a folium map with a fixed centre and zoom level.
    # NOTE(review): 'projection' is forwarded as an extra keyword argument;
    # check that the installed folium version actually uses it.
    m = folium.Map(location=[4.570868, -74.082125], zoom_start=6, projection="mercator")

    # Add one marker per remaining observation.
    rows = zip(df.iloc[:, lat_pos], df.iloc[:, lon_pos], df.iloc[:, popup_pos])
    for latitude, longitude, popup_text in rows:
        m.add_child(folium.Marker([latitude, longitude], popup=popup_text))

    # Save the map as an HTML file
    m.save(output_folder.joinpath(save_name + '.html'))


In [None]:
# Write one map for the Cali subset and one for the full filtered dataset.
print_map(cali_df, 'cali_map')
print_map(df, 'colombia_map')

In [None]:
# Share of rows in the Cali subset that have a non-missing code in Q78.
codigos = cali_df['Q78'].count()  # count() ignores missing values
total = len(cali_df)
print(f'Porcentaje de personas que asignaron codigo: {100*codigos/total}%')