In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel("cordoba_data_23_24.xlsx", engine="openpyxl")

In [3]:
df.dtypes

objectid                                  int64
evento                                   object
Sex                                      object
Birth Date (YYYY-MM-DD)          datetime64[ns]
Age                                     float64
Age Group                                object
Epidemic week of reporting              float64
Week started feeling symptoms            object
Country of residence                     object
Province of residence                    object
Department of residence                  object
City of residence                        object
Case classification                      object
Date Case Reported               datetime64[ns]
Date of Initial Symptoms         datetime64[ns]
ftm                              datetime64[ns]
Travel Status                            object
Neighborhood                             object
id_caso                                   int64
validacion                               object
dtype: object

In [4]:
df.columns = ['ID', 'Event', 'Sex', 'BirthDate', 'Age',
       'AgeGroup', 'EpidemicWeekofReporting',
       'SymptomsWeekofReporting', 'CountryofResidence',
       'ProvinceofResidence', 'DepartmentofResidence', 'CityofResidence',
       'CaseClassification', 'DateCaseReported', 'DateofInitialSymptoms',
       'ftm', 'TravelStatus', 'Neighborhood', 'id_caso', 'Validation']

### Rename Columns

In [5]:
df = df.sort_values("ID") ### Sort ascending by ID

In [6]:
df['ID'] = df.ID.map(lambda i: i - 14455) ### Subtract 14455 from 'ID' to start from 1

In [7]:
df.isnull().any() ### check for missing data

ID                         False
Event                      False
Sex                         True
BirthDate                   True
Age                         True
AgeGroup                   False
EpidemicWeekofReporting     True
SymptomsWeekofReporting     True
CountryofResidence          True
ProvinceofResidence        False
DepartmentofResidence       True
CityofResidence             True
CaseClassification         False
DateCaseReported           False
DateofInitialSymptoms       True
ftm                         True
TravelStatus                True
Neighborhood                True
id_caso                    False
Validation                  True
dtype: bool

In [8]:
df['Age'] = df['Age'].fillna(0) ### fill in missing data in 'Age' with 0

In [9]:
df['BirthDate'] = df['BirthDate'].fillna(0) ### fill in missing data in 'BirthDate' with 0
df['Sex'] = df['Sex'].fillna('n/a') ### fill in missing data in 'Sex' with 'n/q'
df['EpidemicWeekofReporting'] = df['EpidemicWeekofReporting'].fillna(0) ### fill in missing data in 'EpidemicWeekofReporting' with 0
df['SymptomsWeekofReporting'] = df['SymptomsWeekofReporting'].fillna(0) ### fill in missing data in 'SymptomsWeekofReporting' with 0
df['CountryofResidence'] = df['CountryofResidence'].fillna('n/a') ### fill in missing data in 'CountryofResidence' with 'n/a'
df['DepartmentofResidence'] = df['DepartmentofResidence'].fillna('n/a') ### fill in missing data in 'DepartmentofResidence' with 'n/a'
df['CityofResidence'] = df['CityofResidence'].fillna('n/a') ### fill in missing data in 'CityofResidence' with 'n/a'
df['DateofInitialSymptoms'] = df['DateofInitialSymptoms'].fillna(0) ### fill in missing data in 'DateofInitialSymptoms' with 0
df['TravelStatus'] = df['TravelStatus'].fillna('n/a') ### fill in missing data in 'TravelStatus' with 'n/a'
df['Neighborhood'] = df['Neighborhood'].fillna('n/a') ### fill in missing data in 'Neighborhood' with 'n/a'
df['Validation'] = df['Validation'].fillna('n/a') ### fill in missing data in 'Validation' with 'n/a'

In [10]:
df['Age'] = df.Age.map(int) ### change data type of 'Age' column to int

In [11]:
df.AgeGroup = df.AgeGroup.map(lambda s: s.replace('a', '-')) ### replace 'a' with '-' in 'AgeGroup'
df.AgeGroup = df.AgeGroup.map(lambda s: s.replace('05-9', '5-9')) ### replace in 'AgeGroup'
df.AgeGroup = df.AgeGroup.map(lambda s: s.replace('65ymás', '65+')) ### replace in 'AgeGroup'

In [12]:
df

Unnamed: 0,ID,Event,Sex,BirthDate,Age,AgeGroup,EpidemicWeekofReporting,SymptomsWeekofReporting,CountryofResidence,ProvinceofResidence,DepartmentofResidence,CityofResidence,CaseClassification,DateCaseReported,DateofInitialSymptoms,ftm,TravelStatus,Neighborhood,id_caso,Validation
0,1,Dengue,F,2016-12-12 00:00:00,7,5-9,10.0,9,Argentina,Córdoba,Capital,CORDOBA,Caso confirmado por nexo epidemiológico autóctono,2024-03-08,2024-02-28 00:00:00,NaT,Sin antecedente de viaje a zona afectada en lo...,JOSE IGNACIO DIAZ 2A SECCION,1,Localizados
1,2,Dengue,F,1974-06-25 00:00:00,49,45-65,9.0,9,Argentina,Córdoba,Capital,CORDOBA,Caso confirmado por nexo epidemiológico autóctono,2024-02-29,2024-02-27 00:00:00,NaT,,FERROVIARIO MITRE,2,Localizados
2,3,Dengue,F,1999-04-08 00:00:00,24,20-24,9.0,9,Argentina,Córdoba,Capital,CORDOBA,Caso de Dengue en brote con laboratorio (+),2024-02-28,2024-02-25 00:00:00,2024-02-26,Sin antecedente de viaje a zona afectada en lo...,FERROVIARIO MITRE,3,Localizados
3,4,Dengue,F,1986-07-04 00:00:00,37,35-44,10.0,9,Argentina,Córdoba,Capital,CORDOBA,Caso confirmado por nexo epidemiológico autóctono,2024-03-08,2024-02-29 00:00:00,NaT,,SAN PEDRO NOLASCO,4,Localizados
4,5,Dengue,M,2009-12-29 00:00:00,14,10-14,10.0,8,Argentina,Córdoba,Capital,CORDOBA,Caso confirmado por nexo epidemiológico autóctono,2024-03-04,2024-02-24 00:00:00,NaT,Sin antecedente de viaje a zona afectada en lo...,SAN PEDRO NOLASCO,5,Localizados
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15309,21443,Dengue,F,2016-06-10 00:00:00,7,5-9,16.0,16,Argentina,Córdoba,Capital,SD,Caso confirmado por nexo epidemiológico autóctono,2024-04-17,2024-04-14 00:00:00,NaT,1713225600000,,18250,
15310,21444,Dengue,M,2013-12-29 00:00:00,10,10-14,17.0,16,Argentina,Córdoba,Capital,CORDOBA,Caso confirmado por nexo epidemiológico autóctono,2024-04-22,2024-04-19 00:00:00,NaT,1713744000000,,18871,
15311,21445,Dengue,F,1996-10-29 00:00:00,27,25-34,17.0,16,Argentina,Córdoba,Capital,CORDOBA,Caso confirmado por nexo epidemiológico autóctono,2024-04-23,2024-04-18 00:00:00,NaT,1713657600000,,18352,
15312,21446,Dengue,F,1962-12-17 00:00:00,61,45-65,16.0,16,Argentina,Córdoba,Capital,SD,Caso confirmado por nexo epidemiológico autóctono,2024-04-17,2024-04-15 00:00:00,NaT,1713312000000,AUTODROMO,18312,
