In [45]:
# Import libraries
import pandas as pd
import numpy as np

In [46]:
# Load the per-region dengue case counts, then show (rows, columns)
dengue = pd.read_csv(filepath_or_buffer='../Data/dengue_cases_region_final.csv')
dengue.shape

(4996, 3)

In [47]:
# Preview the first five rows of the dengue frame
dengue.head(n=5)

Unnamed: 0,region,recent_cases,date
0,Admiralty,5,2013-11-24
1,Admiralty,6,2013-12-01
2,Admiralty,8,2013-12-15
3,Admiralty,5,2013-12-22
4,Admiralty,3,2013-12-29


In [48]:
# Parse the 'date' strings into datetimes, then use that column as the row index
dengue['date'] = pd.to_datetime(dengue['date'])
dengue = dengue.set_index('date')

In [49]:
# Collect the distinct region names (in order of first appearance) and count them
regions = list(dengue['region'].unique())
len(regions)

41

In [50]:
# Function to extract a single region from the dengue dataframe and total its recent_cases per date
def split_df (dataframe, region):
    """Return the per-date case totals for a single region.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame indexed by 'date' that has 'region' and 'recent_cases' columns.
    region : str
        Value of the 'region' column to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date', with 'recent_cases' summed over every row sharing
        that date and a constant 'region' column holding ``region``.
    """
    in_region = dataframe['region'] == region
    # Double brackets keep a one-column DataFrame rather than a Series
    cases = dataframe.loc[in_region, ['recent_cases']]
    totals = cases.groupby('date').sum()
    totals['region'] = region
    return totals

In [51]:
# Aggregate every region with split_df, then stack the results into a single dataframe.
# The per-region frames are collected first and concatenated in ONE pd.concat call:
# concatenating inside the loop re-copies the accumulated frame on every iteration,
# and seeding the result with an empty DataFrame relies on concat behaviour for
# empty entries that pandas has deprecated.
region_frames = [split_df(dengue, r) for r in regions]
# pd.concat raises on an empty list, so fall back to an empty frame as before
dengues = pd.concat(region_frames, axis=0) if region_frames else pd.DataFrame()

In [52]:
# Turn the 'date' index back into an ordinary column so it can be used as a merge key
dengues = dengues.reset_index(drop=False)

In [24]:
# Load the Google Trends search-interest table and the weather report table
trends = pd.read_csv(filepath_or_buffer='../Data/google_trends.csv')
weather = pd.read_csv(filepath_or_buffer='../Data/weather.csv')

In [25]:
# Preview the first five rows of the Google Trends data
trends.head(n=5)

Unnamed: 0,date,aedes,dengue,fever,headache,nosebleed,vomit
0,3/1/2016,0,78,75,81,0,42
1,10/1/2016,6,74,77,76,0,40
2,17/1/2016,13,68,88,79,48,70
3,24/1/2016,41,94,91,68,30,58
4,31/1/2016,23,85,96,52,31,56


In [33]:
# Preview the first five rows of the weather data
weather.head(n=5)

Unnamed: 0,date,daily_rainfall_total,highest_30_min_rainfall,highest_60_min_rainfall,highest_120_min_rainfall,mean_temperature,maximum_temperature,minimum_temperature,mean_wind_speed,max_wind_speed,...,region,daily_rainfall_total_14,highest_30_min_rainfall_14,highest_60_min_rainfall_14,highest_120_min_rainfall_14,mean_temperature_14,maximum_temperature_14,minimum_temperature_14,mean_wind_speed_14,max_wind_speed_14
0,2022-10-01,22.6,18.6,21.4,21.4,26.8,31.8,24.4,6.0,27.8,...,Admiralty,,,,,,,,,
1,2022-10-02,16.4,13.2,14.2,16.0,27.1,32.6,23.0,5.8,24.4,...,Admiralty,,,,,,,,,
2,2022-10-03,44.2,18.6,33.4,39.6,26.3,31.8,23.1,5.7,52.6,...,Admiralty,,,,,,,,,
3,2022-10-04,1.4,1.0,1.2,1.2,26.2,31.2,24.5,6.3,36.3,...,Admiralty,,,,,,,,,
4,2022-10-05,49.2,17.6,29.2,44.8,26.0,28.6,22.8,7.2,33.5,...,Admiralty,,,,,,,,,


In [38]:
# Convert date columns into datetime format.
# The Google Trends dates are day-first strings (the preview shows '17/1/2016',
# '24/1/2016', '31/1/2016' at weekly intervals). pd.to_datetime defaults to
# month-first, so ambiguous values such as '3/1/2016' or '10/1/2016' would be read
# as 1 March / 1 October and would no longer line up with the weather dates in the
# merge below. dayfirst=True makes the whole column parse consistently.
trends['date'] = pd.to_datetime(trends['date'], dayfirst=True)
# Weather dates appear in ISO 'YYYY-MM-DD' form in the preview, which is unambiguous.
weather['date'] = pd.to_datetime(weather['date'])

In [54]:
# Build the combined frame: weather is the base table; Google Trends is attached
# by date and the aggregated dengue counts by (date, region). Both joins are left
# joins, so every weather row is kept and unmatched rows get NaN.
new_df = (
    weather
    .merge(trends, how='left', on=['date'])
    .merge(dengues, how='left', on=['date', 'region'])
)

In [59]:
# Preview the last five rows of the merged frame
new_df.tail(n=5)

Unnamed: 0,date,daily_rainfall_total,highest_30_min_rainfall,highest_60_min_rainfall,highest_120_min_rainfall,mean_temperature,maximum_temperature,minimum_temperature,mean_wind_speed,max_wind_speed,...,minimum_temperature_14,mean_wind_speed_14,max_wind_speed_14,aedes,dengue,fever,headache,nosebleed,vomit,recent_cases
130921,2016-01-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.060244e-15,,,,,,,
130922,2016-01-28,15.4,9.2,10.8,14.8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.060244e-15,,,,,,,
130923,2016-01-29,1.4,1.0,1.0,1.4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.060244e-15,,,,,,,
130924,2016-01-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.060244e-15,,,,,,,
130925,2016-01-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.060244e-15,23.0,85.0,96.0,52.0,31.0,56.0,32.0


In [55]:
# Summarise the merged frame: row count, and each column's non-null count and dtype
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130926 entries, 0 to 130925
Data columns (total 28 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   date                         130926 non-null  datetime64[ns]
 1   daily_rainfall_total         130926 non-null  float64       
 2   highest_30_min_rainfall      130926 non-null  float64       
 3   highest_60_min_rainfall      130926 non-null  float64       
 4   highest_120_min_rainfall     130926 non-null  float64       
 5   mean_temperature             130926 non-null  float64       
 6   maximum_temperature          130926 non-null  float64       
 7   minimum_temperature          130926 non-null  float64       
 8   mean_wind_speed              130926 non-null  float64       
 9   max_wind_speed               130926 non-null  float64       
 10  year                         130926 non-null  int64         
 11  region                    

In [15]:
# Write the merged frame to disk for the EDA and modelling notebooks.
# index=True is the default: the row index is written as the first, unnamed CSV column.
new_df.to_csv(path_or_buf='../Data/merged_data.csv', index=True)