In [1]:
# Import libraries
import pandas as pd

In [2]:
# Load the dataset
data = pd.read_csv('data/rvf_data.csv')

In [3]:
# Display basic information
data_info = data.info()
data_preview = data.head()
data_shape = data.shape
data_info, data_preview, data_shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180288 entries, 0 to 180287
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   divid              180288 non-null  int64  
 1   province           180288 non-null  object 
 2   district           180288 non-null  object 
 3   division           180288 non-null  object 
 4   Year               180288 non-null  int64  
 5   month              180288 non-null  object 
 6   rainfall           180288 non-null  float64
 7   elevation          180288 non-null  float64
 8   slope              180288 non-null  float64
 9   clay               180288 non-null  float64
 10  humidity           180288 non-null  float64
 11  Rift Valley Cases  180288 non-null  int64  
dtypes: float64(5), int64(3), object(4)
memory usage: 16.5+ MB


(None,
    divid province district division  Year      month  rainfall    elevation  \
 0  30402    COAST  MOMBASA  KISAUNI  1981   November -0.424976    19.250099   
 1  30402    COAST  MOMBASA  KISAUNI  1981     August  2.004479    19.250099   
 2  40415  EASTERN  MAKUENI  KIBWEZI  1981  September -0.365544   795.179020   
 3  40415  EASTERN  MAKUENI  KIBWEZI  1981     August  0.162332   795.179020   
 4  40410  EASTERN  MAKUENI     WOTE  1981    October  0.186819  1106.939900   
 
        slope       clay   humidity  Rift Valley Cases  
 0  89.911404  25.494737  74.000000                  1  
 1  89.911404  25.494737  74.000000                  1  
 2  89.959977  28.947223  60.012926                  1  
 3  89.959977  28.947223  60.012926                  1  
 4  89.964235  29.743371  60.762669                  1  ,
 (180288, 12))

# 1. Data Cleaning

In [5]:
# Check for duplicates
duplicates = data.duplicated().sum()
duplicates

0

In [6]:
# Dropping negative readings of rainfall & humidity and non-rift valley fever
data = (data[(data['rainfall'] >= 0) & 
        (data['humidity'] > 0) & 
        (data['Rift Valley Cases'] >= 0)])
data.shape

(68312, 12)

# 2. Aggregation

In [8]:
# Group by province, year, and month
aggregated_data = data.groupby(
    [
        'province',
        'district',
        'division',
        'Year',
        'month'
    ]
).agg(
    avg_rainfall = ('rainfall', 'mean'),
    avg_humidity = ('humidity', 'mean'),
    total_cases = ('Rift Valley Cases', 'sum')
).reset_index()

In [9]:
# Display the aggregated data
print("\nAggregated Data:")
print(aggregated_data.head())


Aggregated Data:
  province district    division  Year   month  avg_rainfall  avg_humidity  \
0  CENTRAL   KIAMBU  GITHUNGURI  1981   April      1.593538     68.860373   
1  CENTRAL   KIAMBU  GITHUNGURI  1981  August      0.137449     68.860373   
2  CENTRAL   KIAMBU  GITHUNGURI  1981    July      0.557422     68.860373   
3  CENTRAL   KIAMBU  GITHUNGURI  1981   March      1.465576     68.860373   
4  CENTRAL   KIAMBU  GITHUNGURI  1981     May      0.586008     68.860373   

   total_cases  
0            0  
1            0  
2            0  
3            0  
4            0  


In [10]:
# Shape after aggregation
aggregated_data.head()

Unnamed: 0,province,district,division,Year,month,avg_rainfall,avg_humidity,total_cases
0,CENTRAL,KIAMBU,GITHUNGURI,1981,April,1.593538,68.860373,0
1,CENTRAL,KIAMBU,GITHUNGURI,1981,August,0.137449,68.860373,0
2,CENTRAL,KIAMBU,GITHUNGURI,1981,July,0.557422,68.860373,0
3,CENTRAL,KIAMBU,GITHUNGURI,1981,March,1.465576,68.860373,0
4,CENTRAL,KIAMBU,GITHUNGURI,1981,May,0.586008,68.860373,0


In [12]:
# Save the Processed Data
output_file_path = 'processed_rvf_data.csv'
aggregated_data.to_csv(output_file_path, index=False)