# Calculating Distances Between Brazilian Establishments and Natural Disasters

## Overview
This notebook calculates the distances between Brazilian establishments (from `geocoded_data_updt.csv`) and natural disasters in Brazil between 2000 and 2018 (from `corrected_disaster_data.xlsx`). Distances are calculated with the Haversine formula, which accounts for the curvature of the Earth, for every establishment-disaster pair that falls in the same year.

Note that the Haversine formula assumes that the Earth is a perfect sphere, which is not entirely accurate. However, for most practical purposes, this approximation is sufficient.

## Output

This notebook will output a single CSV file containing a single row for each establishment-disaster pair within the same year. The output file is an **inner join** of the two input files, meaning that only disaster-establishment pairs that happen in the same year will be included.
The columns of the output file are as follows:
- `est_id`: The ID of the establishment. This comes from the `cnpj_cei` column in the `geocoded_data_updt.csv` file. Note that there are some NAs in this column.
- `disaster_id`: The ID of the disaster. This is built from the `geo_id` and `year` columns in the `corrected_disaster_data.xlsx` file. The format is `geo_id_year`.
- `year`: The year of the disaster and the establishment.
- `disaster_type`: The type of disaster. This comes from the `disastertype_x` column in the `corrected_disaster_data.xlsx` file; only floods, storms, landslides and earthquakes are kept.
- `lat_est`: The latitude of the establishment.
- `lng_est`: The longitude of the establishment.
- `lat_disaster`: The latitude of the disaster.
- `lng_disaster`: The longitude of the disaster.
- `distance_km`: The distance between the establishment and the disaster in kilometers. This is calculated using the Haversine formula.
- `total_deaths`: The total number of deaths attributed to the disaster. This comes from the `Total Deaths` column and may be missing.
- `total_damage`: The total damage caused by the disaster, in thousands of US dollars. This comes from the `Total Damage ('000 US$)` column and may be missing.



In [33]:
# 1. Define the input file paths (no encoding detection is performed here; `chardet` is imported but unused)
import chardet
import os

# Input locations. Each one can be overridden with an environment variable so the
# notebook can run on a machine other than the author's; when the variable is not
# set, the original hard-coded location is used unchanged.
brazilian_shocks_path = os.environ.get(
    "BRAZIL_SHOCKS_PATH",
    "/Users/koacow/BOSTON UNIVERSITY Dropbox/Ngoc Duy Khoa Cao/GLOB~S/Data/Natural Disasters/corrected_disaster_data.xlsx",
)
brazilian_est_path = os.environ.get(
    "BRAZIL_EST_PATH",
    os.path.join(os.getcwd(), 'geocoded_data/geocoded_data_updt.csv'),
)


In [34]:
# 2.1 Load the data with only the necessary columns
import pandas as pd
import numpy as np

# Read only the columns that are used downstream.
shocks_raw = pd.read_excel(
    brazilian_shocks_path,
    usecols=['year', 'geo_id', 'iso3', 'latitude', 'longitude', 'disastertype_x', 'Total Deaths', "Total Damage ('000 US$)"],
)
# NOTE(review): the printed head shows `cnpj_cei` parsed as float64, so leading zeros
# (if the IDs have any) are not preserved -- confirm whether it should be read as text.
est_raw = pd.read_csv(brazilian_est_path, usecols=['cnpj_cei', 'year', 'lat', 'lng'])

# Keep Brazilian floods, storms, landslides and earthquakes from 2000 through 2018.
keep_shock = (
    (shocks_raw['iso3'] == 'BRA')
    & shocks_raw['year'].isin(range(2000, 2019))
    & shocks_raw['disastertype_x'].isin(['flood', 'storm', 'landslide', 'earthquake'])
)
# .copy() makes the filtered frame independent of shocks_raw, so the column
# assignments below do not raise SettingWithCopyWarning.
shocks_df = shocks_raw[keep_shock].copy()

# 2.2 Rename columns for consistency
shocks_df['geo_id'] = shocks_df['geo_id'].astype(str)
# disaster_id has the form "<geo_id>_<year>"
shocks_df['disaster_id'] = shocks_df['geo_id'] + '_' + shocks_df['year'].astype(str)
shocks_df_cols = {
    'year': 'year',
    'disaster_id': 'disaster_id',
    'latitude': 'lat',
    'longitude': 'lng',
    'disastertype_x': 'disaster_type',
    'Total Deaths': 'total_deaths',
    "Total Damage ('000 US$)": 'total_damage'
}

est_df_cols = {
    'cnpj_cei': 'est_id',
    'year': 'year',
    'lat': 'lat',
    'lng': 'lng'
}

# Select and rename in one non-inplace step; each result is a fresh frame, so
# re-running this cell always starts from the raw inputs.
shocks_df = shocks_df[list(shocks_df_cols)].rename(columns=shocks_df_cols)
est_df = est_raw[list(est_df_cols)].rename(columns=est_df_cols)

print(shocks_df.shape)
print(shocks_df.head())
print(est_df.shape)
print(est_df.head())




(287, 7)
      year disaster_id        lat        lng disaster_type  total_deaths  \
66    2013   3036_2013 -22.587911 -43.326527         flood           4.0   
67    2013   3037_2013 -22.587911 -43.326527         flood          30.0   
2227  2000   2711_2000 -22.507438 -44.188600         flood          26.0   
2228  2000   2915_2000 -22.441788 -44.490386         flood          26.0   
2229  2000   3035_2000 -22.490089 -44.087939         flood          26.0   

      total_damage  
66          2000.0  
67          1500.0  
2227           NaN  
2228           NaN  
2229           NaN  
(549082, 4)
         est_id  year        lat        lng
0  2.460658e+12  2003  -9.852406 -63.060539
1  8.464388e+13  2003  -9.903970 -63.035419
2  3.477327e+13  2003  -9.920243 -63.046216
3  8.462377e+13  2003 -10.083945 -63.217735
4  2.286109e+13  2003  -9.936345 -63.013974


In [35]:
# 3.1 Check for missing values
def check_missing_values(df):
    """
    Count missing values per column of `df`.

    Returns a Series indexed by column name holding the NA count of every column
    that has at least one missing value, or None when no column has any.
    """
    na_counts = df.isna().sum()
    columns_with_na = na_counts[na_counts > 0]
    if columns_with_na.empty:
        return None
    return columns_with_na

# Per-column NA counts for each input (None means nothing is missing)
missing_vals_shocks = check_missing_values(shocks_df)
print(f"Missing values in shocks_df: {missing_vals_shocks}")
missing_vals_est = check_missing_values(est_df)
print(f"Missing values in est_df: {missing_vals_est}")

# 3.2 Check for duplicates

# Count the rows that repeat an earlier row's key: disaster_id for shocks,
# the (est_id, year) pair for establishments.
shock_key_repeats = shocks_df['disaster_id'].duplicated()
est_key_repeats = est_df[['est_id', 'year']].duplicated()
duplicates_shocks = shock_key_repeats.sum()
duplicates_est = est_key_repeats.sum()
print(f"Number of rows in shocks_df with same (disaster_id): {duplicates_shocks}")
print(f"Number of rows in est_df with same (est_id, year): {duplicates_est}")

Missing values in shocks_df: total_deaths      9
total_damage    148
dtype: int64
Missing values in est_df: est_id    7
dtype: int64
Number of rows in shocks_df with same (disaster_id): 0
Number of rows in est_df with same (est_id, year): 0


In [36]:
# 4.1 Merge the two dataframes on year
# Inner join: every disaster is paired with every establishment observed in the
# same year. The shared lat/lng columns get the _disaster / _est suffixes.
merged_df = shocks_df.merge(
    est_df,
    how='inner',
    on='year',
    suffixes=('_disaster', '_est'),
)
n_rows, n_cols = merged_df.shape
print("Merged DataFrame shape:")
print("Number of rows:", n_rows)
print("Number of columns:", n_cols)
print(merged_df.head())


Merged DataFrame shape:
Number of rows: 8999218
Number of columns: 10
   year disaster_id  lat_disaster  lng_disaster disaster_type  total_deaths  \
0  2013   3036_2013    -22.587911    -43.326527         flood           4.0   
1  2013   3036_2013    -22.587911    -43.326527         flood           4.0   
2  2013   3036_2013    -22.587911    -43.326527         flood           4.0   
3  2013   3036_2013    -22.587911    -43.326527         flood           4.0   
4  2013   3036_2013    -22.587911    -43.326527         flood           4.0   

   total_damage        est_id    lat_est    lng_est  
0        2000.0  2.916265e+12 -10.033735 -62.977755  
1        2000.0  8.461554e+13  -9.908812 -63.035547  
2        2000.0  6.900697e+12  -9.903959 -63.034621  
3        2000.0  3.780605e+12  -9.898254 -63.034756  
4        2000.0  4.082624e+12  -9.898254 -63.034756  


In [37]:
from haversine import haversine

def calculate_distance(row: pd.Series) -> float:
    """
    Calculate the Haversine distance between a disaster and an establishment.

    Parameters
    ----------
    row : pd.Series
        One row of the merged frame. It must provide `lat_disaster`,
        `lng_disaster`, `lat_est` and `lng_est`.

    Returns
    -------
    float
        The distance between the two points in kilometers.
    """
    # haversine() takes each point as a (latitude, longitude) tuple
    disaster_point = (row['lat_disaster'], row['lng_disaster'])
    est_point = (row['lat_est'], row['lng_est'])

    return haversine(disaster_point, est_point, unit='km')

# 5. Calculate the distance between each establishment-disaster pair
# The merged frame has millions of rows, so the Haversine formula is evaluated on
# whole columns with numpy instead of calling a Python function once per row.

# Mean Earth radius in km; this is the value the `haversine` package uses by default,
# which keeps the results consistent with calculate_distance().
_EARTH_RADIUS_KM = 6371.0088


def _haversine_km(lat1, lon1, lat2, lon2):
    """Vectorised Haversine distance in kilometers between points given in decimal degrees."""
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(values, dtype='float64'))
        for values in (lat1, lon1, lat2, lon2)
    )
    a = (
        np.sin((lat2 - lat1) * 0.5) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) * 0.5) ** 2
    )
    # Rounding can push `a` marginally outside [0, 1]; clip so sqrt/arcsin stay defined.
    # NaN coordinates propagate to a NaN distance.
    return 2 * _EARTH_RADIUS_KM * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))


merged_df['distance_km'] = _haversine_km(
    merged_df['lat_disaster'],
    merged_df['lng_disaster'],
    merged_df['lat_est'],
    merged_df['lng_est'],
)
print(merged_df.head())


   year disaster_id  lat_disaster  lng_disaster disaster_type  total_deaths  \
0  2013   3036_2013    -22.587911    -43.326527         flood           4.0   
1  2013   3036_2013    -22.587911    -43.326527         flood           4.0   
2  2013   3036_2013    -22.587911    -43.326527         flood           4.0   
3  2013   3036_2013    -22.587911    -43.326527         flood           4.0   
4  2013   3036_2013    -22.587911    -43.326527         flood           4.0   

   total_damage        est_id    lat_est    lng_est  distance_km  
0        2000.0  2.916265e+12 -10.033735 -62.977755  2514.201261  
1        2000.0  8.461554e+13  -9.908812 -63.035547  2527.515565  
2        2000.0  6.900697e+12  -9.903959 -63.034621  2527.753442  
3        2000.0  3.780605e+12  -9.898254 -63.034756  2528.141185  
4        2000.0  4.082624e+12  -9.898254 -63.034756  2528.141185  


In [38]:
# 6. Check for duplicates in the merged DataFrame
# Each (est_id, disaster_id) pair is expected to appear once; count the rows
# that repeat an earlier pair.
pair_repeats = merged_df[['est_id', 'disaster_id']].duplicated()
duplicates_merged = pair_repeats.sum()
print(f"Number of rows in merged_df with same (est_id, disaster_id): {duplicates_merged}")

Number of rows in merged_df with same (est_id, disaster_id): 0


In [39]:
# 7.1 Keep only the relevant columns
columns_to_keep = ['year', 'disaster_id', 'est_id', 'lat_disaster', 'lng_disaster', 'lat_est', 'lng_est', 'disaster_type', 'distance_km', 'total_deaths', 'total_damage']
# Convert est_id BEFORE sub-selecting columns: assigning into the sub-selected frame
# would raise SettingWithCopyWarning, whereas merged_df itself is a regular frame.
# NOTE(review): est_id is float64 at this point (see the printed head), so astype(str)
# writes IDs with a trailing '.0' and missing IDs as 'nan' -- confirm that downstream
# consumers expect this format.
merged_df['est_id'] = merged_df['est_id'].astype(str)
merged_df = merged_df[columns_to_keep]

# 7.2 Write the merged DataFrame to a CSV file
output_path = os.path.join(os.getcwd(), 'geocoded_data/brazil_est_shock_distances.csv')
merged_df.to_csv(output_path, index=False)