# **Integration**

(For code descriptions only -- not intended to be run.)

## Imports

In [1]:
import pandas as pd
import hashlib
import requests
import os 
from io import StringIO

## Dataframe Creation

In [None]:
## loads each raw Iowa file into its own dataframe (df1, df2, df3)
iowa_paths = (
    '../data/raw/iowa1.csv',
    '../data/raw/iowa2.csv',
    '../data/raw/iowa3.csv',
)
df1, df2, df3 = map(pd.read_csv, iowa_paths)

## stacks the three Iowa dataframes into a single dataframe, renumbering
## the rows from 0 instead of keeping each file's own index
iowa_frames = [df1, df2, df3]
df_accidents = pd.concat(iowa_frames, ignore_index=True)

## loads the raw NCEI file into its own dataframe
df_ncei = pd.read_csv('../data/raw/ncei.csv')

  df1 = pd.read_csv('../../data/raw/iowa1.csv')


## Dataframe Modification

In [None]:
## parses `date_of_crash` as datetimes and stores the calendar year of each
## crash in a new `year` column
date_time = pd.to_datetime(df_accidents['date_of_crash'])
df_accidents['year'] = date_time.dt.year

## filters `df_accidents` to columns of interest; `.copy()` makes the result an
## independent dataframe so the column assignments below do not write into a
## selection of the original (avoids pandas' SettingWithCopyWarning)
df_accidents = df_accidents[[
    'year', 'enviro_conditions', 'surface_conditions', 'weather_conditions',
    'crash_severity', 'fatalities', 'injuries', 'majinjury', 'mininjury',
    'possinjury', 'unkinjury', 'propdmg'
]].copy()

## replaces missing (NaN) values in the condition columns with the label `Unknown`
df_accidents['enviro_conditions'] = df_accidents['enviro_conditions'].fillna('Unknown')
df_accidents['surface_conditions'] = df_accidents['surface_conditions'].fillna('Unknown')

## filters `df_accidents` to observations of interest (wet weather conditions):
## the weather must be one of the listed precipitation types AND the surface
## must be `Wet` or `Unknown` (`Unknown` includes the NaN values filled above)
wet_weather = df_accidents['weather_conditions'].isin(
    ['Rain', 'Sleet, hail', 'Freezing rain/drizzle']
)
wet_or_unknown_surface = df_accidents['surface_conditions'].isin(['Wet', 'Unknown'])
df_accidents_filtered = df_accidents[wet_weather & wet_or_unknown_surface]

## derives an integer `year` column from the first four characters of `Date`,
## renames `Value` to `precipitation`, and keeps only those two columns;
## `.assign` builds a new dataframe instead of assigning into a selection
df_ncei = (
    df_ncei
    .assign(year=df_ncei['Date'].astype(str).str[:4].astype(int))
    .rename(columns={'Value': 'precipitation'})
    [['year', 'precipitation']]
)

## keeps only the years from 2014 onward
df_ncei = df_ncei[df_ncei['year'] >= 2014]

## Data Integration

In [None]:
## creates processed dataframe and exports it as `../data/processed/integrated.csv`

## number of filtered crashes per year. Grouping (instead of counting runs of
## consecutive rows with the same year) gives exactly one row per year even
## when the rows are not contiguous by year, and an empty dataframe (rather
## than a `None` year with count 0) when no rows passed the filter.
## `sort=False` keeps the years in order of first appearance.
yearly_counts_df = (
    df_accidents_filtered
    .groupby('year', sort=False)
    .size()
    .reset_index(name='count')
)

## yearly totals of the numeric outcome columns only; the text condition
## columns are left out because summing them is meaningless and they are not
## part of the exported dataframe
summed_columns = [
    'fatalities', 'injuries', 'majinjury', 'mininjury',
    'possinjury', 'unkinjury', 'propdmg'
]
df_grouped = (
    df_accidents_filtered
    .groupby('year')[summed_columns]
    .sum()
    .reset_index()
)

## inner joins on `year`: only years present in the NCEI data, the yearly
## totals, and the yearly counts end up in the result
df_merged = pd.merge(df_ncei, df_grouped, on='year', how='inner')
df_merged = pd.merge(yearly_counts_df, df_merged, on='year', how='inner')

## fixes the column order of the exported file
df_merged = df_merged[['year', 'precipitation', 'count', 'fatalities', 'injuries', 'majinjury', 'mininjury', 'possinjury', 'unkinjury', 'propdmg']]

## writes the result without the dataframe index
df_merged.to_csv('../data/processed/integrated.csv', index=False)