# In [1]:
import pandas as pd
from datetime import datetime

# 1. Read the "old" and "new" CSV exports of both datasets
crime_old = pd.read_csv('university_of_iowa_city_crime_data.csv')
crime_new = pd.read_csv('uiowa_city_crime_data_new.csv')
weather_old = pd.read_csv('cedarrapids_weather.csv')
weather_new = pd.read_csv('cedarrapids_weather_new.csv')

# 2. Stack old + new rows per dataset, then keep the first row for each key
#    (case number for crimes, raw Date/Time pair for weather)
crime_df = pd.concat((crime_old, crime_new), ignore_index=True)
weather_df = pd.concat((weather_old, weather_new), ignore_index=True)

crime_df.drop_duplicates(subset=['Case Number'], inplace=True)
weather_df.drop_duplicates(subset=['Date', 'Time'], inplace=True)

# 3. Persist the combined, de-duplicated tables (optional convenience output;
#    written before any column is reformatted below)
crime_df.to_csv('uiowa_city_crime_data_full.csv', index=False)
weather_df.to_csv('cedarrapids_weather_full.csv', index=False)

# 4. Parse the report timestamp once, then derive a midnight-normalised
#    'Date' column and an 'HH:MM' text 'Time' column from it
reported_at = pd.to_datetime(
    crime_df['Date/Time Reported'],
    format='%m/%d/%Y %H:%M',
)
crime_df['Date/Time Reported'] = reported_at
crime_df['Date'] = pd.to_datetime(reported_at.dt.date)
crime_df['Time'] = reported_at.dt.strftime('%H:%M')

weather_df['Date'] = pd.to_datetime(weather_df['Date'], format='%Y-%m-%d')
# 5. Assign time buckets to crime data
def assign_time_bucket(time_str):
    """Map a time-of-day value to a coarse part-of-day label.

    Args:
        time_str: A scalar that ``pd.to_datetime`` can parse, such as
            ``'14:30'``. Missing values (``None``, NaN, ``NaT``) are accepted.

    Returns:
        ``'Night'`` for hours 0-5, ``'Morning'`` for 6-11, ``'Afternoon'``
        for 12-17 and ``'Evening'`` for 18-23. ``None`` when the value is
        missing or parses to ``NaT``.
    """
    # A missing time must not be bucketed: NaT.hour is NaN, every comparison
    # with NaN is False, and the row would silently fall through to 'Evening'.
    if pd.isna(time_str):
        return None

    parsed = pd.to_datetime(time_str)
    if pd.isna(parsed):
        return None

    hour = parsed.hour
    if hour < 6:
        return 'Night'
    if hour < 12:
        return 'Morning'
    if hour < 18:
        return 'Afternoon'
    return 'Evening'

# Label every crime with the part of day its 'HH:MM' time falls into
crime_df['Time Bucket'] = crime_df['Time'].map(assign_time_bucket)

# 6. Build one full timestamp per row in both tables by gluing the date text
#    and the time text together and parsing the result
crime_stamp_text = crime_df['Date'].astype(str) + ' ' + crime_df['Time']
crime_df['DateTime'] = pd.to_datetime(crime_stamp_text)

weather_stamp_text = weather_df['Date'].astype(str) + ' ' + weather_df['Time']
weather_df['DateTime'] = pd.to_datetime(weather_stamp_text)

# 7. Match each crime with closest weather timestamp
def find_nearest_time(crime_time, weather_times):
    """Return the entry of *weather_times* closest in time to *crime_time*.

    Args:
        crime_time: The timestamp to match (a ``pd.Timestamp`` or ``NaT``).
        weather_times: Series of candidate timestamps; its index may be
            anything (gaps, arbitrary order, non-integer labels).

    Returns:
        The element of ``weather_times`` with the smallest absolute
        difference to ``crime_time``, or ``pd.NaT`` when there is nothing to
        compare against (empty series, all-NaT series, or NaT ``crime_time``).
    """
    # Work on positions, not labels. ``Series.argsort()`` keeps the original
    # index, so ``argsort()[0]`` looks up the *label* 0 and only equals "the
    # smallest element" when label 0 happens to sit in the first row.
    gaps = (weather_times - crime_time).abs().reset_index(drop=True).dropna()
    if gaps.empty:
        return pd.NaT
    # After reset_index the surviving labels are the original positions.
    return weather_times.iloc[gaps.idxmin()]

# Attach the nearest weather observation to every crime.
#
# Matching is done on the full timestamp with ``pd.merge_asof`` instead of on
# the pair (crime date, nearest 'HH:MM'). When the closest observation lies on
# the other side of midnight, its 'HH:MM' does not exist under the crime's own
# date, so the date+time key found no row and left the weather columns empty.
# merge_asof also avoids scanning every weather timestamp once per crime.

# Normalise the weather 'Time' text to 'HH:MM' (kept as a side effect on
# weather_df so the frame looks the same to any later code).
weather_df['Time'] = weather_df['DateTime'].dt.strftime('%H:%M')

# merge_asof needs a sorted, null-free key. Date/Time are dropped and DateTime
# is renamed so that none of them collides with the crime columns of the same
# name (this replaces the former Time_x/Time_y drop-and-rename cleanup).
weather_lookup = (
    weather_df
    .dropna(subset=['DateTime'])
    .drop_duplicates(subset=['DateTime'])
    .sort_values('DateTime')
    .drop(columns=['Date', 'Time'])
    .rename(columns={'DateTime': 'Weather DateTime'})
)

# Carry each crime's row position through the sort so the original order can
# be restored; crimes without a timestamp are left out and stay unmatched.
crime_keys = (
    crime_df[['DateTime']]
    .reset_index(drop=True)
    .rename_axis('_crime_row')
    .reset_index()
    .dropna(subset=['DateTime'])
    .sort_values('DateTime')
)

# The date+time key could only ever match an observation on the crime's own
# date, i.e. less than a day away. Keeping a one-day bound preserves every
# match made before, adds the cross-midnight ones, and still leaves crimes
# outside the weather coverage empty instead of borrowing a distant reading.
nearest_weather = (
    pd.merge_asof(
        crime_keys,
        weather_lookup,
        left_on='DateTime',
        right_on='Weather DateTime',
        direction='nearest',
        tolerance=pd.Timedelta(days=1),
    )
    .drop(columns=['DateTime'])
    .set_index('_crime_row')
    .reindex(range(len(crime_df)))
)

# Same 'HH:MM' text column as before, assigned by position because crime_df
# keeps its post-deduplication index labels.
crime_df['Nearest Weather Time'] = (
    nearest_weather['Weather DateTime'].dt.strftime('%H:%M').to_numpy()
)

# Crime columns first, weather readings after; any other shared column name
# gets the same _x/_y suffixes DataFrame.merge would have produced.
merged_df = crime_df.reset_index(drop=True).join(
    nearest_weather.drop(columns=['Weather DateTime']),
    lsuffix='_x',
    rsuffix='_y',
)

# 10. Convert the weather readings to floats.
#
# Readings are presumably text with a unit suffix (the original code used the
# ``.str`` accessor on them) -- TODO confirm against the raw CSVs.
# The pattern keeps an optional leading minus sign: the previous
# '([0-9.]+)' pattern discarded it, turning sub-zero temperatures and dew
# points into positive numbers. It accepts '12', '12.5', '12.' and '.5'.
_READING_PATTERN = r'(-?(?:\d+\.?\d*|\.\d+))'

for col in ['Temperature', 'Dew Point', 'Humidity', 'Wind Speed', 'Wind Gust', 'Pressure', 'Precip.']:
    if pd.api.types.is_numeric_dtype(merged_df[col]):
        # Already numeric (e.g. a column with no unit suffix, or an all-empty
        # column after the left merge): ``.str`` would raise here.
        merged_df[col] = merged_df[col].astype(float)
        continue

    # astype(str) makes mixed text/NaN columns safe for the .str accessor;
    # anything without a number in it (including 'nan') becomes NaN.
    reading_text = merged_df[col].astype(str).str.extract(_READING_PATTERN, expand=False)
    merged_df[col] = pd.to_numeric(reading_text, errors='coerce')

# 11. Export final merged dataset
merged_df.to_csv('merged_data_new.csv', index=False)
