In [1]:
# Import needed modules
import pandas as pd

# Import the files
weather_df = pd.read_csv("Resources/Canton_Ohio_Weather.csv")
cfs_df = pd.read_excel("Resources/cfs_data_Canton.xlsx")

In [2]:
# Evaluate the weather data - ds_iso is the utc datetime column
weather_df.dtypes

dt                       int64
dt_iso                  object
timezone                 int64
city_name               object
lat                    float64
lon                    float64
temp                   float64
visibility             float64
dew_point              float64
feels_like             float64
temp_min               float64
temp_max               float64
pressure                 int64
sea_level              float64
grnd_level             float64
humidity                 int64
wind_speed             float64
wind_deg                 int64
wind_gust              float64
rain_1h                float64
rain_3h                float64
snow_1h                float64
snow_3h                float64
clouds_all               int64
weather_id               int64
weather_main            object
weather_description     object
weather_icon            object
dtype: object

In [3]:
# Convert dt_iso into datetime format and convert to America/New_York timezone
weather_df['dt_iso'] = pd.to_datetime(weather_df['dt_iso'], format='%Y-%m-%d %H:%M:%S %z UTC')
weather_df['dt_iso'] = weather_df['dt_iso'].dt.tz_convert("America/New_York")
weather_df['dt_iso'].head(3)

0   2015-01-01 00:00:00+00:00
1   2015-01-01 01:00:00+00:00
2   2015-01-01 02:00:00+00:00
Name: dt_iso, dtype: datetime64[ns, UTC]

In [5]:
# Create column to merge weather and cfs data on.  Since weather data has
# by the hour with minutes and seconds 00:00, that will need to be the same
# for CFS data.
weather_df['relation'] = weather_df['dt_iso'].dt.strftime("%Y-%m-%d %H:%M:%S")
weather_df['relation'].head(3)

0    2014-12-31 19:00:00
1    2014-12-31 20:00:00
2    2014-12-31 21:00:00
Name: relation, dtype: object

In [6]:
cfs_df.dtypes

CallID                     int64
Department                object
CallType                  object
CreateDatetime    datetime64[ns]
GeoFlag                     bool
AgencyType                 int64
dtype: object

In [8]:
# Create column within CFS to merge weather with.  This returns 00:00 for 
# minutes and seconds to match weather format.
cfs_df['relation'] = cfs_df['CreateDatetime'].dt.strftime("%Y-%m-%d %H:00:00")

In [9]:
# Merge the dataframes together on relation table with weather on the out
merged_df = pd.merge(cfs_df, weather_df, on="relation", how="outer")
merged_df.head(3)

Unnamed: 0,CallID,Department,CallType,CreateDatetime,GeoFlag,AgencyType,relation,dt,dt_iso,timezone,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,6140391.0,Canton Police Department,Disturbance,2023-12-22 21:09:34.750,False,1.0,2023-12-22 21:00:00,,NaT,,...,,,,,,,,,,
1,6140386.0,Canton Police Department,911 Hangup,2023-12-22 21:04:17.120,False,1.0,2023-12-22 21:00:00,,NaT,,...,,,,,,,,,,
2,6140368.0,Canton Police Department,911 Hangup,2023-12-22 20:44:22.477,False,1.0,2023-12-22 20:00:00,,NaT,,...,,,,,,,,,,


In [11]:
# Create crime_data_df to only use needed columns
crime_data_df = merged_df[['CallID','CallType','CreateDatetime','relation','temp','dew_point','feels_like','temp_min','temp_max','pressure','humidity','wind_speed','weather_main','weather_description']]
crime_data_df.columns

Index(['CallID', 'CallType', 'CreateDatetime', 'relation', 'temp', 'dew_point',
       'feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity',
       'wind_speed', 'weather_main', 'weather_description'],
      dtype='object')

In [13]:
# Dropna if CallID is NA.  We will need to fill or drop weather data due to
# the time of collection being before 2024
crime_data_df = crime_data_df.dropna(subset=['CallID'], axis='rows')

In [14]:
crime_data_df.shape

(355053, 14)

In [15]:
merged_df.shape

(392601, 35)

In [16]:
crime_data_df.head()

Unnamed: 0,CallID,CallType,CreateDatetime,relation,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,weather_main,weather_description
0,6140391.0,Disturbance,2023-12-22 21:09:34.750,2023-12-22 21:00:00,,,,,,,,,,
1,6140386.0,911 Hangup,2023-12-22 21:04:17.120,2023-12-22 21:00:00,,,,,,,,,,
2,6140368.0,911 Hangup,2023-12-22 20:44:22.477,2023-12-22 20:00:00,,,,,,,,,,
3,6140364.0,911 Hangup,2023-12-22 20:41:46.953,2023-12-22 20:00:00,,,,,,,,,,
4,6140355.0,Theft,2023-12-22 20:27:56.443,2023-12-22 20:00:00,,,,,,,,,,


In [17]:
# Exported combined data for preservation
# crime_data_df.to_excel("combined_data.xlsx",index=False, header=True)