In [1]:
import pandas as pd
import datetime as dt
import seaborn as sns

In [2]:
# Raw CSV locations (served from raw.githubusercontent.com).
WEATHER_CSV_URL = "https://raw.githubusercontent.com/kaberry2/DSCI689/main/datasets/muncie_weather.csv"
ENTRIES_CSV_URL = "https://raw.githubusercontent.com/kaberry2/DSCI689/main/datasets/entries.csv"

# Load both tables straight from the URLs; nothing is cached locally.
weather_df = pd.read_csv(WEATHER_CSV_URL)
entries_df = pd.read_csv(ENTRIES_CSV_URL)

In [3]:
# Size check: (rows, columns) of the raw weather table.
weather_df.shape


(8929, 28)

In [4]:
# Preview the first rows to inspect the raw column layout and value scales.
weather_df.head()

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1609459200,2021-01-01 00:00:00 +0000 UTC,-18000,Muncie,40.193377,-85.38636,269.33,10000.0,267.15,265.49,...,0.0,,,,,75,803,Clouds,broken clouds,04n
1,1609462800,2021-01-01 01:00:00 +0000 UTC,-18000,Muncie,40.193377,-85.38636,269.29,10000.0,266.97,265.44,...,0.0,,,,,1,800,Clear,sky is clear,01n
2,1609466400,2021-01-01 02:00:00 +0000 UTC,-18000,Muncie,40.193377,-85.38636,268.82,10000.0,266.79,264.88,...,0.0,,,,,1,800,Clear,sky is clear,01n
3,1609470000,2021-01-01 03:00:00 +0000 UTC,-18000,Muncie,40.193377,-85.38636,268.41,10000.0,265.96,263.85,...,0.0,,,,,1,800,Clear,sky is clear,01n
4,1609473600,2021-01-01 04:00:00 +0000 UTC,-18000,Muncie,40.193377,-85.38636,268.84,10000.0,266.81,265.53,...,0.0,,,,,1,800,Clear,sky is clear,01n


In [5]:
# Size check: (rows, columns) of the entries table.
entries_df.shape

(458791, 10)

## Cleaning / Grouping Dates

In [6]:
# Build time columns from `dt`, which is parsed here as Unix-epoch seconds.
# NOTE(review): `dt_iso` shows "+0000 UTC" and the file carries a separate
# `timezone` offset column, so 'hour' and 'date' below appear to be UTC-based,
# not local time — confirm this is what the daily grouping should use.
timestamps = pd.to_datetime(weather_df['dt'], unit='s')
weather_df['timestamp'] = timestamps
weather_df['hour'] = timestamps.dt.hour
weather_df['date'] = timestamps.dt.date

# Show only the two derived columns as a sanity check.
weather_df[['hour', 'date']]


Unnamed: 0,hour,date
0,0,2021-01-01
1,1,2021-01-01
2,2,2021-01-01
3,3,2021-01-01
4,4,2021-01-01
...,...,...
8924,19,2021-12-31
8925,20,2021-12-31
8926,21,2021-12-31
8927,22,2021-12-31


In [7]:
# List all columns; 'timestamp', 'hour' and 'date' should now appear at the end.
weather_df.columns

Index(['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'temp',
       'visibility', 'dew_point', 'feels_like', 'temp_min', 'temp_max',
       'pressure', 'sea_level', 'grnd_level', 'humidity', 'wind_speed',
       'wind_deg', 'wind_gust', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h',
       'clouds_all', 'weather_id', 'weather_main', 'weather_description',
       'weather_icon', 'timestamp', 'hour', 'date'],
      dtype='object')

## Converting Kelvin / Other SI Units to Imperial Units (boo)

In [8]:
# Convert the metric readings in `weather_df` to US customary units, in place:
#   temp        Kelvin       -> degrees Fahrenheit
#   rain_*/snow_*  millimetres -> inches
#   wind_speed  metres/sec   -> miles/hour
#
# NOTE(review): the column layout matches OpenWeather's bulk history export,
# which reports precipitation in millimetres — confirm against the data source.
# Under that assumption the divisor must be 25.4 (mm per inch); the previous
# 2.54 is the cm-per-inch factor and made every precipitation value 10x too big.
MM_PER_INCH = 25.4
MPS_TO_MPH = 2.237
PRECIP_COLUMNS = ['rain_1h', 'rain_3h', 'snow_1h', 'snow_3h']

# This cell overwrites its own inputs, so running it twice would convert the
# already-converted values again. A flag stored on the frame makes a re-run a no-op;
# reloading the CSV creates a fresh frame without the flag.
if not weather_df.attrs.get('imperial_units', False):
    weather_df['temp'] = (weather_df['temp'] - 273.15) * 1.8 + 32
    for precip_col in PRECIP_COLUMNS:
        weather_df[precip_col] = weather_df[precip_col] / MM_PER_INCH
    weather_df['wind_speed'] = weather_df['wind_speed'] * MPS_TO_MPH
    weather_df.attrs['imperial_units'] = True


## Grouping / Aggregating based on Day

In [9]:
# Collapse the weather rows to one row per 'date'.
# Named aggregation yields the flat output column names directly, so no
# separate column-renaming step is needed. Temperature gets mean/min/max,
# humidity and wind speed get the mean, and each precipitation column gets the
# median (pandas skips NaN values when computing these statistics).
daily_groups = weather_df.groupby('date')

aggregated_weather_df = daily_groups.agg(
    temp_mean=('temp', 'mean'),
    temp_min=('temp', 'min'),
    temp_max=('temp', 'max'),
    humidity_mean=('humidity', 'mean'),
    wind_speed_mean=('wind_speed', 'mean'),
    rain_1h_median=('rain_1h', 'median'),
    rain_3h_median=('rain_3h', 'median'),
    snow_1h_median=('snow_1h', 'median'),
    snow_3h_median=('snow_3h', 'median'),
).reset_index()  # move 'date' from the index back to an ordinary column

In [10]:
# Optional export of the daily aggregate to CSV — currently disabled (commented out).
# NOTE(review): the target below is an absolute path on one specific machine;
# switch to a relative or configurable path before re-enabling this cell.
# from pathlib import Path
# path = Path("C:/Users/Collin/Desktop/Academic/Courses/DSCI 689/Group Analysis/agg_weather.csv")
# path.parent.mkdir(parents = True, exist_ok = True)
# aggregated_weather_df.to_csv(path)