In [1]:
#importing dependencies
import pandas as pd
import datetime as dt

In [2]:
#load the raw bulk historical weather export (csv) into a DataFrame
file_path = "Resources/53212_bulk_historical_weather.csv"
weather_df = pd.read_csv(file_path)
#preview the first rows to check that the file parsed as expected
weather_df.head()

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,feels_like,temp_min,temp_max,...,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1388534400,2014-01-01 00:00:00 +0000 UTC,-21600,53212,43.073271,-87.908962,7.81,1.09,3.97,9.86,...,310,,,0.52,,90,601,Snow,snow,13n
1,1388538000,2014-01-01 01:00:00 +0000 UTC,-21600,53212,43.073271,-87.908962,7.72,7.72,5.77,8.89,...,283,,,0.56,,90,601,Snow,snow,13n
2,1388541600,2014-01-01 02:00:00 +0000 UTC,-21600,53212,43.073271,-87.908962,8.42,8.42,5.77,9.86,...,273,,,0.44,,90,600,Snow,light snow,13n
3,1388545200,2014-01-01 03:00:00 +0000 UTC,-21600,53212,43.073271,-87.908962,8.35,8.35,5.77,8.89,...,271,,,0.25,,75,600,Snow,light snow,13n
4,1388548800,2014-01-01 04:00:00 +0000 UTC,-21600,53212,43.073271,-87.908962,8.46,8.46,5.77,9.86,...,271,,,0.12,,90,600,Snow,light snow,13n


In [3]:
#analyzing columns: list the dtype pandas inferred for each one
weather_df.dtypes

dt                       int64
dt_iso                  object
timezone                 int64
city_name                int64
lat                    float64
lon                    float64
temp                   float64
feels_like             float64
temp_min               float64
temp_max               float64
pressure                 int64
sea_level              float64
grnd_level             float64
humidity                 int64
wind_speed             float64
wind_deg                 int64
rain_1h                float64
rain_3h                float64
snow_1h                float64
snow_3h                float64
clouds_all               int64
weather_id               int64
weather_main            object
weather_description     object
weather_icon            object
dtype: object

In [4]:
#converting "dt" column (unix epoch seconds) into datetime
weather_df['dt'] = pd.to_datetime(weather_df["dt"], unit="s")

#keeping only the rows dated on or after the start of our date range;
#.copy() makes the result an independent frame, so the in-place edits made
#further down the notebook do not operate on a slice of the raw frame
#(which is what triggers pandas' SettingWithCopyWarning)
weather_df = weather_df[weather_df['dt'] >= "2019-02-15"].copy()

In [5]:
#print the index range plus the non-null count and dtype of every column
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23749 entries, 46961 to 70709
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   dt                   23749 non-null  datetime64[ns]
 1   dt_iso               23749 non-null  object        
 2   timezone             23749 non-null  int64         
 3   city_name            23749 non-null  int64         
 4   lat                  23749 non-null  float64       
 5   lon                  23749 non-null  float64       
 6   temp                 23749 non-null  float64       
 7   feels_like           23749 non-null  float64       
 8   temp_min             23749 non-null  float64       
 9   temp_max             23749 non-null  float64       
 10  pressure             23749 non-null  int64         
 11  sea_level            0 non-null      float64       
 12  grnd_level           0 non-null      float64       
 13  humidity             23749 

In [6]:
#dropping irrelevant columns
columns_no = [
    'dt_iso',
    'timezone',
    'city_name',
    'lat',
    'lon',
    'sea_level',
    'grnd_level',
    'wind_deg',
    'rain_1h',
    'rain_3h',
    'snow_1h',
    'snow_3h',
    'weather_id',
    'weather_description',
    'weather_icon',
]

#reassign rather than use inplace=True: weather_df was produced by a
#boolean filter, and mutating such a frame in place is what makes pandas
#emit SettingWithCopyWarning; drop() returning a new frame avoids that
weather_df = weather_df.drop(columns=columns_no)

In [7]:
#preview the first rows of the frame
weather_df.head()

Unnamed: 0,dt,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,clouds_all,weather_main
46961,2019-02-15 00:00:00,39.31,33.71,36.37,41.14,998,77,8.05,90,Rain
46962,2019-02-15 01:00:00,37.96,37.96,36.28,39.9,998,81,1.99,90,Clouds
46963,2019-02-15 02:00:00,37.53,34.45,36.28,39.0,998,80,4.0,90,Rain
46964,2019-02-15 03:00:00,35.78,25.72,33.3,37.72,999,76,17.22,90,Rain
46965,2019-02-15 04:00:00,32.81,20.82,28.74,35.56,1000,78,20.8,90,Snow


In [8]:
#count how many rows fall under each distinct value of the "weather_main" column
weather_df['weather_main'].value_counts()

Clear           10236
Clouds           7852
Rain             2679
Mist             1246
Snow              793
Drizzle           459
Haze              244
Fog               102
Thunderstorm      101
Smoke              37
Name: weather_main, dtype: int64

In [9]:
#one-hot encode the categorical "weather_main" column (one indicator column
#per category) and renumber the rows from 0, discarding the old index
nweather_df = (
    pd.get_dummies(weather_df, columns=['weather_main'])
    .reset_index(drop=True)
)
nweather_df.head()

Unnamed: 0,dt,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,clouds_all,weather_main_Clear,weather_main_Clouds,weather_main_Drizzle,weather_main_Fog,weather_main_Haze,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,weather_main_Thunderstorm
0,2019-02-15 00:00:00,39.31,33.71,36.37,41.14,998,77,8.05,90,0,0,0,0,0,0,1,0,0,0
1,2019-02-15 01:00:00,37.96,37.96,36.28,39.9,998,81,1.99,90,0,1,0,0,0,0,0,0,0,0
2,2019-02-15 02:00:00,37.53,34.45,36.28,39.0,998,80,4.0,90,0,0,0,0,0,0,1,0,0,0
3,2019-02-15 03:00:00,35.78,25.72,33.3,37.72,999,76,17.22,90,0,0,0,0,0,0,1,0,0,0
4,2019-02-15 04:00:00,32.81,20.82,28.74,35.56,1000,78,20.8,90,0,0,0,0,0,0,0,0,1,0


## How will summarizing the week work?

In [10]:
#column labels of nweather_df; used below as the keys of the weekly summary dict
weather_attributes = nweather_df.columns

In [11]:
#dict that will hold the weekly summary: one empty list per weather attribute,
#each list later receiving one value per week
week_summary_dict = {attribute: [] for attribute in weather_attributes}

week_summary_dict

{'dt': [],
 'temp': [],
 'feels_like': [],
 'temp_min': [],
 'temp_max': [],
 'pressure': [],
 'humidity': [],
 'wind_speed': [],
 'clouds_all': [],
 'weather_main_Clear': [],
 'weather_main_Clouds': [],
 'weather_main_Drizzle': [],
 'weather_main_Fog': [],
 'weather_main_Haze': [],
 'weather_main_Mist': [],
 'weather_main_Rain': [],
 'weather_main_Smoke': [],
 'weather_main_Snow': [],
 'weather_main_Thunderstorm': []}

In [12]:
#summarize the rows of nweather_df into consecutive 7-day windows,
#the first window starting on 2019-02-15
week_end = dt.datetime(2019, 2,15)
index_count = 0

#start from empty lists so that re-running this cell does not append every
#week a second time to lists filled by a previous run
week_summary_dict = {column: [] for column in nweather_df.columns}

#columns summarized with the mean over the week
mean_columns = ['temp', 'feels_like', 'pressure', 'humidity', 'wind_speed', 'clouds_all']
#one-hot weather columns, summarized as the share of the week's rows they flag
condition_columns = [column for column in nweather_df.columns
                     if column.startswith('weather_main_')]

#loop on dates rather than on a running row count: this always terminates,
#even if some rows fall before the first window
last_observation = nweather_df['dt'].max()

while week_end <= last_observation:

    #establish beginning and end of week (end is exclusive)
    week_begin = week_end
    week_end = week_begin + dt.timedelta(days=7)

    #create new df with the week range
    week_df = nweather_df[(nweather_df['dt'] >= week_begin) & (nweather_df['dt'] < week_end)]

    #a gap in the data can leave a window without rows; skip it instead of
    #failing on the first-entry lookup / dividing by zero
    if week_df.empty:
        continue

    #on the dt column goes the first entry of the week; positional access does
    #not depend on how the rows of nweather_df happen to be labelled
    week_summary_dict['dt'].append(week_df['dt'].iloc[0])
    #temp, feels_like, pressure, humidity, wind_speed, clouds_all are weekly averages
    for column in mean_columns:
        week_summary_dict[column].append(week_df[column].mean())
    #temp_min has the minimum temperature of the week
    week_summary_dict['temp_min'].append(week_df['temp_min'].min())
    #temp_max has the maximum temperature of the week
    week_summary_dict['temp_max'].append(week_df['temp_max'].max())
    #the rest of the columns show the fraction (0-1) of the week's rows
    #in which the weather was like that
    for column in condition_columns:
        week_summary_dict[column].append(week_df[column].sum() / len(week_df))

    index_count += len(week_df)

#every row should have been assigned to exactly one week
assert index_count == len(nweather_df), "some rows fall outside the weekly windows"

In [13]:
#build a DataFrame from the summary dict: one column per key, one row per week
week_weather_summary_df = pd.DataFrame(week_summary_dict)

In [14]:
#preview the weekly summary df
week_weather_summary_df

Unnamed: 0,dt,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,clouds_all,weather_main_Clear,weather_main_Clouds,weather_main_Drizzle,weather_main_Fog,weather_main_Haze,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,weather_main_Thunderstorm
0,2019-02-15,23.368876,14.429101,-5.10,41.14,1018.101124,76.247191,11.688090,50.320225,0.410112,0.224719,0.005618,0.005618,0.000000,0.039326,0.067416,0.0,0.247191,0.000000
1,2019-02-22,22.586823,18.859063,0.37,42.76,1019.541667,77.239583,3.882969,61.526042,0.229167,0.213542,0.041667,0.010417,0.062500,0.088542,0.130208,0.0,0.223958,0.000000
2,2019-03-01,15.662619,11.220119,-8.72,31.91,1022.636905,64.565476,3.630298,43.059524,0.464286,0.386905,0.000000,0.000000,0.053571,0.005952,0.000000,0.0,0.089286,0.000000
3,2019-03-08,34.007813,31.361771,5.77,62.64,1012.838542,79.098958,3.752135,52.005208,0.302083,0.203125,0.057292,0.010417,0.057292,0.109375,0.239583,0.0,0.020833,0.000000
4,2019-03-15,36.160057,33.896761,21.74,52.23,1019.306818,71.698864,3.503636,48.096591,0.392045,0.340909,0.028409,0.000000,0.051136,0.079545,0.102273,0.0,0.005682,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,2021-08-20,75.183600,77.331886,55.65,94.64,1015.194286,80.605714,5.489771,28.222857,0.485714,0.222857,0.000000,0.017143,0.005714,0.080000,0.171429,0.0,0.000000,0.017143
132,2021-08-27,73.168103,74.399310,50.77,93.42,1015.879310,75.241379,5.646724,25.689655,0.563218,0.235632,0.000000,0.000000,0.011494,0.040230,0.132184,0.0,0.000000,0.017241
133,2021-09-03,66.520357,66.284048,48.65,81.84,1012.250000,72.464286,6.398869,31.553571,0.571429,0.351190,0.000000,0.000000,0.005952,0.005952,0.053571,0.0,0.000000,0.011905
134,2021-09-10,67.382024,67.394643,48.97,87.76,1014.958333,74.767857,5.763690,32.386905,0.559524,0.386905,0.000000,0.000000,0.017857,0.017857,0.017857,0.0,0.000000,0.000000


In [16]:
#save the weekly weather summary to csv; index=False leaves the row index out of the file
week_weather_summary_df.to_csv("Resources/week_weather_summary.csv", index=False)