In [1]:
#importing dependencies
import pandas as pd
import datetime as dt

In [2]:
#load the raw file
file_path = "Resources/53212_bulk_historical_weather.csv"
weather_df = pd.read_csv(file_path)
weather_df.head()

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,feels_like,temp_min,temp_max,...,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1388534400,2014-01-01 00:00:00 +0000 UTC,-21600,53212,43.073271,-87.908962,7.81,1.09,3.97,9.86,...,310,,,0.52,,90,601,Snow,snow,13n
1,1388538000,2014-01-01 01:00:00 +0000 UTC,-21600,53212,43.073271,-87.908962,7.72,7.72,5.77,8.89,...,283,,,0.56,,90,601,Snow,snow,13n
2,1388541600,2014-01-01 02:00:00 +0000 UTC,-21600,53212,43.073271,-87.908962,8.42,8.42,5.77,9.86,...,273,,,0.44,,90,600,Snow,light snow,13n
3,1388545200,2014-01-01 03:00:00 +0000 UTC,-21600,53212,43.073271,-87.908962,8.35,8.35,5.77,8.89,...,271,,,0.25,,75,600,Snow,light snow,13n
4,1388548800,2014-01-01 04:00:00 +0000 UTC,-21600,53212,43.073271,-87.908962,8.46,8.46,5.77,9.86,...,271,,,0.12,,90,600,Snow,light snow,13n


In [3]:
#analizing columns
weather_df.dtypes

dt                       int64
dt_iso                  object
timezone                 int64
city_name                int64
lat                    float64
lon                    float64
temp                   float64
feels_like             float64
temp_min               float64
temp_max               float64
pressure                 int64
sea_level              float64
grnd_level             float64
humidity                 int64
wind_speed             float64
wind_deg                 int64
rain_1h                float64
rain_3h                float64
snow_1h                float64
snow_3h                float64
clouds_all               int64
weather_id               int64
weather_main            object
weather_description     object
weather_icon            object
dtype: object

In [4]:
#converting "dt" column into datetime
weather_df['dt'] = pd.to_datetime(weather_df["dt"], unit="s")

#filtering columns within our daterange
weather_df = weather_df[(weather_df['dt'] > "2019-02-09")]

In [5]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23896 entries, 46814 to 70709
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   dt                   23896 non-null  datetime64[ns]
 1   dt_iso               23896 non-null  object        
 2   timezone             23896 non-null  int64         
 3   city_name            23896 non-null  int64         
 4   lat                  23896 non-null  float64       
 5   lon                  23896 non-null  float64       
 6   temp                 23896 non-null  float64       
 7   feels_like           23896 non-null  float64       
 8   temp_min             23896 non-null  float64       
 9   temp_max             23896 non-null  float64       
 10  pressure             23896 non-null  int64         
 11  sea_level            0 non-null      float64       
 12  grnd_level           0 non-null      float64       
 13  humidity             23896 

In [6]:
#dropping irrelevant columns
columns_no = ['dt_iso',
              'timezone', 
              'city_name', 
              'lat', 
              'lon',
              'sea_level',
              'grnd_level',
              'wind_deg',
              'rain_1h',
              'rain_3h',
              'snow_1h',
              'snow_3h',
              'weather_id',
              'weather_description', 
              'weather_icon']

weather_df.drop(columns=columns_no, inplace=True)

In [7]:
weather_df.head()

Unnamed: 0,dt,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,clouds_all,weather_main
46814,2019-02-09 01:00:00,8.65,-3.95,6.3,10.36,1037,66,16.11,1,Clear
46815,2019-02-09 02:00:00,8.29,-4.31,5.29,10.36,1038,65,14.99,1,Clear
46816,2019-02-09 03:00:00,7.5,-5.1,4.73,9.1,1039,67,12.75,1,Clear
46817,2019-02-09 04:00:00,7.05,-5.55,4.28,8.94,1040,66,13.87,1,Clear
46818,2019-02-09 05:00:00,6.55,-6.05,3.78,8.56,1041,67,17.22,1,Clear


In [8]:
#analyzing weather values for "weather_main" column
weather_df['weather_main'].value_counts()

Clear           10273
Clouds           7895
Rain             2687
Mist             1251
Snow              841
Drizzle           459
Haze              250
Fog               102
Thunderstorm      101
Smoke              37
Name: weather_main, dtype: int64

In [9]:
#create numerical values for categorical features
nweather_df = pd.get_dummies(weather_df, columns = ['weather_main']).reset_index().drop(columns=['index'])
nweather_df.head()

Unnamed: 0,dt,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,clouds_all,weather_main_Clear,weather_main_Clouds,weather_main_Drizzle,weather_main_Fog,weather_main_Haze,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,weather_main_Thunderstorm
0,2019-02-09 01:00:00,8.65,-3.95,6.3,10.36,1037,66,16.11,1,1,0,0,0,0,0,0,0,0,0
1,2019-02-09 02:00:00,8.29,-4.31,5.29,10.36,1038,65,14.99,1,1,0,0,0,0,0,0,0,0,0
2,2019-02-09 03:00:00,7.5,-5.1,4.73,9.1,1039,67,12.75,1,1,0,0,0,0,0,0,0,0,0
3,2019-02-09 04:00:00,7.05,-5.55,4.28,8.94,1040,66,13.87,1,1,0,0,0,0,0,0,0,0,0
4,2019-02-09 05:00:00,6.55,-6.05,3.78,8.56,1041,67,17.22,1,1,0,0,0,0,0,0,0,0,0


## How will summarizing the week work?

In [10]:
weather_attributes = nweather_df.columns

In [11]:
#initialize a dict to store summary of the week
week_summary_dict = {}
#initialize empty lists in that dictionary 
for attribute in weather_attributes:
       week_summary_dict[attribute] = []

week_summary_dict

{'dt': [],
 'temp': [],
 'feels_like': [],
 'temp_min': [],
 'temp_max': [],
 'pressure': [],
 'humidity': [],
 'wind_speed': [],
 'clouds_all': [],
 'weather_main_Clear': [],
 'weather_main_Clouds': [],
 'weather_main_Drizzle': [],
 'weather_main_Fog': [],
 'weather_main_Haze': [],
 'weather_main_Mist': [],
 'weather_main_Rain': [],
 'weather_main_Smoke': [],
 'weather_main_Snow': [],
 'weather_main_Thunderstorm': []}

In [12]:
#we first need to take groups for every week
week_end = dt.datetime(2019, 2,9)
index_count = 0

while (index_count < len(nweather_df)):
    
    #establish beggining and end of week
    week_begin = week_end
    week_end = week_begin + dt.timedelta(days=7)

    #create new df with the week range 
    week_df = nweather_df[(nweather_df['dt'] >= week_begin) & (nweather_df['dt'] < week_end)]

    #on the dt column goes the first entry
    week_summary_dict['dt'].append(week_df['dt'][index_count])
    #on the temp, feels_like columns go the mean temperatures of the week
    week_summary_dict['temp'].append(week_df['temp'].aggregate('mean'))
    week_summary_dict['feels_like'].append(week_df['feels_like'].aggregate('mean'))
    #temp_min has the minimum temperature of the week
    week_summary_dict['temp_min'].append(week_df['temp_min'].aggregate('min'))
    #temp_max has the maximum temperature of the week
    week_summary_dict['temp_max'].append(week_df['temp_max'].aggregate('max'))
    #pressure, humidity, wind_speed, clouds_all are all averages
    week_summary_dict['pressure'].append(week_df['pressure'].aggregate('mean'))
    week_summary_dict['humidity'].append(week_df['humidity'].aggregate('mean'))
    week_summary_dict['wind_speed'].append(week_df['wind_speed'].aggregate('mean'))
    week_summary_dict['clouds_all'].append(week_df['clouds_all'].aggregate('mean'))
    #the rest of the columns will show a percentage of the time that the weather was like that
    week_summary_dict['weather_main_Clear'].append(week_df['weather_main_Clear'].aggregate('sum')/len(week_df))
    week_summary_dict['weather_main_Clouds'].append(week_df['weather_main_Clouds'].aggregate('sum')/len(week_df))
    week_summary_dict['weather_main_Drizzle'].append(week_df['weather_main_Drizzle'].aggregate('sum')/len(week_df))
    week_summary_dict['weather_main_Fog'].append(week_df['weather_main_Fog'].aggregate('sum')/len(week_df))
    week_summary_dict['weather_main_Haze'].append(week_df['weather_main_Haze'].aggregate('sum')/len(week_df))
    week_summary_dict['weather_main_Mist'].append(week_df['weather_main_Mist'].aggregate('sum')/len(week_df))
    week_summary_dict['weather_main_Rain'].append(week_df['weather_main_Rain'].aggregate('sum')/len(week_df))
    week_summary_dict['weather_main_Smoke'].append(week_df['weather_main_Smoke'].aggregate('sum')/len(week_df))
    week_summary_dict['weather_main_Snow'].append(week_df['weather_main_Snow'].aggregate('sum')/len(week_df))
    week_summary_dict['weather_main_Thunderstorm'].append(week_df['weather_main_Thunderstorm'].aggregate('sum')/len(week_df))

    index_count += len(week_df)

In [13]:
#save into a pandas df
week_weather_summary_df = pd.DataFrame(week_summary_dict)

In [14]:
#preview the df
week_weather_summary_df

Unnamed: 0,dt,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,clouds_all,weather_main_Clear,weather_main_Clouds,weather_main_Drizzle,weather_main_Fog,weather_main_Haze,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,weather_main_Thunderstorm
0,2019-02-09 01:00:00,23.568363,15.214620,-0.71,47.23,1017.263158,78.625731,11.276023,57.245614,0.298246,0.269006,0.000000,0.000000,0.035088,0.029240,0.064327,0.0,0.304094,0.000000
1,2019-02-16 00:00:00,23.841573,15.963652,-5.10,39.63,1021.078652,76.646067,9.853596,45.404494,0.393258,0.207865,0.005618,0.005618,0.028090,0.061798,0.073034,0.0,0.224719,0.000000
2,2019-02-23 00:00:00,22.523594,18.984167,0.37,42.76,1018.697917,75.333333,3.719427,72.338542,0.171875,0.338542,0.041667,0.010417,0.036458,0.067708,0.109375,0.0,0.223958,0.000000
3,2019-03-02 00:00:00,15.520060,11.202143,-8.72,38.62,1022.809524,66.922619,3.568095,37.785714,0.517857,0.309524,0.000000,0.000000,0.059524,0.005952,0.017857,0.0,0.089286,0.000000
4,2019-03-09 00:00:00,35.959689,32.900725,14.74,62.64,1010.424870,80.217617,4.331451,56.948187,0.253886,0.196891,0.072539,0.010363,0.062176,0.129534,0.248705,0.0,0.025907,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,2021-08-21 00:00:00,75.262599,77.448531,55.65,94.64,1015.508475,81.384181,5.778870,37.372881,0.429379,0.293785,0.000000,0.000000,0.016949,0.079096,0.163842,0.0,0.000000,0.016949
133,2021-08-28 00:00:00,71.843743,72.720409,50.77,93.42,1016.052632,73.631579,5.395848,17.058480,0.672515,0.192982,0.000000,0.000000,0.000000,0.000000,0.116959,0.0,0.000000,0.017544
134,2021-09-04 00:00:00,66.383036,66.089702,48.65,81.84,1011.964286,71.547619,6.275774,32.095238,0.547619,0.375000,0.000000,0.000000,0.005952,0.005952,0.053571,0.0,0.000000,0.011905
135,2021-09-11 00:00:00,68.155774,68.268036,48.97,87.76,1014.898810,74.601190,6.398214,29.791667,0.607143,0.339286,0.000000,0.000000,0.017857,0.017857,0.017857,0.0,0.000000,0.000000


In [None]:
#save weather data to csv
week_weather_summary_df.to_csv("Resources/week_weather_summary.csv")