# Cleaning Weather Data
#### By: Jack Cohen, Karina Hutula, Raheem Paxton

# Import Dependencies

In [1]:
import pandas as pd
from datetime import timedelta, datetime, tzinfo
import time
import os
t = time.time()

# Import Data

In [2]:
# Import Chicago Weather Data from OpenWeather API

data = pd.read_csv('Resources/chicago-hourly-weather-1980-2021.csv')

# Read columns available
print('Raw Data Columns')
print('---------------------')
columns = data.columns
for x in columns:
    print(x)

# Show preview of table
data.tail()

Raw Data Columns
---------------------
dt
dt_iso
timezone
city_name
lat
lon
temp
feels_like
temp_min
temp_max
pressure
sea_level
grnd_level
humidity
wind_speed
wind_deg
rain_1h
rain_3h
snow_1h
snow_3h
clouds_all
weather_id
weather_main
weather_description
weather_icon


Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,feels_like,temp_min,temp_max,...,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
402325,1630436400,2021-08-31 19:00:00 +0000 UTC,-18000,Chicago,41.878114,-87.629798,27.26,27.47,26.1,28.38,...,80,,,,,75,803,Clouds,broken clouds,04d
402326,1630440000,2021-08-31 20:00:00 +0000 UTC,-18000,Chicago,41.878114,-87.629798,27.04,27.39,25.56,28.33,...,71,,,,,75,803,Clouds,broken clouds,04d
402327,1630443600,2021-08-31 21:00:00 +0000 UTC,-18000,Chicago,41.878114,-87.629798,26.89,27.31,25.55,28.33,...,70,,,,,75,803,Clouds,broken clouds,04d
402328,1630447200,2021-08-31 22:00:00 +0000 UTC,-18000,Chicago,41.878114,-87.629798,26.13,26.13,25.0,27.3,...,90,,,,,75,803,Clouds,broken clouds,04d
402329,1630450800,2021-08-31 23:00:00 +0000 UTC,-18000,Chicago,41.878114,-87.629798,25.22,25.13,24.44,26.02,...,83,,,,,75,803,Clouds,broken clouds,04d


# Timezone Adjustment

In [3]:
# Adjust dt for timezone; Create columns with local Chicago datetime/date/time
# Filter for years only over 2000

date_time = []
date = []
time = []
year = []
TF = []

for x in range(len(data)):
    adjusted_date = datetime.utcfromtimestamp(data['dt'][x])+timedelta(seconds=int(data['timezone'][x]))
    date_time.append(adjusted_date.strftime("%Y-%m-%d %H:%M"))
    year.append(adjusted_date.strftime("%Y"))
    if adjusted_date.year <=2000:
        TF.append(True)
    else:
        TF.append(False)

data.insert(0,'local_dt',date_time)
data.insert(0,'year',year)
data.insert(0,'TF',TF)

data_all = data

data.head()

Unnamed: 0,TF,year,local_dt,dt,dt_iso,timezone,city_name,lat,lon,temp,...,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,True,1978,1978-12-31 18:00,283996800,1979-01-01 00:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,0.03,...,10,,,,,90,600,Snow,light snow,13n
1,True,1978,1978-12-31 19:00,284000400,1979-01-01 01:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,0.0,...,340,,,,,90,600,Snow,light snow,13n
2,True,1978,1978-12-31 20:00,284004000,1979-01-01 02:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,0.0,...,340,,,,,90,600,Snow,light snow,13n
3,True,1978,1978-12-31 21:00,284007600,1979-01-01 03:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-2.2,...,320,,,,,90,600,Snow,light snow,13n
4,True,1978,1978-12-31 22:00,284011200,1979-01-01 04:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-2.2,...,320,,,,,90,600,Snow,light snow,13n


# Drop Unnecessary Columns

In [4]:
# Find columns with no data

a = data['snow_3h'].value_counts()
b = data['rain_3h'].value_counts()
c = data['sea_level'].value_counts()
d = data['grnd_level'].value_counts()

print(a)
print(b)
print(c)
print(d)

Series([], Name: snow_3h, dtype: int64)
Series([], Name: rain_3h, dtype: int64)
Series([], Name: sea_level, dtype: int64)
Series([], Name: grnd_level, dtype: int64)


In [5]:
# Drop data before year 2000 (crime data has no data for these years)
data.drop(data[(data['TF']==True)].index,inplace=True)

# Drop columns with no data or irrelevant data
data.drop(columns=['city_name',
                   'weather_icon',
                   'lat',
                   'lon',
                   'dt',
                   'dt_iso',
                   'timezone',
                   'snow_3h',
                   'rain_3h',
                   'sea_level',
                   'grnd_level',
                   'TF',
                   'year'],
          inplace=True)

# Fill precipitation NaN values with 0; easier for plotting
data['snow_1h'].fillna(0,inplace=True)
data['rain_1h'].fillna(0,inplace=True)

data.tail()

Unnamed: 0,local_dt,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,snow_1h,clouds_all,weather_id,weather_main,weather_description
402325,2021-08-31 14:00,27.26,27.47,26.1,28.38,1010,47,4.63,80,0.0,0.0,75,803,Clouds,broken clouds
402326,2021-08-31 15:00,27.04,27.39,25.56,28.33,1011,49,3.58,71,0.0,0.0,75,803,Clouds,broken clouds
402327,2021-08-31 16:00,26.89,27.31,25.55,28.33,1010,50,7.2,70,0.0,0.0,75,803,Clouds,broken clouds
402328,2021-08-31 17:00,26.13,26.13,25.0,27.3,1010,50,2.24,90,0.0,0.0,75,803,Clouds,broken clouds
402329,2021-08-31 18:00,25.22,25.13,24.44,26.02,1010,51,4.47,83,0.0,0.0,75,803,Clouds,broken clouds


# Conversions

In [6]:
# Temperatures: from C to F
data['temp']=round((data['temp']*1.8)+32,2)
data['feels_like']=round((data['feels_like']*1.8)+32,2)
data['temp_min']=round((data['temp_min']*1.8)+32,2)
data['temp_max']=round((data['temp_max']*1.8)+32,2)

# Volume: from mm to inches
data['rain_1h'] = round(data['rain_1h']/25.4,2)
data['snow_1h'] = round(data['snow_1h']/25.4,2)

# Speed: from m/s to mph
data['wind_speed'] = round(data['wind_speed']*2.237,2)

data.tail()

Unnamed: 0,local_dt,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,snow_1h,clouds_all,weather_id,weather_main,weather_description
402325,2021-08-31 14:00,81.07,81.45,78.98,83.08,1010,47,10.36,80,0.0,0.0,75,803,Clouds,broken clouds
402326,2021-08-31 15:00,80.67,81.3,78.01,82.99,1011,49,8.01,71,0.0,0.0,75,803,Clouds,broken clouds
402327,2021-08-31 16:00,80.4,81.16,77.99,82.99,1010,50,16.11,70,0.0,0.0,75,803,Clouds,broken clouds
402328,2021-08-31 17:00,79.03,79.03,77.0,81.14,1010,50,5.01,90,0.0,0.0,75,803,Clouds,broken clouds
402329,2021-08-31 18:00,77.4,77.23,75.99,78.84,1010,51,10.0,83,0.0,0.0,75,803,Clouds,broken clouds


# Rename Headers

In [7]:
# Rename headers to add units
data.rename(columns={'temp':'temp_F',
                     'feels_like':'feels_like_F',
                     'temp_min':'temp_min_F',
                     'temp_max':'temp_max_F',
                     'pressure':'pressure_hPa',
                     'humidity':'humidity_percent',
                     'wind_speed':'wind_speed_mph',
                     'rain_1h':'rain_1h_inches',
                     'snow_1h':'snow_1h_inches',
                     'clouds_all':'clouds_percent'},
            inplace=True)

weather_df = data
weather_df.tail()

Unnamed: 0,local_dt,temp_F,feels_like_F,temp_min_F,temp_max_F,pressure_hPa,humidity_percent,wind_speed_mph,wind_deg,rain_1h_inches,snow_1h_inches,clouds_percent,weather_id,weather_main,weather_description
402325,2021-08-31 14:00,81.07,81.45,78.98,83.08,1010,47,10.36,80,0.0,0.0,75,803,Clouds,broken clouds
402326,2021-08-31 15:00,80.67,81.3,78.01,82.99,1011,49,8.01,71,0.0,0.0,75,803,Clouds,broken clouds
402327,2021-08-31 16:00,80.4,81.16,77.99,82.99,1010,50,16.11,70,0.0,0.0,75,803,Clouds,broken clouds
402328,2021-08-31 17:00,79.03,79.03,77.0,81.14,1010,50,5.01,90,0.0,0.0,75,803,Clouds,broken clouds
402329,2021-08-31 18:00,77.4,77.23,75.99,78.84,1010,51,10.0,83,0.0,0.0,75,803,Clouds,broken clouds


# Format DataFrame

In [8]:
# Check for duplicates
print(weather_df['local_dt'].value_counts())

# Create list of unique datetimes
unique_dt = list(weather_df['local_dt'].unique())

2017-11-05 01:00    4
2009-03-08 14:00    4
2003-09-26 17:00    3
2014-10-15 07:00    3
2009-04-30 15:00    3
                   ..
2019-11-17 05:00    1
2014-10-19 21:00    1
2006-12-26 22:00    1
2007-10-08 16:00    1
2012-11-10 14:00    1
Name: local_dt, Length: 181126, dtype: int64


In [9]:
# Make weather_id/main/description into list form to remove duplicate timestamps

for x in unique_dt:
    test = weather_df.loc[weather_df['local_dt']==x]
    ids = list(test['weather_id'])
    mains = list(test['weather_main'])
    descriptions = list(test['weather_description'])

    test = test.drop_duplicates(subset='local_dt')

    test['weather_id'] = test['weather_id'].astype(object)
    test['weather_main'] = test['weather_main'].astype(object)
    test['weather_description'] = test['weather_description'].astype(object)

    test.iat[0,test.columns.get_loc('weather_id')] = ids
    test.iat[0,test.columns.get_loc('weather_main')] = mains
    test.iat[0,test.columns.get_loc('weather_description')] = descriptions
    
    weather_df.loc[weather_df['local_dt']==x] = test

# Show Available Data

In [10]:
# Print column headers to show available data

print('Data Columns')
print('---------------------')
columns = weather_df.columns
for x in columns:
    print(x)

weather_df.head()

Data Columns
---------------------
local_dt
temp_F
feels_like_F
temp_min_F
temp_max_F
pressure_hPa
humidity_percent
wind_speed_mph
wind_deg
rain_1h_inches
snow_1h_inches
clouds_percent
weather_id
weather_main
weather_description


Unnamed: 0,local_dt,temp_F,feels_like_F,temp_min_F,temp_max_F,pressure_hPa,humidity_percent,wind_speed_mph,wind_deg,rain_1h_inches,snow_1h_inches,clouds_percent,weather_id,weather_main,weather_description
208651,2001-01-01 00:00,19.65,9.77,12.25,22.35,1027.0,85.0,8.05,10.0,0.0,0.0,75.0,[803],[Clouds],[broken clouds]
208652,2001-01-01 01:00,19.44,12.67,13.15,22.17,1027.0,79.0,4.7,360.0,0.0,0.0,90.0,[804],[Clouds],[overcast clouds]
208653,2001-01-01 02:00,19.24,12.43,13.15,21.99,1028.0,79.0,4.7,310.0,0.0,0.0,90.0,[804],[Clouds],[overcast clouds]
208654,2001-01-01 03:00,19.09,14.13,17.46,21.67,1028.0,85.0,3.36,290.0,0.0,0.0,90.0,[804],[Clouds],[overcast clouds]
208655,2001-01-01 04:00,18.12,11.14,10.27,21.27,1028.0,78.0,4.7,310.0,0.0,0.0,90.0,[804],[Clouds],[overcast clouds]


# Export DataFrame

In [11]:
# Export DataFrame to CSV
weather_df.to_csv('Resources/weather_df.csv',index=False)

In [12]:
# Alert completion
import os
os.system('say "Your program has finished."')
import time
elapsed = (time.time()-t)/60
print(f"Code ran in {elapsed} minutes")

AttributeError: 'list' object has no attribute 'time'