### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
# Apply seaborn's default theme so later matplotlib figures share one look.
sns.set()

### Load File and Investigate

In [6]:
# Read the raw multi-station export into a DataFrame.
# DATE is selected by name rather than by position, so the datetime parse still
# targets the right column if the CSV layout changes (for example, if an index
# column ends up written in front of STATION).
data = pd.read_csv('Chicago Weather Station Data.csv', parse_dates=['DATE'])

# Preview the first rows to sanity-check the parse.
data.head()

Unnamed: 0,STATION,NAME,DATE,PRCP,SNOW,SNWD,TMAX,TMIN,TOBS,WESD,WT01,WT16,WT18
0,USC00111532,"CHICAGO LAKEVIEW PUMP, IL US",1948-07-01,0.0,0.0,0.0,,,,,,,
1,USC00111532,"CHICAGO LAKEVIEW PUMP, IL US",1948-07-02,0.0,0.0,0.0,,,,,,,
2,USC00111532,"CHICAGO LAKEVIEW PUMP, IL US",1948-07-03,0.0,0.0,0.0,,,,,,,
3,USC00111532,"CHICAGO LAKEVIEW PUMP, IL US",1948-07-04,0.0,0.0,0.0,,,,,,,
4,USC00111532,"CHICAGO LAKEVIEW PUMP, IL US",1948-07-05,0.0,0.0,0.0,,,,,,,


In [9]:
# Summary statistics for the numeric measurement columns.
# NOTE(review): the output shows a TMAX maximum of 171 and a PRCP maximum of 30.0,
# which look like outliers or entry errors -- confirm before using these columns.
data.describe()

Unnamed: 0,PRCP,SNOW,SNWD,TMAX,TMIN,TOBS,WESD,WT01,WT16,WT18
count,53749.0,36228.0,36233.0,54184.0,54183.0,21797.0,622.0,2286.0,540.0,2.0
mean,0.096948,0.089058,0.559048,58.200317,42.925032,50.462357,0.029582,1.0,1.0,1.0
std,0.306512,0.562209,1.90054,20.776471,18.985838,20.425397,0.220436,0.0,0.0,0.0
min,0.0,0.0,0.0,-20.0,-27.0,-25.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,41.0,30.0,35.0,0.0,1.0,1.0,1.0
50%,0.0,0.0,0.0,59.0,43.0,51.0,0.0,1.0,1.0,1.0
75%,0.04,0.0,0.0,76.0,59.0,68.0,0.0,1.0,1.0,1.0
max,30.0,16.4,25.0,171.0,85.0,101.0,3.5,1.0,1.0,1.0


In [10]:
# Check dtypes and per-column non-null counts to see how much of each
# measurement is missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55568 entries, 0 to 55567
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   STATION  55568 non-null  object        
 1   NAME     55568 non-null  object        
 2   DATE     55568 non-null  datetime64[ns]
 3   PRCP     53749 non-null  float64       
 4   SNOW     36228 non-null  float64       
 5   SNWD     36233 non-null  float64       
 6   TMAX     54184 non-null  float64       
 7   TMIN     54183 non-null  float64       
 8   TOBS     21797 non-null  float64       
 9   WESD     622 non-null    float64       
 10  WT01     2286 non-null   float64       
 11  WT16     540 non-null    float64       
 12  WT18     2 non-null      float64       
dtypes: datetime64[ns](1), float64(10), object(2)
memory usage: 5.5+ MB


### Convert data types

In [12]:
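# No conversion needed here: the DATE column was already parsed to datetime64[ns]
# by `parse_dates` in the initial read_csv call, and the measurement columns are float64.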

### Convert column names

In [14]:
# Show the raw column codes as an array before giving them readable labels.
data.columns.to_numpy()

array(['STATION', 'NAME', 'DATE', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN',
       'TOBS', 'WESD', 'WT01', 'WT16', 'WT18'], dtype=object)

In [17]:
# Replace the short measurement codes with readable labels (units included).
# STATION, NAME, DATE and the WT* flag columns keep their original names.
data = data.rename(
    columns={
        # precipitation and snow measurements
        'PRCP': 'Precipitation (in)',
        'SNOW': 'Snowfall (in)',
        'SNWD': 'Snow Depth (in)',
        # temperature measurements
        'TMAX': 'Max. Temp (degF)',
        'TMIN': 'Min. Temp (degF)',
        'TOBS': 'Observed Temp (degF)',
        # sparsely populated snow-water measurement
        'WESD': 'Water Equivalent to Snow',
    }
)

In [19]:
# Re-inspect the frame to confirm the readable column labels are in place.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55568 entries, 0 to 55567
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   STATION                   55568 non-null  object        
 1   NAME                      55568 non-null  object        
 2   DATE                      55568 non-null  datetime64[ns]
 3   Precipitation (in)        53749 non-null  float64       
 4   Snowfall (in)             36228 non-null  float64       
 5   Snow Depth (in)           36233 non-null  float64       
 6   Max. Temp (degF)          54184 non-null  float64       
 7   Min. Temp (degF)          54183 non-null  float64       
 8   Observed Temp (degF)      21797 non-null  float64       
 9   Water Equivalent to Snow  622 non-null    float64       
 10  WT01                      2286 non-null   float64       
 11  WT16                      540 non-null    float64       
 12  WT18                      2 non-null      float64       
dtypes: datetime64[ns](1), float64(10), object(2)
memory usage: 5.5+ MB

### Drop columns with mostly null values

In [21]:
# These four columns are almost entirely empty (at most 2,286 non-null values
# out of 55,568 rows), so they are left out of the working frame.
data_drop = data.drop(
    columns=[
        'Water Equivalent to Snow',
        'WT01',
        'WT16',
        'WT18',
    ]
)

# Confirm that nine columns remain.
data_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55568 entries, 0 to 55567
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   STATION               55568 non-null  object        
 1   NAME                  55568 non-null  object        
 2   DATE                  55568 non-null  datetime64[ns]
 3   Precipitation (in)    53749 non-null  float64       
 4   Snowfall (in)         36228 non-null  float64       
 5   Snow Depth (in)       36233 non-null  float64       
 6   Max. Temp (degF)      54184 non-null  float64       
 7   Min. Temp (degF)      54183 non-null  float64       
 8   Observed Temp (degF)  21797 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(2)
memory usage: 3.8+ MB


### Investigate the different weather stations and divide the data

In [24]:
# List the distinct station names present in the trimmed frame.
data_drop['NAME'].unique()

array(['CHICAGO LAKEVIEW PUMP, IL US', 'CHICAGO BOTANIC GARDEN, IL US',
       'CHICAGO GRANT PARK, IL US',
       'CHICAGO WEATHER BUREAU CITY 2, IL US',
       'CHICAGO UNIVERSITY, IL US'], dtype=object)

In [None]:
data