# Packages and Functions

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Data Ingestion

In [12]:
# Read the per-station daily weather observations and preview the first rows.
file_loc = "../data/Summary of Weather.csv"
weather_summary = pd.read_csv(filepath_or_buffer=file_loc)
weather_summary.head(n=5)

Unnamed: 0,STA,Date,Precip,WindGustSpd,MaxTemp,MinTemp,MeanTemp,Snowfall,PoorWeather,YR,...,FB,FTI,ITH,PGT,TSHDSBRSGF,SD3,RHX,RHN,RVG,WTE
0,10001,1942-7-1,1.016,,25.555556,22.222222,23.888889,0.0,,42,...,,,,,,,,,,
1,10001,1942-7-2,0.0,,28.888889,21.666667,25.555556,0.0,,42,...,,,,,,,,,,
2,10001,1942-7-3,2.54,,26.111111,22.222222,24.444444,0.0,,42,...,,,,,,,,,,
3,10001,1942-7-4,2.54,,26.666667,22.222222,24.444444,0.0,,42,...,,,,,,,,,,
4,10001,1942-7-5,0.0,,26.666667,21.666667,24.444444,0.0,,42,...,,,,,,,,,,


In [13]:
# Read the weather-station lookup table and preview the first rows.
file_loc = "../data/Weather Station Locations.csv"
weather_loc = pd.read_csv(filepath_or_buffer=file_loc)
weather_loc.head(n=5)

Unnamed: 0,WBAN,NAME,STATE/COUNTRY ID,LAT,LON,ELEV,Latitude,Longitude
0,33013,AIN EL,AL,3623N,00637E,611,36.383333,6.65
1,33031,LA SENIA,AL,3537N,00037E,88,35.616667,0.583333
2,33023,MAISON BLANCHE,AL,3643N,00314E,23,36.716667,3.216667
3,33044,TELERGMA,AL,3607N,00621E,754,36.116667,6.416667
4,12001,TINDOUF,AL,2741N,00809W,443,27.683333,-8.083333


In [14]:
# Print each column's non-null count and dtype for the observations table,
# to spot sparsely populated columns and numeric columns loaded as object.
weather_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119040 entries, 0 to 119039
Data columns (total 31 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   STA          119040 non-null  int64  
 1   Date         119040 non-null  object 
 2   Precip       119040 non-null  object 
 3   WindGustSpd  532 non-null     float64
 4   MaxTemp      119040 non-null  float64
 5   MinTemp      119040 non-null  float64
 6   MeanTemp     119040 non-null  float64
 7   Snowfall     117877 non-null  object 
 8   PoorWeather  34237 non-null   object 
 9   YR           119040 non-null  int64  
 10  MO           119040 non-null  int64  
 11  DA           119040 non-null  int64  
 12  PRCP         117108 non-null  object 
 13  DR           533 non-null     float64
 14  SPD          532 non-null     float64
 15  MAX          118566 non-null  float64
 16  MIN          118572 non-null  float64
 17  MEA          118542 non-null  float64
 18  SNF          117877 non-

### Column Guide:
- STA: Weather Station
- Date: Date of observation (year-month-day, e.g. `1942-7-1`)
- Precip: Precipitation in mm
- WindGustSpd [DROPPED]: Peak wind gust speed in km/h
- MaxTemp: Maximum temperature in degrees Celsius
- MinTemp: Minimum temperature in degrees Celsius
- MeanTemp: Mean temperature in degrees Celsius
- Snowfall: Snowfall and Ice Pellets in mm
- PoorWeather [DROPPED]: Self-explanatory
- YR: Year of Observation
- MO: Month of Observation
- DA: Day of Observation
- PRCP: Precipitation in Inches
- DR [DROPPED]: Peak wind gust direction in tens of degrees
- SPD [DROPPED]: Peak wind gust speed in knots
- MAX: Maximum temperature in degrees Fahrenheit
- MIN: Minimum temperature in degrees Fahrenheit
- MEA: Mean temperature in degrees Fahrenheit
- SNF: Snowfall in inches and tenths
- SND [DROPPED]: Snow depth (includes ice pellets), in inches and tenths, recorded at 1200 GMT (0000 GMT in the Far East Asian Area)
- FT [DROPPED]: Frozen Ground Top (depth in inches)
- FB [DROPPED]: Frozen Ground Base (depth in inches)
- FTI [DROPPED]: Frozen Ground Thickness (thickness in inches)
- ITH [DROPPED]: Ice Thickness on Water (inches and tenths)
- PGT [DROPPED]: Peak wind gust time (hours and tenths)
- TSHDSBRSGF [DROPPED]: Day with: Thunder; Sleet; Hail; Dust or Sand; Smoke or Haze; Blowing Snow; Rain; Snow; Glaze; Fog; 0 = No, 1 = Yes
- SD3 [DROPPED]: Snow depth at 0030 GMT (includes ice pellets), in inches and tenths
- RHX [DROPPED]: 24-hour maximum relative humidity, as a whole percent
- RHN [DROPPED]: 24-hour minimum relative humidity, as a whole percent
- RVG [DROPPED]: River gauge in feet and tenths
- WTE [DROPPED]: Water equivalent of snow and ice on ground in inches and hundredths

In [15]:
# Print each column's non-null count and dtype for the station lookup table.
weather_loc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   WBAN              161 non-null    int64  
 1   NAME              161 non-null    object 
 2   STATE/COUNTRY ID  161 non-null    object 
 3   LAT               161 non-null    object 
 4   LON               161 non-null    object 
 5   ELEV              161 non-null    int64  
 6   Latitude          161 non-null    float64
 7   Longitude         161 non-null    float64
dtypes: float64(2), int64(2), object(4)
memory usage: 10.2+ KB


### Column Guide
- WBAN: Weather Station Number
- NAME: Weather Station Name
- STATE/COUNTRY ID: Location of the station as a state or country identifier (e.g. `AL`)
- LAT [DROPPED]: Latitude as string
- LON [DROPPED]: Longitude as string
- ELEV: Elevation (9999 means unknown)
- Latitude: Latitude as a numeric variable
- Longitude: Longitude as a numeric variable

# Data Cleaning and Preprocessing

In [16]:
# The string-encoded coordinates duplicate the numeric Latitude/Longitude
# columns, so only the numeric versions are kept.
drop_cols = ['LAT', 'LON']
weather_loc_proc = weather_loc.drop(columns=drop_cols)
weather_loc_proc.head(n=5)

Unnamed: 0,WBAN,NAME,STATE/COUNTRY ID,ELEV,Latitude,Longitude
0,33013,AIN EL,AL,611,36.383333,6.65
1,33031,LA SENIA,AL,88,35.616667,0.583333
2,33023,MAISON BLANCHE,AL,23,36.716667,3.216667
3,33044,TELERGMA,AL,754,36.116667,6.416667
4,12001,TINDOUF,AL,443,27.683333,-8.083333


In [17]:
# Remove the columns marked [DROPPED] in the column guide, keeping the
# station, date, precipitation, temperature and snowfall fields.
drop_cols = [
    'WindGustSpd', 'DR', 'SPD',
    'SND', 'FT', 'FB', 'FTI', 'ITH',
    'PGT', 'SD3', 'RHX', 'RHN', 'RVG', 'WTE',
    'PoorWeather', 'TSHDSBRSGF',
]
weather_summary_proc = weather_summary.drop(columns=drop_cols)
weather_summary_proc.head(n=5)

Unnamed: 0,STA,Date,Precip,MaxTemp,MinTemp,MeanTemp,Snowfall,YR,MO,DA,PRCP,MAX,MIN,MEA,SNF
0,10001,1942-7-1,1.016,25.555556,22.222222,23.888889,0.0,42,7,1,0.04,78.0,72.0,75.0,0.0
1,10001,1942-7-2,0.0,28.888889,21.666667,25.555556,0.0,42,7,2,0.0,84.0,71.0,78.0,0.0
2,10001,1942-7-3,2.54,26.111111,22.222222,24.444444,0.0,42,7,3,0.1,79.0,72.0,76.0,0.0
3,10001,1942-7-4,2.54,26.666667,22.222222,24.444444,0.0,42,7,4,0.1,80.0,72.0,76.0,0.0
4,10001,1942-7-5,0.0,26.666667,21.666667,24.444444,0.0,42,7,5,0.0,80.0,71.0,76.0,0.0
