In [4]:
from pathlib import Path

import pandas as pd

### Let's construct a function that outputs the name of our clean data file. This way we won't have to manually type the full file name for each station.

In [5]:
def fname(station: str, data_dir: str = "clean_data") -> str:
  """
  Build the relative path of a station's cleaned weather CSV file.

  Inputs:
    station: station identifier used in the file name (e.g. "DEN")
    data_dir: directory holding the cleaned files; defaults to "clean_data"

  Output:
    string of the form '<data_dir>/<station>_weather_clean.csv'
  """
  return f'{data_dir}/{station}_weather_clean.csv'

### Now we can merge the dataframes together

In [6]:
# names of stations
stations = ["APA", "central_park", "DEN", "water_dept"]

# read every station's cleaned file first, then concatenate once:
# calling pd.concat inside the loop re-copies all previously merged rows on
# each iteration and requires seeding the result with an empty placeholder frame
station_frames = [pd.read_csv(fname(station)) for station in stations]

# ignore_index=True gives the merged frame a fresh 0..n-1 index
df_merged = pd.concat(station_frames, ignore_index=True)

df_merged.head()

Unnamed: 0,DATE,STATION,NAME,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET,SNOW,SNWD
0,2005-01-01,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",7.61,0.0,51.0,22.0,260.0,250.0,21.0,29.1,34.0,,
1,2005-01-02,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",5.14,0.0,34.0,21.0,160.0,160.0,14.1,15.0,49.0,,
2,2005-01-03,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",4.47,0.0,49.0,17.0,50.0,50.0,14.1,16.1,28.0,,
3,2005-01-04,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",7.16,0.0,28.0,6.0,340.0,340.0,19.9,21.0,7.0,,
4,2005-01-05,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",3.8,0.0,7.0,1.0,10.0,20.0,8.9,10.1,41.0,,


### First we need to convert the STATION and NAME columns to numerical values. Since STATION and NAME are fully dependent on one another, we will keep only the STATION column and drop the NAME column.

In [7]:
# remove the "NAME" column and rebind the result
df_merged = df_merged.drop(columns=["NAME"])

df_merged.head()

Unnamed: 0,DATE,STATION,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET,SNOW,SNWD
0,2005-01-01,USW00093067,7.61,0.0,51.0,22.0,260.0,250.0,21.0,29.1,34.0,,
1,2005-01-02,USW00093067,5.14,0.0,34.0,21.0,160.0,160.0,14.1,15.0,49.0,,
2,2005-01-03,USW00093067,4.47,0.0,49.0,17.0,50.0,50.0,14.1,16.1,28.0,,
3,2005-01-04,USW00093067,7.16,0.0,28.0,6.0,340.0,340.0,19.9,21.0,7.0,,
4,2005-01-05,USW00093067,3.8,0.0,7.0,1.0,10.0,20.0,8.9,10.1,41.0,,


In [8]:
# Replace each station id with a small integer code.
# Codes are handed out in the order the ids first appear in df_merged,
# so with the stations loaded in list order the expected mapping is:
#   APA: 0, central_park: 1, DEN: 2, water_dept: 3

# station ids in order of first appearance
station_codes = df_merged["STATION"].unique()

# station id -> integer code
station_codes_dict = {station_id: position for position, station_id in enumerate(station_codes)}

df_merged["STATION"] = df_merged["STATION"].map(station_codes_dict)

df_merged.head()

Unnamed: 0,DATE,STATION,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET,SNOW,SNWD
0,2005-01-01,0,7.61,0.0,51.0,22.0,260.0,250.0,21.0,29.1,34.0,,
1,2005-01-02,0,5.14,0.0,34.0,21.0,160.0,160.0,14.1,15.0,49.0,,
2,2005-01-03,0,4.47,0.0,49.0,17.0,50.0,50.0,14.1,16.1,28.0,,
3,2005-01-04,0,7.16,0.0,28.0,6.0,340.0,340.0,19.9,21.0,7.0,,
4,2005-01-05,0,3.8,0.0,7.0,1.0,10.0,20.0,8.9,10.1,41.0,,


### Because we are working with time series data, we will sort the entries by date in ascending order (oldest first).

In [9]:
# sort values by DATE column (ascending, oldest first)
# kind="stable" keeps rows that share a date in their existing relative order;
# the default quicksort gives no such guarantee, and the train/validation/test
# split further down slices by row position, so tie order must be reproducible
df_merged = df_merged.sort_values("DATE", kind="stable")

df_merged.head()

Unnamed: 0,DATE,STATION,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET,SNOW,SNWD
0,2005-01-01,0,7.61,0.0,51.0,22.0,260.0,250.0,21.0,29.1,34.0,,
7397,2005-01-01,1,,0.0,58.0,21.0,,,,,45.0,0.0,0.0
21704,2005-01-01,3,,0.0,28.0,20.0,,,,,45.0,0.0,0.0
1,2005-01-02,0,5.14,0.0,34.0,21.0,160.0,160.0,14.1,15.0,49.0,,
7398,2005-01-02,1,,0.0,45.0,19.0,,,,,32.0,0.0,0.0


### The indices got messed up when rearranging the entries, so we will reset them here.

In [10]:
# rows were reordered by date, so replace the old labels with a fresh 0..n-1 index
# (drop=True discards the old index instead of keeping it as a column)
df_merged = df_merged.reset_index(drop=True)

df_merged.head()

Unnamed: 0,DATE,STATION,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET,SNOW,SNWD
0,2005-01-01,0,7.61,0.0,51.0,22.0,260.0,250.0,21.0,29.1,34.0,,
1,2005-01-01,1,,0.0,58.0,21.0,,,,,45.0,0.0,0.0
2,2005-01-01,3,,0.0,28.0,20.0,,,,,45.0,0.0,0.0
3,2005-01-02,0,5.14,0.0,34.0,21.0,160.0,160.0,14.1,15.0,49.0,,
4,2005-01-02,1,,0.0,45.0,19.0,,,,,32.0,0.0,0.0


### We also notice that the DATE column isn't in datetime format. Let's fix this.

In [11]:
# parse the DATE column into pandas datetime values
df_merged = df_merged.assign(DATE=lambda frame: pd.to_datetime(frame["DATE"]))

df_merged.head()

Unnamed: 0,DATE,STATION,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET,SNOW,SNWD
0,2005-01-01,0,7.61,0.0,51.0,22.0,260.0,250.0,21.0,29.1,34.0,,
1,2005-01-01,1,,0.0,58.0,21.0,,,,,45.0,0.0,0.0
2,2005-01-01,3,,0.0,28.0,20.0,,,,,45.0,0.0,0.0
3,2005-01-02,0,5.14,0.0,34.0,21.0,160.0,160.0,14.1,15.0,49.0,,
4,2005-01-02,1,,0.0,45.0,19.0,,,,,32.0,0.0,0.0


### The combined tables share some common columns, but there are many that are not shared among them. Let's see how many missing values each column contains.

In [12]:
# number of observations (rows in the merged frame)
total_observations = len(df_merged)
print(f'Number of observations: {total_observations}')

# count the missing entries in every column
nan_values = df_merged.isna().sum().rename("NaN count")

print(f'NaN values in each column:')

# show columns ordered from fewest to most NaN values
nan_values.sort_values()

Number of observations: 28991
NaN values in each column:


DATE           0
STATION        0
PRCP           0
TMAX           0
TMIN           0
TARGET         0
SNOW        7397
SNWD        7397
AWND       14623
WDF2       14623
WDF5       14623
WSF2       14623
WSF5       14623
Name: NaN count, dtype: int64

### The data will be difficult to work with if there are a lot of missing values. However, because we are working with multiple datasets from the same area, we can likely group the data by date. Because we are working with a small sample for each date, we will fill NaNs with the median value for the corresponding date.

In [13]:
def impute_data(df: pd.DataFrame, feature: str, method: str) -> pd.DataFrame:
  """
  Fill NaN values in the numeric columns of df with a per-group statistic.

  Rows are grouped by `feature`; every missing numeric value is replaced by
  the mean or median of its column within the row's group. Non-numeric
  columns and the grouping column itself are left untouched. A value stays
  NaN when its whole group is NaN in that column.

  Inputs:
    df: dataframe to impute (not modified; a new dataframe is returned)
    feature: name of the column to group by (e.g. "DATE")
    method: string, {'mean', 'median'}

  Output:
    dataframe with NaN values filled using specified imputation method

  Raises:
    ValueError: if feature is not a column of df, or method is not supported
  """

  valid_methods = {"mean", "median"}

  # ensure inputs are valid
  if feature not in df.columns:
    raise ValueError(f'ValueError: feature must be in one of {set(df.columns)}')
  if method not in valid_methods:
    raise ValueError('ValueError: method must be one of {"mean", "median"}')

  imputed = df.copy()

  # restrict the work to numeric columns that actually contain NaNs:
  # both statistics are only defined for numeric data, and skipping complete
  # columns leaves their dtypes (e.g. integer codes) exactly as they were
  numeric_cols = df.select_dtypes(include="number").columns
  fill_cols = [col for col in numeric_cols
               if col != feature and df[col].isna().any()]
  if not fill_cols:
    return imputed

  # transform() broadcasts each group's statistic back onto the group's rows,
  # so the result lines up with df row for row and no per-row Python loop
  # is needed
  group_stats = df.groupby(feature)[fill_cols].transform(method)

  # fillna only touches cells that are NaN; existing values are kept
  imputed[fill_cols] = df[fill_cols].fillna(group_stats)
  return imputed




# fill missing values with the median of the same calendar date
df_merged = impute_data(df=df_merged, feature="DATE", method="median")

df_merged.head()

Unnamed: 0,DATE,STATION,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET,SNOW,SNWD
0,2005-01-01,0,7.61,0.0,51.0,22.0,260.0,250.0,21.0,29.1,34.0,0.0,0.0
1,2005-01-01,1,7.61,0.0,58.0,21.0,260.0,250.0,21.0,29.1,45.0,0.0,0.0
2,2005-01-01,3,7.61,0.0,28.0,20.0,260.0,250.0,21.0,29.1,45.0,0.0,0.0
3,2005-01-02,0,5.14,0.0,34.0,21.0,160.0,160.0,14.1,15.0,49.0,0.0,0.0
4,2005-01-02,1,5.14,0.0,45.0,19.0,160.0,160.0,14.1,15.0,32.0,0.0,0.0


### Now let's create separate columns for the year, month, and day. This will make aggregating easier during our EDA later on.

In [14]:
# add integer YEAR, MONTH and DAY columns at positions 1, 2 and 3
date_accessor = df_merged["DATE"].dt
date_parts = [
    ("YEAR", date_accessor.year),
    ("MONTH", date_accessor.month),
    ("DAY", date_accessor.day),
]
for insert_at, (part_name, part_values) in enumerate(date_parts, start=1):
  df_merged.insert(insert_at, part_name, part_values.astype(int))

df_merged.head()

Unnamed: 0,DATE,YEAR,MONTH,DAY,STATION,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET,SNOW,SNWD
0,2005-01-01,2005,1,1,0,7.61,0.0,51.0,22.0,260.0,250.0,21.0,29.1,34.0,0.0,0.0
1,2005-01-01,2005,1,1,1,7.61,0.0,58.0,21.0,260.0,250.0,21.0,29.1,45.0,0.0,0.0
2,2005-01-01,2005,1,1,3,7.61,0.0,28.0,20.0,260.0,250.0,21.0,29.1,45.0,0.0,0.0
3,2005-01-02,2005,1,2,0,5.14,0.0,34.0,21.0,160.0,160.0,14.1,15.0,49.0,0.0,0.0
4,2005-01-02,2005,1,2,1,5.14,0.0,45.0,19.0,160.0,160.0,14.1,15.0,32.0,0.0,0.0


### Now we need to split the data into training, testing, and validation sets. We will set aside 70% of the data for training, 15% for testing, and 15% for validation.

In [15]:
# total number of rows to divide between the three splits
n_rows = len(df_merged)

# training set: 70% of the rows, rounded down
train_size = int(0.7*n_rows)

# validation set: half of what is left (about 15%), rounded down
validation_size = (n_rows - train_size) // 2

# test set: whatever is not used by training or validation
test_size = n_rows - train_size - validation_size

# the three parts must add up to the original dataset size
assert train_size + validation_size + test_size == n_rows

print(f'Training Dataset Size: {train_size}')
print(f'Validation Dataset Size: {validation_size}')
print(f'Test Dataset Size: {test_size}')

Training Dataset Size: 20293
Validation Dataset Size: 4349
Test Dataset Size: 4349


In [16]:
# cut the frame into three contiguous, position-based slices:
# [0, train_size) -> train, [train_size, validation_end) -> validation, rest -> test
validation_end = train_size + validation_size
df_train = df_merged.iloc[:train_size]
df_validation = df_merged.iloc[train_size:validation_end]
df_test = df_merged.iloc[validation_end:]

# each slice must hold exactly the number of observations computed for it
assert len(df_train) == train_size
assert len(df_test) == test_size
assert len(df_validation) == validation_size

In [17]:
# cleaned file names
fname_train = "merged_data/denver_weather_train.csv"
fname_test = "merged_data/denver_weather_test.csv"
fname_validation = "merged_data/denver_weather_validation.csv"

# save files
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise a fresh checkout fails on this final cell
for split_frame, split_path in (
    (df_train, fname_train),
    (df_validation, fname_validation),
    (df_test, fname_test),
):
  Path(split_path).parent.mkdir(parents=True, exist_ok=True)
  # index=False: the positional index carries no information worth persisting
  split_frame.to_csv(split_path, index=False)