In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# relative path of the raw weather export
fname_raw = "raw_data/DEN_weather_raw.csv"

# load the CSV into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=fname_raw)

# preview the first rows
df.head()

Unnamed: 0,STATION,NAME,DATE,AWND,FMTM,PGTM,PRCP,PSUN,SNOW,SNWD,...,WT11,WT13,WT14,WT15,WT16,WT17,WT18,WT19,WT21,WT22
0,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",2005-01-01,9.4,10.0,1048.0,0.0,,,,...,,,,,,,,,,
1,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",2005-01-02,5.59,326.0,325.0,0.0,,,,...,,,,,,,,,,
2,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",2005-01-03,4.92,1519.0,1523.0,0.0,,,,...,,1.0,,,,,,,,1.0
3,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",2005-01-04,8.5,1031.0,1031.0,0.02,,,,...,,1.0,,,,,1.0,,,1.0
4,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",2005-01-05,4.7,2348.0,50.0,0.1,,,1.0,...,,1.0,,,,,1.0,,,1.0


### The index is currently the standard RangeIndex. We want to make it such that the date acts as the index for each data point.

In [3]:
# promote the DATE column to the index ...
df = df.set_index("DATE")

# ... and parse its string labels into datetime objects
df.index = pd.to_datetime(df.index)

df.head()

Unnamed: 0_level_0,STATION,NAME,AWND,FMTM,PGTM,PRCP,PSUN,SNOW,SNWD,TAVG,...,WT11,WT13,WT14,WT15,WT16,WT17,WT18,WT19,WT21,WT22
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-01,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",9.4,10.0,1048.0,0.0,,,,32.0,...,,,,,,,,,,
2005-01-02,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",5.59,326.0,325.0,0.0,,,,27.0,...,,,,,,,,,,
2005-01-03,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",4.92,1519.0,1523.0,0.0,,,,30.0,...,,1.0,,,,,,,,1.0
2005-01-04,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",8.5,1031.0,1031.0,0.02,,,,16.0,...,,1.0,,,,,1.0,,,1.0
2005-01-05,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",4.7,2348.0,50.0,0.1,,,1.0,2.0,...,,1.0,,,,,1.0,,,1.0


### We see that there are a lot of NaN values in the dataset. Let's see how many missing values there are in comparison to the number of observations.

In [4]:
# how many rows the frame holds
total_observations = len(df)
print(f'Number of observations: {total_observations}')

# per-column tally of missing entries, labelled for display
nan_values = df.isna().sum().rename("NaN count")

print('NaN values in each column:')

# show the tally ordered from fewest to most missing values
nan_values.sort_values()

Number of observations: 7370
NaN values in each column:


Unnamed: 0,NaN count
STATION,0
NAME,0
PRCP,185
TMIN,245
TMAX,245
WDF2,246
WSF2,246
AWND,247
WSF5,258
WDF5,258


### There is a significant jump in NaN values between the SNOW and TAVG columns. Therefore, we will keep every column whose NaN count is less than or equal to that of the SNOW column. First we create a list of all the columns we will accept.

In [5]:
# names of the columns whose NaN count does not exceed that of the SNOW column
acceptable_columns = [
    column
    for column, nan_count in nan_values.items()
    if nan_count <= nan_values["SNOW"]
]

acceptable_columns

['STATION',
 'NAME',
 'AWND',
 'PRCP',
 'SNOW',
 'SNWD',
 'TMAX',
 'TMIN',
 'WDF2',
 'WDF5',
 'WSF2',
 'WSF5']

### Now that we know which columns are acceptable, we can remove all other columns.

In [6]:
# keep only the columns with a tolerable number of NaN values (as an independent copy)
df = df.loc[:, acceptable_columns].copy()

df

Unnamed: 0_level_0,STATION,NAME,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2005-01-01,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",9.40,0.00,,,43.0,20.0,160.0,20.0,16.1,17.0
2005-01-02,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",5.59,0.00,,,33.0,20.0,170.0,170.0,16.1,17.0
2005-01-03,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",4.92,0.00,,,41.0,18.0,60.0,60.0,13.0,14.1
2005-01-04,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",8.50,0.02,,,26.0,5.0,20.0,20.0,17.9,21.0
2005-01-05,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",4.70,0.10,,1.0,6.0,-3.0,210.0,30.0,10.1,12.1
...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-02,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",10.51,0.00,0.0,0.0,51.0,28.0,300.0,290.0,29.1,36.9
2025-04-03,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",7.16,0.13,1.0,0.0,40.0,27.0,100.0,100.0,21.9,28.0
2025-04-04,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",11.63,0.00,0.1,1.2,38.0,27.0,30.0,30.0,25.1,32.0
2025-04-05,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",,,,,,,,,,


### It looks like measuring devices for certain values such as SNOW and SNWD were not yet implemented when the dataset begins on January 1, 2005. We can try to remove any data from before these measuring devices were implemented. Let's find when the first valid measurement was recorded for each column.

In [7]:
# map every retained column to the index label of its first non-NaN entry
first_valid_entry = pd.Series(
    {col: df[col].first_valid_index() for col in acceptable_columns},
    name='First Valid Entry',
)

first_valid_entry.sort_values()

Unnamed: 0,First Valid Entry
STATION,2005-01-01
NAME,2005-01-01
AWND,2005-01-01
PRCP,2005-01-01
TMAX,2005-01-01
TMIN,2005-01-01
WDF2,2005-01-01
WDF5,2005-01-01
WSF2,2005-01-01
WSF5,2005-01-01


### The SNOW column is the last column to receive a valid measurement value. Its first proper measurement doesn't come until February 1, 2006, which accounts for most of the NaN values in this column. Therefore, we will remove all data points occurring before this date.

In [8]:
# index label at which SNOW holds its first valid measurement
t0 = first_valid_entry["SNOW"]

# keep only the observations dated t0 or later
df = df.loc[df.index >= t0]

df

Unnamed: 0_level_0,STATION,NAME,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2006-02-01,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",8.05,0.00,0.0,0.0,53.0,22.0,20.0,30.0,21.0,25.1
2006-02-02,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",12.30,0.00,0.0,0.0,47.0,23.0,310.0,310.0,25.9,30.0
2006-02-03,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",10.96,0.00,0.0,0.0,44.0,22.0,340.0,340.0,36.9,46.1
2006-02-04,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",9.40,0.00,0.0,0.0,53.0,16.0,140.0,140.0,21.0,23.9
2006-02-05,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",16.78,0.00,0.0,0.0,46.0,19.0,10.0,10.0,36.0,42.9
...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-02,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",10.51,0.00,0.0,0.0,51.0,28.0,300.0,290.0,29.1,36.9
2025-04-03,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",7.16,0.13,1.0,0.0,40.0,27.0,100.0,100.0,21.9,28.0
2025-04-04,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",11.63,0.00,0.1,1.2,38.0,27.0,30.0,30.0,25.1,32.0
2025-04-05,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",,,,,,,,,,


### We notice that the rows for April 5, 2025 and April 6, 2025 carry no information (all measurement values are NaN). Given that these are the two most recent dates, it is reasonable to assume that the data has not been updated yet. We will simply remove rows like these.

In [9]:
# all rows whose values are exclusively NaN (aside from STATION and NAME);
# the two identifier columns are excluded by name rather than by position,
# so the check does not depend on the column order of the frame
nan_rows = df[df.drop(columns=["STATION", "NAME"]).isna().all(axis=1)]

nan_rows

Unnamed: 0_level_0,STATION,NAME,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2025-04-05,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",,,,,,,,,,
2025-04-06,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",,,,,,,,,,


In [10]:
# discard the rows identified as containing only NaN measurements
df = df.drop(index=nan_rows.index)

df

Unnamed: 0_level_0,STATION,NAME,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2006-02-01,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",8.05,0.00,0.0,0.0,53.0,22.0,20.0,30.0,21.0,25.1
2006-02-02,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",12.30,0.00,0.0,0.0,47.0,23.0,310.0,310.0,25.9,30.0
2006-02-03,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",10.96,0.00,0.0,0.0,44.0,22.0,340.0,340.0,36.9,46.1
2006-02-04,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",9.40,0.00,0.0,0.0,53.0,16.0,140.0,140.0,21.0,23.9
2006-02-05,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",16.78,0.00,0.0,0.0,46.0,19.0,10.0,10.0,36.0,42.9
...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-31,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",8.95,0.00,0.0,0.0,62.0,31.0,150.0,170.0,21.9,30.0
2025-04-01,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",10.51,0.08,0.3,0.0,52.0,29.0,10.0,350.0,30.0,38.0
2025-04-02,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",10.51,0.00,0.0,0.0,51.0,28.0,300.0,290.0,29.1,36.9
2025-04-03,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",7.16,0.13,1.0,0.0,40.0,27.0,100.0,100.0,21.9,28.0


### Now let's see how many missing values we have after removing all those columns and datapoints.

In [11]:
# recount the missing entries per column on the trimmed frame
nan_values = df.isna().sum().rename("NaN count")
print('Null values in each column:')

# list the counts from fewest to most missing values
nan_values.sort_values()

Null values in each column:


Unnamed: 0,NaN count
STATION,0
NAME,0
SNWD,30
SNOW,30
PRCP,183
TMAX,243
TMIN,243
WDF2,244
WSF2,244
AWND,245


### Next we will fill in the remaining NaN values. Forward filling makes sense in this context, since the weather typically doesn't change dramatically from one day to the next. We will fill each NaN value with the most recent previous valid value.

In [12]:
# carry the last observed value forward into each gap
df = df.ffill()

# check how many missing entries remain in each column
nan_values_ffill = df.isna().sum().rename("NaN count")
print('Null values in each column after ffill():')
nan_values_ffill

Null values in each column after ffill():


Unnamed: 0,NaN count
STATION,0
NAME,0
AWND,0
PRCP,0
SNOW,0
SNWD,0
TMAX,0
TMIN,0
WDF2,0
WDF5,0


### Now we create a target column. We want to predict the next day's high temperature, so we will create a column called TARGET which will be the next day's high.

In [13]:
# TARGET holds the TMAX value of the following row; the last row gets NaN
# NOTE(review): shift(-1) moves by one row, not by one calendar day. If the index
# skips dates, TARGET is then not the next day's high. Confirm the index has no gaps.
df = df.assign(TARGET=df["TMAX"].shift(periods=-1))
df.head()

Unnamed: 0_level_0,STATION,NAME,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2006-02-01,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",8.05,0.0,0.0,0.0,53.0,22.0,20.0,30.0,21.0,25.1,47.0
2006-02-02,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",12.3,0.0,0.0,0.0,47.0,23.0,310.0,310.0,25.9,30.0,44.0
2006-02-03,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",10.96,0.0,0.0,0.0,44.0,22.0,340.0,340.0,36.9,46.1,53.0
2006-02-04,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",9.4,0.0,0.0,0.0,53.0,16.0,140.0,140.0,21.0,23.9,46.0
2006-02-05,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",16.78,0.0,0.0,0.0,46.0,19.0,10.0,10.0,36.0,42.9,45.0


### Because the final row does not have a target, we will remove it from the data set.

In [14]:
# remove the rows that have no target (the final row, whose shifted TMAX is NaN).
# Selecting on a missing TARGET rather than slicing by position keeps this cell
# safe to re-run: slicing off the last row would discard one more valid row per run.
df = df.dropna(subset=["TARGET"])
df.head()

Unnamed: 0_level_0,STATION,NAME,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2006-02-01,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",8.05,0.0,0.0,0.0,53.0,22.0,20.0,30.0,21.0,25.1,47.0
2006-02-02,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",12.3,0.0,0.0,0.0,47.0,23.0,310.0,310.0,25.9,30.0,44.0
2006-02-03,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",10.96,0.0,0.0,0.0,44.0,22.0,340.0,340.0,36.9,46.1,53.0
2006-02-04,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",9.4,0.0,0.0,0.0,53.0,16.0,140.0,140.0,21.0,23.9,46.0
2006-02-05,USW00003017,"DENVER INTERNATIONAL AIRPORT, CO US",16.78,0.0,0.0,0.0,46.0,19.0,10.0,10.0,36.0,42.9,45.0


### Now we can export the dataframe as a csv file.

In [15]:
# cleaned file name (same as the raw file name, with 'raw' changed to 'clean')
fname_clean = "clean_data/DEN_weather_clean.csv"

# to_csv does not create missing folders, so make sure the output directory exists
Path(fname_clean).parent.mkdir(parents=True, exist_ok=True)

# write the cleaned frame, keeping the DATE index as the first column
df.to_csv(fname_clean, index=True)