In [2]:
from pathlib import Path

import pandas as pd

In [3]:
# Relative location of the raw weather export.
fname_raw = "raw_data/APA_weather_raw.csv"

# Read the raw CSV into a DataFrame using pandas' default parsing.
df = pd.read_csv(filepath_or_buffer=fname_raw)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,STATION,NAME,DATE,AWND,FMTM,PGTM,PRCP,SNOW,SNWD,TAVG,...,WT01,WT02,WT03,WT04,WT05,WT06,WT07,WT08,WT09,WT10
0,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",2005-01-01,7.61,1056.0,1002.0,0.0,,,37.0,...,,,,,,,,,,
1,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",2005-01-02,5.14,655.0,654.0,0.0,,,28.0,...,,,,,,,,,,
2,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",2005-01-03,4.47,1519.0,1520.0,0.0,,,33.0,...,,,,,,,,,,
3,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",2005-01-04,7.16,1103.0,1103.0,0.0,,,17.0,...,,,,,,,,,,
4,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",2005-01-05,3.8,458.0,249.0,0.0,,,4.0,...,,,,,,,,,,


### The index is currently the standard RangeIndex. We want to make it such that the date acts as the index for each data point.

In [4]:
# Promote DATE to the index. Reassigning the result (rather than using
# inplace=True) keeps the step chainable and avoids mutating the frame object
# that the previous cell displayed.
df = df.set_index("DATE")

# Convert the index labels from strings to datetimes.
df.index = pd.to_datetime(df.index)

# The forward-fill and shift(-1) steps further down operate on row order, so
# make chronological order with one row per date explicit here.
df = df.sort_index()
assert df.index.is_unique, "DATE index contains duplicate dates"

df.head()

Unnamed: 0_level_0,STATION,NAME,AWND,FMTM,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,...,WT01,WT02,WT03,WT04,WT05,WT06,WT07,WT08,WT09,WT10
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-01,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",7.61,1056.0,1002.0,0.0,,,37.0,51.0,...,,,,,,,,,,
2005-01-02,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",5.14,655.0,654.0,0.0,,,28.0,34.0,...,,,,,,,,,,
2005-01-03,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",4.47,1519.0,1520.0,0.0,,,33.0,49.0,...,,,,,,,,,,
2005-01-04,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",7.16,1103.0,1103.0,0.0,,,17.0,28.0,...,,,,,,,,,,
2005-01-05,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",3.8,458.0,249.0,0.0,,,4.0,7.0,...,,,,,,,,,,


### We see that there are a lot of NaN values in the dataset. Let's see how many missing values there are in comparison to the number of observations.

In [5]:
# Row count of the frame, used as the baseline for the missing-value counts.
total_observations = len(df)
print(f'Number of observations: {total_observations}')

# Per-column tally of missing entries, labelled "NaN count".
nan_values = df.isna().sum().rename("NaN count")

print(f'NaN values in each column:')

# Display the tallies in ascending order.
nan_values.sort_values(ascending=True)

Number of observations: 7398
NaN values in each column:


STATION       0
NAME          0
TMIN          1
WSF2          2
PRCP          2
WDF2          2
TMAX          5
AWND          8
WSF5         14
WDF5         14
PGTM       4998
SNWD       5551
SNOW       5588
WT01       5738
WT03       6357
WT08       6731
FMTM       7033
WT02       7118
TAVG       7186
WT05       7323
TSUN       7350
WT06       7351
WT09       7367
WT07       7391
WT04       7392
WT10       7394
Name: NaN count, dtype: int64

### There is a significant jump in NaN counts between the WSF5 and PGTM columns. Therefore, we will keep every column whose NaN count is less than or equal to that of the WSF5 column. First we create a list of all the columns we will accept.

In [6]:
# The NaN count of WSF5 is the largest count we are willing to accept.
max_nan_count = nan_values["WSF5"]

# Names of the columns whose NaN count does not exceed that ceiling,
# in the same order as they appear in nan_values.
acceptable_columns = [
    column for column, nan_count in nan_values.items()
    if nan_count <= max_nan_count
]

acceptable_columns

['STATION',
 'NAME',
 'AWND',
 'PRCP',
 'TMAX',
 'TMIN',
 'WDF2',
 'WDF5',
 'WSF2',
 'WSF5']

### Now that we know which columns are acceptable, we can remove all other columns.

In [7]:
# Keep only the low-NaN columns; take a copy so later edits act on an
# independent frame.
df = df.loc[:, acceptable_columns].copy()

df

Unnamed: 0_level_0,STATION,NAME,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2005-01-01,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",7.61,0.00,51.0,22.0,260.0,250.0,21.0,29.1
2005-01-02,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",5.14,0.00,34.0,21.0,160.0,160.0,14.1,15.0
2005-01-03,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",4.47,0.00,49.0,17.0,50.0,50.0,14.1,16.1
2005-01-04,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",7.16,0.00,28.0,6.0,340.0,340.0,19.9,21.0
2005-01-05,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",3.80,0.00,7.0,1.0,10.0,20.0,8.9,10.1
...,...,...,...,...,...,...,...,...,...,...
2025-03-31,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",8.05,0.00,61.0,31.0,160.0,200.0,23.0,35.1
2025-04-01,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",6.71,0.08,51.0,31.0,40.0,30.0,21.9,31.1
2025-04-02,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",10.29,0.01,50.0,30.0,330.0,350.0,25.9,36.0
2025-04-03,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",5.59,0.12,39.0,26.0,270.0,260.0,16.1,21.9


### Let's check to make sure that no row is completely filled with NaN

In [8]:
# Rows in which every measurement is NaN. STATION and NAME are identifier
# columns, so they are excluded by label rather than by position; the check
# then stays correct even if the column order changes.
measurements = df.drop(columns=["STATION", "NAME"])
nan_rows = df[measurements.isna().all(axis=1)]

nan_rows

Unnamed: 0_level_0,STATION,NAME,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [9]:
# Recount the missing entries per column now that the sparse columns are gone.
nan_values = df.isna().sum().rename("NaN count")

print(f'Null values in each column:')

# Display the tallies in ascending order of NaN count.
nan_values.sort_values(ascending=True)

Null values in each column:


STATION     0
NAME        0
TMIN        1
PRCP        2
WDF2        2
WSF2        2
TMAX        5
AWND        8
WDF5       14
WSF5       14
Name: NaN count, dtype: int64

### Next we will fill in the remaining NaN values. Filling makes sense in this context because the weather typically doesn't change dramatically between consecutive days. We will fill each NaN value using the most recent previous value in the same column (a forward fill).

In [10]:
# Forward-fill: each missing entry takes the most recent earlier value in its column.
df = df.ffill()

# Recount the missing entries per column to confirm the fill.
nan_values_ffill = df.isna().sum().rename("NaN count")
print(f'Null values in each column after ffill():')
nan_values_ffill

Null values in each column after ffill():


STATION    0
NAME       0
AWND       0
PRCP       0
TMAX       0
TMIN       0
WDF2       0
WDF5       0
WSF2       0
WSF5       0
Name: NaN count, dtype: int64

### Now we create a target column. We want to predict the next day's high temperature, so we will create a column called TARGET which will be the next day's high.

In [11]:
# TARGET for a row is the TMAX value of the row that follows it;
# the last row has no successor, so its TARGET is NaN.
next_day_high = df["TMAX"].shift(periods=-1)
df["TARGET"] = next_day_high
df.head(n=5)

Unnamed: 0_level_0,STATION,NAME,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2005-01-01,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",7.61,0.0,51.0,22.0,260.0,250.0,21.0,29.1,34.0
2005-01-02,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",5.14,0.0,34.0,21.0,160.0,160.0,14.1,15.0,49.0
2005-01-03,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",4.47,0.0,49.0,17.0,50.0,50.0,14.1,16.1,28.0
2005-01-04,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",7.16,0.0,28.0,6.0,340.0,340.0,19.9,21.0,7.0
2005-01-05,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",3.8,0.0,7.0,1.0,10.0,20.0,8.9,10.1,41.0


### Because the final row does not have a target, we will remove it from the data set.

In [12]:
# Drop the rows that have no TARGET (normally just the final row, whose
# next-day high is unknown). Selecting by missing TARGET instead of by
# position makes the cell safe to re-run: a second execution removes
# nothing further, whereas a positional slice would drop another valid row.
df = df.dropna(subset=["TARGET"])
df.head()

Unnamed: 0_level_0,STATION,NAME,AWND,PRCP,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,TARGET
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2005-01-01,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",7.61,0.0,51.0,22.0,260.0,250.0,21.0,29.1,34.0
2005-01-02,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",5.14,0.0,34.0,21.0,160.0,160.0,14.1,15.0,49.0
2005-01-03,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",4.47,0.0,49.0,17.0,50.0,50.0,14.1,16.1,28.0
2005-01-04,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",7.16,0.0,28.0,6.0,340.0,340.0,19.9,21.0,7.0
2005-01-05,USW00093067,"DENVER CENTENNIAL AIRPORT, CO US",3.8,0.0,7.0,1.0,10.0,20.0,8.9,10.1,41.0


### Now we can export the dataframe as a csv file.

In [13]:
# cleaned file name
fname_clean = "clean_data/APA_weather_clean.csv"

# to_csv does not create missing folders, so make sure the output directory
# exists before writing (a no-op when it is already there).
Path(fname_clean).parent.mkdir(parents=True, exist_ok=True)

# index=True writes the DATE index as the first column of the CSV
df.to_csv(fname_clean, index=True)