In [1]:
import pandas as pd

In [2]:
# Relative location of the raw weather export (CSV).
fname_raw = "raw_data/central_park_weather_raw.csv"

# Read the CSV into a DataFrame; at this point DATE is still an ordinary column.
df = pd.read_csv(filepath_or_buffer=fname_raw)

# Show the first rows as a quick check of the load.
df.head()

Unnamed: 0,STATION,NAME,DATE,DASF,MDSF,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,TOBS,WT01,WT03,WT04,WT05,WT06,WT11
0,USW00023062,"DENVER CENTRAL PARK, CO US",2005-01-01,,,0.0,0.0,0.0,,58.0,21.0,21.0,,,,,,
1,USW00023062,"DENVER CENTRAL PARK, CO US",2005-01-02,,,0.0,0.0,0.0,,45.0,19.0,20.0,,,,,,
2,USW00023062,"DENVER CENTRAL PARK, CO US",2005-01-03,,,0.0,0.0,0.0,,32.0,15.0,15.0,,,,,,
3,USW00023062,"DENVER CENTRAL PARK, CO US",2005-01-04,,,0.0,0.0,0.0,,42.0,15.0,24.0,,,,,,
4,USW00023062,"DENVER CENTRAL PARK, CO US",2005-01-05,,,0.08,1.7,2.0,,24.0,2.0,2.0,,,,,,


### The index is currently the default RangeIndex. We want the date of each observation to serve as its index instead.

In [3]:
# Parse the DATE strings into datetimes, then promote that column to the index.
# Written as one chain that returns a new frame instead of mutating in place.
df = (
    df
    .assign(DATE=pd.to_datetime(df["DATE"]))
    .set_index("DATE")
)

df.head()

Unnamed: 0_level_0,STATION,NAME,DASF,MDSF,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,TOBS,WT01,WT03,WT04,WT05,WT06,WT11
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2005-01-01,USW00023062,"DENVER CENTRAL PARK, CO US",,,0.0,0.0,0.0,,58.0,21.0,21.0,,,,,,
2005-01-02,USW00023062,"DENVER CENTRAL PARK, CO US",,,0.0,0.0,0.0,,45.0,19.0,20.0,,,,,,
2005-01-03,USW00023062,"DENVER CENTRAL PARK, CO US",,,0.0,0.0,0.0,,32.0,15.0,15.0,,,,,,
2005-01-04,USW00023062,"DENVER CENTRAL PARK, CO US",,,0.0,0.0,0.0,,42.0,15.0,24.0,,,,,,
2005-01-05,USW00023062,"DENVER CENTRAL PARK, CO US",,,0.08,1.7,2.0,,24.0,2.0,2.0,,,,,,


### We see that there are a lot of NaN values in the dataset. Let's see how many missing values there are in comparison to the number of observations.

In [4]:
# Row count of the frame.
total_observations = len(df)
print(f'Number of observations: {total_observations}')

# Per-column tally of missing entries, kept as a Series named "NaN count".
nan_values = df.isna().sum().rename("NaN count")

print('NaN values in each column:')

# Display the tallies from fewest to most missing.
# sort_values() returns a new Series; nan_values itself keeps the column order.
nan_values.sort_values()

Number of observations: 7337
NaN values in each column:


STATION       0
NAME          0
PRCP          0
SNWD          2
SNOW          4
TMAX         17
TMIN         20
TOBS        805
WT03       6773
WT01       7124
WT05       7292
WT06       7293
WT04       7328
WT11       7334
DASF       7335
MDSF       7335
TAVG       7337
Name: NaN count, dtype: int64

### There is a significant jump in NaN values between the TMIN and TOBS columns. Therefore, we will keep only the columns whose NaN count is less than or equal to that of the TMIN column. First we create a list of all the columns we will accept.

In [5]:
# TMIN's NaN count is the cut-off: a column is kept when it has no more
# missing values than TMIN does.
nan_cutoff = nan_values["TMIN"]

# Collect the labels of the columns that pass, in their existing order.
acceptable_columns = [
    column for column, n_missing in nan_values.items()
    if n_missing <= nan_cutoff
]

acceptable_columns

['STATION', 'NAME', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN']

### Now that we know which columns are acceptable, we can remove all other columns.

In [6]:
# Restrict the frame to the accepted columns; copy() gives an independent
# frame rather than a selection tied to the original.
df = df.loc[:, acceptable_columns].copy()

df

Unnamed: 0_level_0,STATION,NAME,PRCP,SNOW,SNWD,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005-01-01,USW00023062,"DENVER CENTRAL PARK, CO US",0.00,0.0,0.0,58.0,21.0
2005-01-02,USW00023062,"DENVER CENTRAL PARK, CO US",0.00,0.0,0.0,45.0,19.0
2005-01-03,USW00023062,"DENVER CENTRAL PARK, CO US",0.00,0.0,0.0,32.0,15.0
2005-01-04,USW00023062,"DENVER CENTRAL PARK, CO US",0.00,0.0,0.0,42.0,15.0
2005-01-05,USW00023062,"DENVER CENTRAL PARK, CO US",0.08,1.7,2.0,24.0,2.0
...,...,...,...,...,...,...,...
2025-02-28,USW00023062,"DENVER CENTRAL PARK, CO US",0.00,0.0,0.0,63.0,26.0
2025-03-03,USW00023062,"DENVER CENTRAL PARK, CO US",0.00,0.0,0.0,65.0,31.0
2025-03-04,USW00023062,"DENVER CENTRAL PARK, CO US",0.11,0.0,0.0,63.0,34.0
2025-03-05,USW00023062,"DENVER CENTRAL PARK, CO US",0.00,0.0,0.0,45.0,22.0


### Let's check that no row consists entirely of NaN values (ignoring the STATION and NAME columns, which are always filled).

In [7]:
# Boolean mask: True where every column after the first two (STATION, NAME)
# is missing for that row.
all_missing = df.iloc[:, 2:].isna().all(axis="columns")

# Rows matching the mask; an empty frame means no such row exists.
nan_rows = df.loc[all_missing]

nan_rows

Unnamed: 0_level_0,STATION,NAME,PRCP,SNOW,SNWD,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


### Next we will fill in the remaining NaN values. In this context, it makes sense since the weather typically doesn't change dramatically between days. We will fill NaN values using the most recent previous value.

In [8]:
# Forward-fill: each missing entry takes the last value seen above it in the same column.
df = df.ffill()

# Recount the missing entries per column to confirm the fill left none behind.
nan_values_ffill = df.isna().sum().rename("NaN count")
print('Null values in each column after ffill():')
nan_values_ffill

Null values in each column after ffill():


STATION    0
NAME       0
PRCP       0
SNOW       0
SNWD       0
TMAX       0
TMIN       0
Name: NaN count, dtype: int64

### Now we create a target column. We want to predict the next day's high temperature, so we will create a column called TARGET which will be the next day's high.

In [9]:
# TARGET is the high temperature recorded on the following calendar day.
#
# The DATE index is not contiguous (the displayed frame goes from 2025-02-28
# straight to 2025-03-03), so a positional shift(-1) would pair such a row with
# the next *recorded* day rather than the next day. Shifting the index by one
# calendar day and letting the column assignment align on DATE keeps the
# pairing exact: rows whose following calendar day is not in the index receive
# NaN instead of a misaligned value.
# NOTE(review): alignment assumes each DATE appears only once - confirm.
df["TARGET"] = df["TMAX"].shift(-1, freq="D")
df.head()

Unnamed: 0_level_0,STATION,NAME,PRCP,SNOW,SNWD,TMAX,TMIN,TARGET
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-01-01,USW00023062,"DENVER CENTRAL PARK, CO US",0.0,0.0,0.0,58.0,21.0,45.0
2005-01-02,USW00023062,"DENVER CENTRAL PARK, CO US",0.0,0.0,0.0,45.0,19.0,32.0
2005-01-03,USW00023062,"DENVER CENTRAL PARK, CO US",0.0,0.0,0.0,32.0,15.0,42.0
2005-01-04,USW00023062,"DENVER CENTRAL PARK, CO US",0.0,0.0,0.0,42.0,15.0,24.0
2005-01-05,USW00023062,"DENVER CENTRAL PARK, CO US",0.08,1.7,2.0,24.0,2.0,6.0


### Because the final row does not have a target, we will remove it from the data set.

In [10]:
# Remove rows that have no TARGET, such as the final row, whose next-day high
# is unknown. Selecting by the missing value rather than by position keeps the
# cell safe to re-run: iloc[:-1] would trim one more valid row on every
# execution, whereas dropna leaves an already-clean frame unchanged.
df = df.dropna(subset=["TARGET"])
df.head()

Unnamed: 0_level_0,STATION,NAME,PRCP,SNOW,SNWD,TMAX,TMIN,TARGET
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-01-01,USW00023062,"DENVER CENTRAL PARK, CO US",0.0,0.0,0.0,58.0,21.0,45.0
2005-01-02,USW00023062,"DENVER CENTRAL PARK, CO US",0.0,0.0,0.0,45.0,19.0,32.0
2005-01-03,USW00023062,"DENVER CENTRAL PARK, CO US",0.0,0.0,0.0,32.0,15.0,42.0
2005-01-04,USW00023062,"DENVER CENTRAL PARK, CO US",0.0,0.0,0.0,42.0,15.0,24.0
2005-01-05,USW00023062,"DENVER CENTRAL PARK, CO US",0.08,1.7,2.0,24.0,2.0,6.0


### Now we can export the dataframe as a csv file.

In [11]:
# Destination of the cleaned data set.
fname_clean = "clean_data/central_park_weather_clean.csv"

# Write the frame to disk; the index is written as the first CSV column
# (to_csv's default, which the original spelled out as index=True).
df.to_csv(path_or_buf=fname_clean)