In [112]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Seed NumPy's global random number generator up front so that any later
# random operation gives the same result on every run.
np.random.seed(0)

# Read the daily weather summary. low_memory=False turns off pandas'
# chunked parsing of the file.
weather_data = pd.read_csv("data/Summary_of_Weather.csv", low_memory=False)

# Read the companion table describing the weather stations.
stations_data = pd.read_csv("data/Weather_Station_Locations.csv", low_memory=False)

# Preview the first five rows of the weather summary.
weather_data.head(5)


Unnamed: 0,STA,Date,Precip,WindGustSpd,MaxTemp,MinTemp,MeanTemp,Snowfall,PoorWeather,YR,...,FB,FTI,ITH,PGT,TSHDSBRSGF,SD3,RHX,RHN,RVG,WTE
0,10001,1942-7-1,1.016,,25.555556,22.222222,23.888889,0,,42,...,,,,,,,,,,
1,10001,1942-7-2,0.0,,28.888889,21.666667,25.555556,0,,42,...,,,,,,,,,,
2,10001,1942-7-3,2.54,,26.111111,22.222222,24.444444,0,,42,...,,,,,,,,,,
3,10001,1942-7-4,2.54,,26.666667,22.222222,24.444444,0,,42,...,,,,,,,,,,
4,10001,1942-7-5,0.0,,26.666667,21.666667,24.444444,0,,42,...,,,,,,,,,,


In [113]:
# Count the NaN entries column by column; the result is a Series indexed by
# column name (isna is the same check as isnull).
missing_values_count = weather_data.isna().sum()

# Show the counts for the first ten columns only.
missing_values_count.head(10)


STA                 0
Date                0
Precip              0
WindGustSpd    118508
MaxTemp             0
MinTemp             0
MeanTemp            0
Snowfall         1163
PoorWeather     84803
YR                  0
dtype: int64

In [114]:
# Total number of cells in the dataset (rows x columns).
# DataFrame.size is used instead of np.product(weather_data.shape):
# np.product was deprecated in NumPy 1.25 and removed in NumPy 2.0, so the
# old call fails with AttributeError on current NumPy.
total_cells = weather_data.size

# Total number of missing values, summed over the per-column counts.
total_missing = missing_values_count.sum()

# Share of all cells that are missing, expressed as a percentage.
percent_missing = (total_missing / total_cells) * 100
print(percent_missing)


49.70351521852237


In [115]:
# Look up the missing-value count of a single column by its label.
missing_values_count.loc['Snowfall']

1163

In [116]:
# Keep only the rows in which every column has a value: any row holding at
# least one NaN is discarded. The original frame is left untouched.
weather_data_cleaned_rows = weather_data.dropna(axis="index", how="any")
weather_data_cleaned_rows

Unnamed: 0,STA,Date,Precip,WindGustSpd,MaxTemp,MinTemp,MeanTemp,Snowfall,PoorWeather,YR,...,FB,FTI,ITH,PGT,TSHDSBRSGF,SD3,RHX,RHN,RVG,WTE


In [117]:
# Keep only the columns that are fully populated: any column holding at
# least one NaN is discarded. The original frame is left untouched.
weather_data_cleaned_columns = weather_data.dropna(axis="columns", how="any")
weather_data_cleaned_columns.head(5)

Unnamed: 0,STA,Date,Precip,MaxTemp,MinTemp,MeanTemp,YR,MO,DA
0,10001,1942-7-1,1.016,25.555556,22.222222,23.888889,42,7,1
1,10001,1942-7-2,0.0,28.888889,21.666667,25.555556,42,7,2
2,10001,1942-7-3,2.54,26.111111,22.222222,24.444444,42,7,3
3,10001,1942-7-4,2.54,26.666667,22.222222,24.444444,42,7,4
4,10001,1942-7-5,0.0,26.666667,21.666667,24.444444,42,7,5


In [118]:
# Report how many columns the frame had originally and how many are left
# once every column containing a missing value has been dropped.
print("Number of columns before removal: %d" % len(weather_data.columns))
print("Number of columns after removal: %d" % len(weather_data_cleaned_columns.columns))

Number of columns before removal: 31
Number of columns after removal: 9


In [119]:
# Replace every NaN in the frame with the constant 0 (returns a new frame).
weather_data_filled_zeros = weather_data.fillna(value=0)

# Verify the result: the per-column missing count should be 0 everywhere.
print("Missing values after filling with 0:")
print(weather_data_filled_zeros.isna().sum())


Missing values after filling with 0:
STA            0
Date           0
Precip         0
WindGustSpd    0
MaxTemp        0
MinTemp        0
MeanTemp       0
Snowfall       0
PoorWeather    0
YR             0
MO             0
DA             0
PRCP           0
DR             0
SPD            0
MAX            0
MIN            0
MEA            0
SNF            0
SND            0
FT             0
FB             0
FTI            0
ITH            0
PGT            0
TSHDSBRSGF     0
SD3            0
RHX            0
RHN            0
RVG            0
WTE            0
dtype: int64


In [120]:
# Take the first five rows of the MaxTemp..MinTemp column range. This is a
# label slice, so both end columns are included.
# .copy() makes the subset an independent frame: later cells assign NaN into
# it, and without the copy those assignments target a slice of weather_data,
# which can raise SettingWithCopyWarning or write through to the source data.
subset_weather_data = weather_data.loc[:, 'MaxTemp':'MinTemp'].head().copy()
subset_weather_data

Unnamed: 0,MaxTemp,MinTemp
0,25.555556,22.222222
1,28.888889,21.666667
2,26.111111,22.222222
3,26.666667,22.222222
4,26.666667,21.666667


In [121]:
# Introduce a NaN value in the first row for demonstration.
# Position (0, 0) is the first row of the first column of the subset (MaxTemp).
# The assignment modifies subset_weather_data in place, so cells that run
# after this one see the NaN.
subset_weather_data.iloc[0, 0] = np.nan
print("Original subset with missing values:")
print(subset_weather_data)

Original subset with missing values:
     MaxTemp    MinTemp
0        NaN  22.222222
1  28.888889  21.666667
2  26.111111  22.222222
3  26.666667  22.222222
4  26.666667  21.666667


In [122]:
# Replace every NaN in the subset with the constant 100. fillna returns a
# new frame, so subset_weather_data itself keeps its NaN.
subset_weather_data_filled_100 = subset_weather_data.fillna(value=100)

# Show the filled copy to confirm the NaN became 100.
print("Subset after filling missing values with 100:")
print(subset_weather_data_filled_100)


Subset after filling missing values with 100:
      MaxTemp    MinTemp
0  100.000000  22.222222
1   28.888889  21.666667
2   26.111111  22.222222
3   26.666667  22.222222
4   26.666667  21.666667


In [123]:
# Put one NaN in each column of the subset (row 0 of the first column,
# row 1 of the second) so the backfill has something to repair.
subset_weather_data.iloc[0, 0] = np.nan
subset_weather_data.iloc[1, 1] = np.nan
print("Original subset with missing values introduced:")
print(subset_weather_data)

# Backfill down the rows: each NaN takes the next valid value below it in
# the same column. The result is a new frame; the subset is not modified.
subset_weather_data_filled_bfill = subset_weather_data.bfill(axis="index")
print("Subset after filling missing values using backfill:")
print(subset_weather_data_filled_bfill)


Original subset with missing values introduced:
     MaxTemp    MinTemp
0        NaN  22.222222
1  28.888889        NaN
2  26.111111  22.222222
3  26.666667  22.222222
4  26.666667  21.666667
Subset after filling missing values using backfill:
     MaxTemp    MinTemp
0  28.888889  22.222222
1  28.888889  22.222222
2  26.111111  22.222222
3  26.666667  22.222222
4  26.666667  21.666667


In [124]:
# Build a second five-row demo frame over the MaxTemp..MinTemp columns.
# .copy() makes it independent of weather_data: the assignments below would
# otherwise target a slice of the source frame, which can raise
# SettingWithCopyWarning or write through to the original data.
subset_weather_data_2 = weather_data.loc[:, 'MaxTemp':'MinTemp'].head().copy()

# Introduce one NaN per column: row 0 of the first column (MaxTemp) and
# row 2 of the second column (MinTemp).
subset_weather_data_2.iloc[0, 0] = np.nan
subset_weather_data_2.iloc[2, 1] = np.nan
print("Original subset with missing values introduced:")
print(subset_weather_data_2)

# Fill missing values only in the 'MaxTemp' column with a specific value
# (999); the NaN in 'MinTemp' is deliberately left in place.
subset_weather_data_2['MaxTemp'] = subset_weather_data_2['MaxTemp'].fillna(999)
print("Subset after selectively filling 'MaxTemp' with 999:")
print(subset_weather_data_2)


Original subset with missing values introduced:
     MaxTemp    MinTemp
0        NaN  22.222222
1  28.888889  21.666667
2  26.111111        NaN
3  26.666667  22.222222
4  26.666667  21.666667
Subset after selectively filling 'MaxTemp' with 999:
      MaxTemp    MinTemp
0  999.000000  22.222222
1   28.888889  21.666667
2   26.111111        NaN
3   26.666667  22.222222
4   26.666667  21.666667
