In [2]:
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

In [3]:
# Raw-content address of the monthly weather report CSV.
url = "https://raw.githubusercontent.com/hastighsh/Ellehacks_2024/main/Monthly_weather_report.csv"

# Field separator used by that file.
delimiter = ','

# Load the CSV into a DataFrame (`sep` is the primary name of the
# `delimiter` keyword, so parsing is unchanged).
weather = pd.read_csv(url, sep=delimiter)

# Keep an independent copy of the freshly loaded frame as a backup.
weather_backup = weather.copy()

In [4]:
# Report the frame's dimensions as a (rows, columns) tuple.
print(weather.shape)

(577927, 8)


In [5]:
# Grand total of NaN cells across the whole frame.
missing_values_count = weather.isna().to_numpy().sum()

print(f"Total number of missing values: {missing_values_count}")

# Per-column count of cells holding the literal string '?'.
# Note this counts '?' placeholders only; NaN cells are not included here.
missing_values_by_feature = weather.eq('?').sum()
print(missing_values_by_feature)

Total number of missing values: 838749
Unnamed: 0.1                   0
Unnamed: 0                     0
id                             0
Time                           0
Monthly Average Temperature    0
Monthly Maximum Temperature    0
Monthly Minimum Temperature    0
Monthly Total Precepitation    0
dtype: int64


In [7]:
# Per-column tally of NaN entries in the full frame
# (isna performs the same check as isnull).
missing_values = weather.isna().sum()

# Header and counts emitted by one call; sep="\n" keeps them on separate lines.
print("Missing values in each column:", missing_values, sep="\n")


Missing values in each column:
Unnamed: 0.1                        0
Unnamed: 0                          0
id                                  0
Time                                0
Monthly Average Temperature    293126
Monthly Maximum Temperature    159465
Monthly Minimum Temperature    159816
Monthly Total Precepitation    226342
dtype: int64


In [8]:
# Show the loaded frame with the notebook's rich display.
display(weather)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,Time,Monthly Average Temperature,Monthly Maximum Temperature,Monthly Minimum Temperature,Monthly Total Precepitation
0,0,0,70326,1942-02-01,1.0,,,42.6
1,1,1,70326,1942-03-01,-5.0,-0.4,-12.4,12.6
2,2,2,70326,1942-04-01,3.0,6.6,-2.2,35.6
3,3,3,70326,1942-05-01,9.3,14.2,2.5,28.8
4,4,4,70326,1942-06-01,10.8,15.6,4.9,65.8
...,...,...,...,...,...,...,...,...
577922,577922,577922,PABL0,2011-12-01,,,,
577923,577923,577923,PABL0,2012-01-01,,,,
577924,577924,577924,PABL0,2012-02-01,,,,
577925,577925,577925,PABL0,2012-03-01,,,,


In [11]:
# Turn the 'Time' column into datetimes so the .dt accessor can be used on it.
weather['Time'] = pd.to_datetime(weather['Time'])

# Keep only the rows whose year lies in 2000-2015, both endpoints included.
weather_filtered = weather[weather['Time'].dt.year.between(2000, 2015)]

In [12]:
# Show the rows that survived the 2000-2015 year filter.
display(weather_filtered)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,Time,Monthly Average Temperature,Monthly Maximum Temperature,Monthly Minimum Temperature,Monthly Total Precepitation
695,695,695,70326,2000-01-01,-15.3,-10.5,-20.1,29.0
696,696,696,70326,2000-02-01,-0.7,3.0,-4.3,20.0
697,697,697,70326,2000-03-01,-0.7,2.7,-4.1,9.0
698,698,698,70326,2000-04-01,1.8,6.8,-3.1,17.0
699,699,699,70326,2000-05-01,6.0,11.9,0.2,32.0
...,...,...,...,...,...,...,...,...
577922,577922,577922,PABL0,2011-12-01,,,,
577923,577923,577923,PABL0,2012-01-01,,,,
577924,577924,577924,PABL0,2012-02-01,,,,
577925,577925,577925,PABL0,2012-03-01,,,,


In [13]:
# Per-column tally of NaN entries in the year-filtered frame
# (isna performs the same check as isnull).
missing_values = weather_filtered.isna().sum()

# Header and counts emitted by one call; sep="\n" keeps them on separate lines.
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
Unnamed: 0.1                       0
Unnamed: 0                         0
id                                 0
Time                               0
Monthly Average Temperature    68171
Monthly Maximum Temperature    56605
Monthly Minimum Temperature    56663
Monthly Total Precepitation    84111
dtype: int64


In [14]:
# Work on an independent copy of the filtered rows, so the column assignments
# made to `df` later do not write into a slice of `weather`.
df = weather_filtered.copy()

In [17]:
# Convert 'Time' column to datetime (harmless if it already is one)
df['Time'] = pd.to_datetime(df['Time'])

# Extract year from 'Time' column; it is the grouping key for the imputation
df['Year'] = df['Time'].dt.year

# Measurement columns to impute, named explicitly. A positional slice such as
# df.columns[4:] depends on column order and would also include the helper
# 'Year' column added above.
value_columns = [
    'Monthly Average Temperature',
    'Monthly Maximum Temperature',
    'Monthly Minimum Temperature',
    'Monthly Total Precepitation',
]

# Group by year and calculate the mean of every numeric column
yearly_means = df.groupby('Year').mean(numeric_only=True)


def fill_missing_with_year_mean(row):
    """Fill one record's missing measurements with that year's mean.

    Parameters
    ----------
    row : pandas.Series
        A single row of ``df``. It must contain 'Year' and every column
        named in ``value_columns``.

    Returns
    -------
    pandas.Series
        The same ``row`` object, modified in place: each null entry in a
        measurement column is replaced by ``yearly_means.loc[year, column]``.

    Raises
    ------
    KeyError
        If a value needs filling and the row's year is not in ``yearly_means``.
    """
    year = row['Year']
    for col in value_columns:
        if pd.isnull(row[col]):
            row[col] = yearly_means.loc[year, col]
    return row


# Fill every gap column by column: map each row's year to that year's mean and
# use it only where the measurement is missing. This produces the same values as
# applying fill_missing_with_year_mean to each row, without a Python-level call
# per row. Dropping 'Year' first returns a new frame, so `df` keeps the column.
df_filled = df.drop(columns='Year')
for col in value_columns:
    df_filled[col] = df[col].fillna(df['Year'].map(yearly_means[col]))

print("DataFrame with Missing Values Filled:")
print(df_filled)

DataFrame with Missing Values Filled:
        Unnamed: 0.1  Unnamed: 0     id       Time  \
695              695         695  70326 2000-01-01   
696              696         696  70326 2000-02-01   
697              697         697  70326 2000-03-01   
698              698         698  70326 2000-04-01   
699              699         699  70326 2000-05-01   
...              ...         ...    ...        ...   
577922        577922      577922  PABL0 2011-12-01   
577923        577923      577923  PABL0 2012-01-01   
577924        577924      577924  PABL0 2012-02-01   
577925        577925      577925  PABL0 2012-03-01   
577926        577926      577926  PABL0 2012-04-01   

        Monthly Average Temperature  Monthly Maximum Temperature  \
695                      -15.300000                   -10.500000   
696                       -0.700000                     3.000000   
697                       -0.700000                     2.700000   
698                        1.800000      

In [18]:
# Per-column tally of NaN entries remaining after the fill
# (isna performs the same check as isnull).
missing_values = df_filled.isna().sum()

# Header and counts emitted by one call; sep="\n" keeps them on separate lines.
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
Unnamed: 0.1                   0
Unnamed: 0                     0
id                             0
Time                           0
Monthly Average Temperature    0
Monthly Maximum Temperature    0
Monthly Minimum Temperature    0
Monthly Total Precepitation    0
dtype: int64


In [19]:
# Show the frame after missing measurements were filled.
display(df_filled)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,Time,Monthly Average Temperature,Monthly Maximum Temperature,Monthly Minimum Temperature,Monthly Total Precepitation
695,695,695,70326,2000-01-01,-15.300000,-10.500000,-20.100000,29.000000
696,696,696,70326,2000-02-01,-0.700000,3.000000,-4.300000,20.000000
697,697,697,70326,2000-03-01,-0.700000,2.700000,-4.100000,9.000000
698,698,698,70326,2000-04-01,1.800000,6.800000,-3.100000,17.000000
699,699,699,70326,2000-05-01,6.000000,11.900000,0.200000,32.000000
...,...,...,...,...,...,...,...,...
577922,577922,577922,PABL0,2011-12-01,12.856701,18.307096,7.080456,76.548122
577923,577923,577923,PABL0,2012-01-01,13.968208,19.541884,7.992203,69.516697
577924,577924,577924,PABL0,2012-02-01,13.968208,19.541884,7.992203,69.516697
577925,577925,577925,PABL0,2012-03-01,13.968208,19.541884,7.992203,69.516697


In [20]:
# List the columns of `df` (not `df_filled`); `df` still carries the 'Year'
# helper column that was added before the fill.
print(df.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'Time',
       'Monthly Average Temperature', 'Monthly Maximum Temperature',
       'Monthly Minimum Temperature', 'Monthly Total Precepitation', 'Year'],
      dtype='object')


In [27]:
# Drop the two 'Unnamed' columns; in the rows displayed so far they only
# repeat the row index.
columns_to_drop = ['Unnamed: 0.1', 'Unnamed: 0']

# Rebind instead of dropping in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError.
df_filled = df_filled.drop(columns=columns_to_drop, errors='ignore')

In [28]:
# Show df_filled again now that the two 'Unnamed' columns have been dropped.
display(df_filled)

Unnamed: 0,id,Time,Monthly Average Temperature,Monthly Maximum Temperature,Monthly Minimum Temperature,Monthly Total Precepitation
695,70326,2000-01-01,-15.300000,-10.500000,-20.100000,29.000000
696,70326,2000-02-01,-0.700000,3.000000,-4.300000,20.000000
697,70326,2000-03-01,-0.700000,2.700000,-4.100000,9.000000
698,70326,2000-04-01,1.800000,6.800000,-3.100000,17.000000
699,70326,2000-05-01,6.000000,11.900000,0.200000,32.000000
...,...,...,...,...,...,...
577922,PABL0,2011-12-01,12.856701,18.307096,7.080456,76.548122
577923,PABL0,2012-01-01,13.968208,19.541884,7.992203,69.516697
577924,PABL0,2012-02-01,13.968208,19.541884,7.992203,69.516697
577925,PABL0,2012-03-01,13.968208,19.541884,7.992203,69.516697
