In [None]:
import warnings

import pandas as pd
from astral import moon
from imblearn.over_sampling import SMOTE
from joblib import dump
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the historical dataset that the model is trained on.
# NOTE(review): "/content/..." is an absolute path (presumably a Colab upload location) — confirm before running elsewhere.

df = pd.read_csv ("/content/prediction_data.csv")

df

Unnamed: 0,datetime,tempmax,tempmin,temp,dew,humidity,precip,precipprob,precipcover,preciptype,...,sealevelpressure,cloudcover,icon,flood occurrence,year,month,day,day_of_week,moon_phase,moon_phase_cat
0,2004-05-20,29.7,25.7,27.7,25.2,86.9,0.0,0.0,0.00,none,...,1012.9,90.0,partly-cloudy-day,0.0,2004,5,20,3,New Moon,1
1,2004-05-23,25.7,25.1,25.4,24.7,95.7,23.1,100.0,4.17,rain,...,1013.9,90.0,rain,0.0,2004,5,23,6,First Quarter,2
2,2004-05-25,31.6,27.1,29.2,25.3,79.7,0.0,0.0,0.00,none,...,1012.9,90.0,partly-cloudy-day,0.0,2004,5,25,1,First Quarter,2
3,2004-05-26,26.1,26.1,26.1,23.1,83.6,0.0,0.0,0.00,none,...,1010.8,90.0,partly-cloudy-day,0.0,2004,5,26,2,Full Moon,3
4,2004-05-30,30.1,25.1,27.1,25.1,89.2,0.0,0.0,0.00,none,...,1013.4,90.0,partly-cloudy-day,0.0,2004,5,30,6,Last Quarter,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6475,2024-06-29,31.0,23.9,27.1,24.7,87.1,12.0,100.0,4.17,rain,...,1012.7,61.8,rain,0.0,2024,6,29,5,New Moon,1
6476,2024-06-30,31.2,25.0,27.8,25.2,86.2,3.0,100.0,4.17,rain,...,1013.0,62.4,rain,0.0,2024,6,30,6,New Moon,1
6477,2024-07-01,32.0,25.0,27.9,24.9,84.6,6.9,100.0,50.00,rain,...,1013.5,55.3,rain,0.0,2024,7,1,0,New Moon,1
6478,2024-07-02,30.0,24.0,26.9,24.3,85.9,13.6,100.0,62.50,rain,...,1012.7,76.9,rain,0.0,2024,7,2,1,New Moon,1


## Building the model

In [None]:
# Remove the date-derived columns and the numeric moon-phase code before modelling.

pred_data = df.drop(
    columns=["datetime", "moon_phase_cat", "year", "month", "day", "day_of_week"]
)

pred_data

Unnamed: 0,tempmax,tempmin,temp,dew,humidity,precip,precipprob,precipcover,preciptype,visibility,windspeed,sealevelpressure,cloudcover,icon,flood occurrence,moon_phase
0,29.7,25.7,27.7,25.2,86.9,0.0,0.0,0.00,none,8.0,14.8,1012.9,90.0,partly-cloudy-day,0.0,New Moon
1,25.7,25.1,25.4,24.7,95.7,23.1,100.0,4.17,rain,7.0,20.5,1013.9,90.0,rain,0.0,First Quarter
2,31.6,27.1,29.2,25.3,79.7,0.0,0.0,0.00,none,10.5,27.7,1012.9,90.0,partly-cloudy-day,0.0,First Quarter
3,26.1,26.1,26.1,23.1,83.6,0.0,0.0,0.00,none,11.0,13.0,1010.8,90.0,partly-cloudy-day,0.0,Full Moon
4,30.1,25.1,27.1,25.1,89.2,0.0,0.0,0.00,none,8.3,22.3,1013.4,90.0,partly-cloudy-day,0.0,Last Quarter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6475,31.0,23.9,27.1,24.7,87.1,12.0,100.0,4.17,rain,9.6,22.3,1012.7,61.8,rain,0.0,New Moon
6476,31.2,25.0,27.8,25.2,86.2,3.0,100.0,4.17,rain,8.5,25.9,1013.0,62.4,rain,0.0,New Moon
6477,32.0,25.0,27.9,24.9,84.6,6.9,100.0,50.00,rain,10.6,22.3,1013.5,55.3,rain,0.0,New Moon
6478,30.0,24.0,26.9,24.3,85.9,13.6,100.0,62.50,rain,11.1,22.7,1012.7,76.9,rain,0.0,New Moon


In [None]:
# Count missing values per column.

pred_data.isna().sum()

tempmax             0
tempmin             0
temp                0
dew                 0
humidity            0
precip              0
precipprob          0
precipcover         0
preciptype          0
visibility          0
windspeed           0
sealevelpressure    0
cloudcover          0
icon                0
flood occurrence    0
moon_phase          0
dtype: int64

### Using one hot encoding method 

-- One-hot encoding is used when categorical variables have no inherent order or hierarchy.

In [None]:
# One-hot encode the three categorical columns as 0/1 integer indicators.

dummy = pd.get_dummies(
    pred_data.loc[:, ["preciptype", "icon", "moon_phase"]],
    dtype=int,
)

dummy

Unnamed: 0,preciptype_none,preciptype_rain,icon_clear-day,icon_cloudy,icon_fog,icon_partly-cloudy-day,icon_rain,icon_wind,moon_phase_First Quarter,moon_phase_Full Moon,moon_phase_Last Quarter,moon_phase_New Moon
0,1,0,0,0,0,1,0,0,0,0,0,1
1,0,1,0,0,0,0,1,0,1,0,0,0
2,1,0,0,0,0,1,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,0,1,0,0
4,1,0,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6475,0,1,0,0,0,0,1,0,0,0,0,1
6476,0,1,0,0,0,0,1,0,0,0,0,1
6477,0,1,0,0,0,0,1,0,0,0,0,1
6478,0,1,0,0,0,0,1,0,0,0,0,1


In [None]:
# Place the indicator columns next to the original columns (column-wise concat).
merged_df = pd.concat([pred_data, dummy], axis=1)

merged_df

Unnamed: 0,tempmax,tempmin,temp,dew,humidity,precip,precipprob,precipcover,preciptype,visibility,...,icon_clear-day,icon_cloudy,icon_fog,icon_partly-cloudy-day,icon_rain,icon_wind,moon_phase_First Quarter,moon_phase_Full Moon,moon_phase_Last Quarter,moon_phase_New Moon
0,29.7,25.7,27.7,25.2,86.9,0.0,0.0,0.00,none,8.0,...,0,0,0,1,0,0,0,0,0,1
1,25.7,25.1,25.4,24.7,95.7,23.1,100.0,4.17,rain,7.0,...,0,0,0,0,1,0,1,0,0,0
2,31.6,27.1,29.2,25.3,79.7,0.0,0.0,0.00,none,10.5,...,0,0,0,1,0,0,1,0,0,0
3,26.1,26.1,26.1,23.1,83.6,0.0,0.0,0.00,none,11.0,...,0,0,0,1,0,0,0,1,0,0
4,30.1,25.1,27.1,25.1,89.2,0.0,0.0,0.00,none,8.3,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6475,31.0,23.9,27.1,24.7,87.1,12.0,100.0,4.17,rain,9.6,...,0,0,0,0,1,0,0,0,0,1
6476,31.2,25.0,27.8,25.2,86.2,3.0,100.0,4.17,rain,8.5,...,0,0,0,0,1,0,0,0,0,1
6477,32.0,25.0,27.9,24.9,84.6,6.9,100.0,50.00,rain,10.6,...,0,0,0,0,1,0,0,0,0,1
6478,30.0,24.0,26.9,24.3,85.9,13.6,100.0,62.50,rain,11.1,...,0,0,0,0,1,0,0,0,0,1


In [None]:
# Feature matrix: drop one indicator per categorical group (the reference level),
# the target column, and the raw categorical columns that the indicators replace.

X = merged_df.drop(
    columns=[
        "preciptype_none",
        "icon_clear-day",
        "moon_phase_Last Quarter",
        "flood occurrence",
        "preciptype",
        "icon",
        "moon_phase",
    ]
)

X

Unnamed: 0,tempmax,tempmin,temp,dew,humidity,precip,precipprob,precipcover,visibility,windspeed,...,cloudcover,preciptype_rain,icon_cloudy,icon_fog,icon_partly-cloudy-day,icon_rain,icon_wind,moon_phase_First Quarter,moon_phase_Full Moon,moon_phase_New Moon
0,29.7,25.7,27.7,25.2,86.9,0.0,0.0,0.00,8.0,14.8,...,90.0,0,0,0,1,0,0,0,0,1
1,25.7,25.1,25.4,24.7,95.7,23.1,100.0,4.17,7.0,20.5,...,90.0,1,0,0,0,1,0,1,0,0
2,31.6,27.1,29.2,25.3,79.7,0.0,0.0,0.00,10.5,27.7,...,90.0,0,0,0,1,0,0,1,0,0
3,26.1,26.1,26.1,23.1,83.6,0.0,0.0,0.00,11.0,13.0,...,90.0,0,0,0,1,0,0,0,1,0
4,30.1,25.1,27.1,25.1,89.2,0.0,0.0,0.00,8.3,22.3,...,90.0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6475,31.0,23.9,27.1,24.7,87.1,12.0,100.0,4.17,9.6,22.3,...,61.8,1,0,0,0,1,0,0,0,1
6476,31.2,25.0,27.8,25.2,86.2,3.0,100.0,4.17,8.5,25.9,...,62.4,1,0,0,0,1,0,0,0,1
6477,32.0,25.0,27.9,24.9,84.6,6.9,100.0,50.00,10.6,22.3,...,55.3,1,0,0,0,1,0,0,0,1
6478,30.0,24.0,26.9,24.3,85.9,13.6,100.0,62.50,11.1,22.7,...,76.9,1,0,0,0,1,0,0,0,1


In [None]:
# Target vector: the "flood occurrence" column (displayed below as float 0.0 / 1.0).

y = pred_data["flood occurrence"]

y

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
6475    0.0
6476    0.0
6477    0.0
6478    0.0
6479    1.0
Name: flood occurrence, Length: 6480, dtype: float64

In [None]:
# Split the data into training and testing sets (80/20).
# stratify=y asks for the same class proportions in both splits; random_state fixes the shuffle.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Baseline classifier with scikit-learn's default settings.
model = LogisticRegression ()


In [None]:
# Fit the baseline on the training split.
model.fit(X_train, y_train)


In [None]:
# Predict labels for the test split.
y_predicted = model.predict (X_test)

In [None]:
# Heading line followed by the per-class precision / recall / F1 table.
print("Classification Report:", classification_report(y_test, y_predicted), sep="\n")

In [None]:
# Address the class imbalance by oversampling the minority class with SMOTE.
# Only the training split is resampled. random_state pins the synthetic samples
# so that a fresh Restart-and-Run-All reproduces the same resampled data.

smote = SMOTE(sampling_strategy="minority", random_state=42)

X_sm, y_sm = smote.fit_resample(X_train, y_train)

y_sm.value_counts()

flood occurrence
0.0    5151
1.0    5151
Name: count, dtype: int64

In [None]:
# Train on the SMOTE-balanced rows, but keep X_test / y_test from the original split
# as the hold-out set. Re-splitting X_sm / y_sm would put synthetic minority rows
# (interpolated from training rows) into the test set, so the scores would be
# measured on artificial, balanced data instead of real observations.
X_train, y_train = X_sm, y_sm


In [None]:
# Refit the same model object on the SMOTE-resampled training data.
model.fit(X_train, y_train)

In [None]:
# Predict labels for the current test split.
y_predicted = model.predict (X_test)


In [None]:
# Heading line followed by the per-class precision / recall / F1 table.
print("Classification Report:", classification_report(y_test, y_predicted), sep="\n")

Classification Report:
              precision    recall  f1-score   support

         0.0       0.73      0.72      0.73      1031
         1.0       0.72      0.74      0.73      1030

    accuracy                           0.73      2061
   macro avg       0.73      0.73      0.73      2061
weighted avg       0.73      0.73      0.73      2061



- Overall, the results suggest that the model's performance has improved after addressing class imbalance using SMOTE, with similar precision, recall, and F1-score for both classes.

- However, further evaluation and fine-tuning may be necessary to improve the model's predictive capability.

In [None]:
# Cross-validate a scaled logistic regression on the original (un-resampled) X / y.
# The scaler sits inside the pipeline so that every split fits it on that split's
# training rows only; fitting it on the full matrix first would let the validation
# rows shape the scaling statistics.
scaler = StandardScaler()
model = LogisticRegression(max_iter=1000)
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# X_scaled is no longer used for cross-validation; kept for any other cell that reads it.
X_scaled = scaler.fit_transform(X)

# NOTE(review): no `scoring` is given, so classifiers are scored by accuracy; with
# labels this imbalanced a high value may mostly reflect the majority class —
# consider a class-aware scorer.
cross_val_score(make_pipeline(scaler, model), X, y, cv=cv)


array([0.99151235, 0.99614198, 0.99305556, 0.99459877, 0.99459877])

In [None]:
# Suppress scikit-learn's ConvergenceWarning for the rest of the session
# (this installs a process-wide warnings filter).
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Define the models and their hyperparameters
def find_best_model(X, y, X_eval=None, y_eval=None):
    """Grid-search three classifiers on ``(X, y)`` and summarise each one's best result.

    Parameters
    ----------
    X, y
        Training features and labels handed to ``GridSearchCV.fit``.
    X_eval, y_eval
        Optional hold-out features and labels for the classification report.
        When both are omitted, the notebook-level ``X_test`` / ``y_test`` are
        used, which is what this function has always evaluated against.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model``, ``best_score``
        (``GridSearchCV.best_score_``), ``best_params`` and
        ``classification_report`` (the hold-out report as a dict).

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.
    """
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be given together")
    if X_eval is None:
        # Backward-compatible default: evaluate on the notebook's hold-out split.
        X_eval, y_eval = X_test, y_test

    # Candidate estimators and the hyperparameter grid searched for each.
    algorithm = {
        'Logistic Regression': {
            'model': LogisticRegression(max_iter=1000, random_state=42),
            'params': {
                'C': [0.01, 0.1, 1, 10, 100]
            }
        },
        'Random Forest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {
                'n_estimators': [50, 100, 200],
                'max_depth': [5, 10, 20]
            }
        },
        'Gradient Boosting': {
            'model': GradientBoostingClassifier(random_state=42),
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 1],
                'max_depth': [3, 5, 10]
            }
        }
    }

    # The same five shuffled 80/20 splits are used for every candidate.
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    results = []

    for model_name, config in algorithm.items():
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')  # Suppress all warnings while searching
            gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
            # Fit on the data passed in, not on whatever X_train / y_train
            # happen to be bound to at notebook level.
            gs.fit(X, y)

        y_pred = gs.predict(X_eval)
        report = classification_report(y_eval, y_pred, output_dict=True)
        results.append({
            'model': model_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_,
            'classification_report': report
        })

    return pd.DataFrame(results, columns=['model', 'best_score', 'best_params', 'classification_report'])


In [None]:
# Run the comparison. Despite its name, best_model holds the per-model results
# table returned by find_best_model, not a fitted estimator.
best_model = find_best_model (X_train, y_train)

best_model

Unnamed: 0,model,best_score,best_params,classification_report
0,Logistic Regression,0.832383,{'C': 10},"{'0.0': {'precision': 0.8244781783681214, 'rec..."
1,Random Forest,0.993087,"{'max_depth': 20, 'n_estimators': 50}","{'0.0': {'precision': 0.9961127308066083, 'rec..."
2,Gradient Boosting,0.99527,"{'learning_rate': 1, 'max_depth': 5, 'n_estima...","{'0.0': {'precision': 1.0, 'recall': 0.9980601..."


## Based on the results, Gradient Boosting performed best of the three models.

In [None]:
# Define the Gradient Boosting classifier
# NOTE(review): the grid-search table above lists learning_rate=1 and max_depth=5 as the
# best Gradient Boosting settings, whereas this cell uses 0.1 and 10 — confirm which is intended.
gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=10,
    random_state=42
)

In [None]:
# Fit the Gradient Boosting classifier on the current training data.
gb_clf.fit(X_train, y_train)


In [None]:
# Predict labels for the current test split.
y_pred = gb_clf.predict(X_test)


In [None]:
# Per-class precision / recall / F1 for the Gradient Boosting predictions.
print(classification_report(y_test, y_pred))


## Saving the model

In [None]:
# Persist the fitted gb_clf with joblib; the relative file name means the file
# is written to the current working directory.

dump(gb_clf, 'gradient_boosting_model.joblib')


## Testing the model on forecast data

In [None]:
# Load the forecast dataset that the trained model will be applied to.
# NOTE(review): absolute "/content/..." path (presumably a Colab upload location) — confirm before running elsewhere.

test_df = pd.read_csv ("/content/Lagos,Nigeria.csv")

test_df

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations
0,"Lagos,Nigeria",2024-07-06,29.0,25.0,26.6,34.4,25.0,28.4,23.4,83.0,...,14.0,9,30,2024-07-06T06:36:55,2024-07-06T19:05:48,0.02,"Rain, Partially cloudy",Partly cloudy throughout the day with storms p...,rain,"DNMM,remote"
1,"Lagos,Nigeria",2024-07-07,26.8,25.6,26.3,29.2,25.6,26.4,22.9,81.7,...,16.8,9,10,2024-07-07T06:37:08,2024-07-07T19:05:54,0.05,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
2,"Lagos,Nigeria",2024-07-08,26.4,24.4,25.5,26.4,24.4,25.5,23.0,86.0,...,3.1,1,10,2024-07-08T06:37:21,2024-07-08T19:05:59,0.08,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
3,"Lagos,Nigeria",2024-07-09,27.1,25.1,26.2,29.6,25.1,26.8,22.6,80.4,...,18.6,7,10,2024-07-09T06:37:34,2024-07-09T19:06:04,0.11,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
4,"Lagos,Nigeria",2024-07-10,26.2,25.1,25.6,26.2,25.1,25.6,22.8,84.7,...,12.5,7,10,2024-07-10T06:37:47,2024-07-10T19:06:08,0.15,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
5,"Lagos,Nigeria",2024-07-11,25.6,24.6,25.0,25.6,24.6,25.0,22.8,87.6,...,2.7,1,10,2024-07-11T06:37:59,2024-07-11T19:06:11,0.18,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
6,"Lagos,Nigeria",2024-07-12,26.0,24.9,25.4,26.0,24.9,25.4,22.5,84.0,...,5.0,2,10,2024-07-12T06:38:11,2024-07-12T19:06:14,0.21,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
7,"Lagos,Nigeria",2024-07-13,26.2,24.8,25.6,26.2,24.8,25.6,22.5,82.8,...,7.8,3,10,2024-07-13T06:38:23,2024-07-13T19:06:16,0.25,"Rain, Overcast",Cloudy skies throughout the day with rain clea...,rain,
8,"Lagos,Nigeria",2024-07-14,26.9,25.6,26.1,28.7,25.6,26.3,21.8,77.1,...,21.6,8,10,2024-07-14T06:38:34,2024-07-14T19:06:17,0.27,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,
9,"Lagos,Nigeria",2024-07-15,26.1,25.5,25.9,26.1,25.5,25.9,21.3,75.8,...,17.4,7,10,2024-07-15T06:38:46,2024-07-15T19:06:18,0.31,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,


## DATA CLEANING 

In [None]:
# Rename "moonphase" to "moon_phase" so the column name matches the training data.
test_df = test_df.rename(columns = {"moonphase" : "moon_phase"})

test_df

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarenergy,uvindex,severerisk,sunrise,sunset,moon_phase,conditions,description,icon,stations
0,"Lagos,Nigeria",2024-07-06,29.0,25.0,26.6,34.4,25.0,28.4,23.4,83.0,...,14.0,9,30,2024-07-06T06:36:55,2024-07-06T19:05:48,0.02,"Rain, Partially cloudy",Partly cloudy throughout the day with storms p...,rain,"DNMM,remote"
1,"Lagos,Nigeria",2024-07-07,26.8,25.6,26.3,29.2,25.6,26.4,22.9,81.7,...,16.8,9,10,2024-07-07T06:37:08,2024-07-07T19:05:54,0.05,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
2,"Lagos,Nigeria",2024-07-08,26.4,24.4,25.5,26.4,24.4,25.5,23.0,86.0,...,3.1,1,10,2024-07-08T06:37:21,2024-07-08T19:05:59,0.08,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
3,"Lagos,Nigeria",2024-07-09,27.1,25.1,26.2,29.6,25.1,26.8,22.6,80.4,...,18.6,7,10,2024-07-09T06:37:34,2024-07-09T19:06:04,0.11,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
4,"Lagos,Nigeria",2024-07-10,26.2,25.1,25.6,26.2,25.1,25.6,22.8,84.7,...,12.5,7,10,2024-07-10T06:37:47,2024-07-10T19:06:08,0.15,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
5,"Lagos,Nigeria",2024-07-11,25.6,24.6,25.0,25.6,24.6,25.0,22.8,87.6,...,2.7,1,10,2024-07-11T06:37:59,2024-07-11T19:06:11,0.18,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
6,"Lagos,Nigeria",2024-07-12,26.0,24.9,25.4,26.0,24.9,25.4,22.5,84.0,...,5.0,2,10,2024-07-12T06:38:11,2024-07-12T19:06:14,0.21,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,
7,"Lagos,Nigeria",2024-07-13,26.2,24.8,25.6,26.2,24.8,25.6,22.5,82.8,...,7.8,3,10,2024-07-13T06:38:23,2024-07-13T19:06:16,0.25,"Rain, Overcast",Cloudy skies throughout the day with rain clea...,rain,
8,"Lagos,Nigeria",2024-07-14,26.9,25.6,26.1,28.7,25.6,26.3,21.8,77.1,...,21.6,8,10,2024-07-14T06:38:34,2024-07-14T19:06:17,0.27,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,
9,"Lagos,Nigeria",2024-07-15,26.1,25.5,25.9,26.1,25.5,25.9,21.3,75.8,...,17.4,7,10,2024-07-15T06:38:46,2024-07-15T19:06:18,0.31,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,


In [None]:
# Parse the "datetime" column into pandas datetime values.

test_df["datetime"] = pd.to_datetime(test_df["datetime"])

In [None]:
# Function to get moon phase with descriptive name for a specific date

def get_moon_phase(date):
    """Return a phase name for ``date`` based on ``moon.phase(date)``.

    The numeric phase is checked against ascending cut-offs:
    below 1.845 -> 'New Moon', below 5.537 -> 'First Quarter',
    below 9.230 -> 'Full Moon', below 12.922 -> 'Last Quarter'.
    Any value that passes none of these checks is labelled 'New Moon'.

    NOTE(review): astral documents ``moon.phase`` as a value of roughly 0-28 with
    quarter boundaries at 7, 14 and 21 — confirm. If so, these cut-offs label
    every value from 12.922 upward as 'New Moon'. The training data's
    ``moon_phase`` column was presumably produced with the same cut-offs, so
    changing them only here would make forecast features disagree with the
    training features — verify before adjusting.
    """
    phase_value = moon.phase(date)

    # (exclusive upper bound, label), in ascending order; the first match wins.
    cutoffs = (
        (1.845, 'New Moon'),
        (5.537, 'First Quarter'),
        (9.230, 'Full Moon'),
        (12.922, 'Last Quarter'),
    )
    for upper_bound, label in cutoffs:
        if phase_value < upper_bound:
            return label

    # Fallback for everything at or above the last cut-off.
    return 'New Moon'


# Overwrite the numeric moon_phase values with the phase name computed from each row's date.
test_df['moon_phase'] = test_df['datetime'].apply(get_moon_phase)

In [None]:
# Keep only the forecast columns that correspond to the training features.

test_data = test_df.loc[
    :,
    [
        'tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'precip', 'precipprob',
        'precipcover', 'preciptype', 'visibility', 'windspeed',
        'sealevelpressure', 'cloudcover', 'icon',
        'moon_phase',
    ],
]

test_data

Unnamed: 0,tempmax,tempmin,temp,dew,humidity,precip,precipprob,precipcover,preciptype,visibility,windspeed,sealevelpressure,cloudcover,icon,moon_phase
0,29.0,25.0,26.6,23.4,83.0,9.6,100.0,66.67,rain,13.4,23.8,1013.1,68.1,rain,New Moon
1,26.8,25.6,26.3,22.9,81.7,9.5,96.8,66.67,rain,21.2,27.7,1013.0,99.5,rain,New Moon
2,26.4,24.4,25.5,23.0,86.0,26.6,90.3,83.33,rain,14.7,28.1,1012.8,99.9,rain,First Quarter
3,27.1,25.1,26.2,22.6,80.4,9.7,80.6,75.0,rain,18.6,33.5,1011.9,90.3,rain,First Quarter
4,26.2,25.1,25.6,22.8,84.7,19.8,74.2,91.67,rain,13.3,29.2,1012.9,99.8,rain,First Quarter
5,25.6,24.6,25.0,22.8,87.6,26.8,74.2,37.5,rain,11.0,27.0,1014.2,99.9,rain,First Quarter
6,26.0,24.9,25.4,22.5,84.0,13.2,64.5,33.33,rain,14.7,24.1,1014.5,99.7,rain,Full Moon
7,26.2,24.8,25.6,22.5,82.8,12.4,45.2,33.33,rain,17.4,28.2,1015.3,99.5,rain,Full Moon
8,26.9,25.6,26.1,21.8,77.1,2.7,38.7,25.0,rain,23.5,29.2,1016.6,43.4,partly-cloudy-day,Full Moon
9,26.1,25.5,25.9,21.3,75.8,1.6,29.0,25.0,rain,22.9,27.7,1016.5,80.5,partly-cloudy-day,Full Moon


### Using one hot encoding

In [None]:
# One-hot encode the forecast's categorical columns with the same call used for training.
# get_dummies only creates columns for categories present in this frame, so the
# result can have fewer indicator columns than the training encoding.
dummy = pd.get_dummies (test_data [["preciptype","icon","moon_phase"]], dtype = int)

dummy

Unnamed: 0,preciptype_rain,icon_cloudy,icon_partly-cloudy-day,icon_rain,moon_phase_First Quarter,moon_phase_Full Moon,moon_phase_Last Quarter,moon_phase_New Moon
0,1,0,0,1,0,0,0,1
1,1,0,0,1,0,0,0,1
2,1,0,0,1,1,0,0,0
3,1,0,0,1,1,0,0,0
4,1,0,0,1,1,0,0,0
5,1,0,0,1,1,0,0,0
6,1,0,0,1,0,1,0,0
7,1,0,0,1,0,1,0,0
8,1,0,1,0,0,1,0,0
9,1,0,1,0,0,1,0,0


In [None]:
# Column-wise concat of the forecast columns and their indicators.
# This rebinds merged_df, which held the training frame earlier in the notebook.
merged_df = pd.concat([test_data, dummy], axis=1)

merged_df

Unnamed: 0,tempmax,tempmin,temp,dew,humidity,precip,precipprob,precipcover,preciptype,visibility,...,icon,moon_phase,preciptype_rain,icon_cloudy,icon_partly-cloudy-day,icon_rain,moon_phase_First Quarter,moon_phase_Full Moon,moon_phase_Last Quarter,moon_phase_New Moon
0,29.0,25.0,26.6,23.4,83.0,9.6,100.0,66.67,rain,13.4,...,rain,New Moon,1,0,0,1,0,0,0,1
1,26.8,25.6,26.3,22.9,81.7,9.5,96.8,66.67,rain,21.2,...,rain,New Moon,1,0,0,1,0,0,0,1
2,26.4,24.4,25.5,23.0,86.0,26.6,90.3,83.33,rain,14.7,...,rain,First Quarter,1,0,0,1,1,0,0,0
3,27.1,25.1,26.2,22.6,80.4,9.7,80.6,75.0,rain,18.6,...,rain,First Quarter,1,0,0,1,1,0,0,0
4,26.2,25.1,25.6,22.8,84.7,19.8,74.2,91.67,rain,13.3,...,rain,First Quarter,1,0,0,1,1,0,0,0
5,25.6,24.6,25.0,22.8,87.6,26.8,74.2,37.5,rain,11.0,...,rain,First Quarter,1,0,0,1,1,0,0,0
6,26.0,24.9,25.4,22.5,84.0,13.2,64.5,33.33,rain,14.7,...,rain,Full Moon,1,0,0,1,0,1,0,0
7,26.2,24.8,25.6,22.5,82.8,12.4,45.2,33.33,rain,17.4,...,rain,Full Moon,1,0,0,1,0,1,0,0
8,26.9,25.6,26.1,21.8,77.1,2.7,38.7,25.0,rain,23.5,...,partly-cloudy-day,Full Moon,1,0,1,0,0,1,0,0
9,26.1,25.5,25.9,21.3,75.8,1.6,29.0,25.0,rain,22.9,...,partly-cloudy-day,Full Moon,1,0,1,0,0,1,0,0


In [None]:
# Forecast feature matrix: remove the raw categorical columns now that they are encoded.

X_new_test = merged_df.drop(columns=["preciptype", "icon", "moon_phase"])

X_new_test.columns

Index(['tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'precip', 'precipprob',
       'precipcover', 'visibility', 'windspeed', 'sealevelpressure',
       'cloudcover', 'preciptype_rain', 'icon_cloudy',
       'icon_partly-cloudy-day', 'icon_rain', 'moon_phase_First Quarter',
       'moon_phase_Full Moon', 'moon_phase_Last Quarter',
       'moon_phase_New Moon'],
      dtype='object')

In [None]:
# Subset to the columns shared by the training data and the forecast data.

common_columns = set(X_train) & set(X_new_test)

# Select in training-column order: iterating a set of strings gives an arbitrary
# order that can change between kernel sessions, which would reshuffle the columns.
shared_in_train_order = [col for col in X_train.columns if col in common_columns]

X_train_subset = X_train[shared_in_train_order]

X_test_subset = X_new_test[shared_in_train_order]


In [None]:
# Identify training columns that are absent from the forecast feature frame.
missing_columns = set(X_train) - set(X_new_test)

missing_columns

{'icon_fog', 'icon_wind'}

In [None]:
# Add the training-only indicator columns to the forecast frame. A category that
# never occurs in the forecast is 0 on every row, so each such column is filled with 0.
for col in sorted(missing_columns):
    if not col.startswith(('icon_', 'moon_phase_', 'preciptype_')):
        # Not a one-hot indicator: 0 would be a made-up measurement, so stop here
        # instead of silently leaving the column out.
        raise ValueError(f"Forecast data is missing non-indicator feature {col!r}")
    X_new_test[col] = 0

In [None]:
# Drop the reference moon-phase level, which is also left out of the training features.
# errors="ignore" keeps the cell re-runnable and tolerates a forecast window in
# which get_dummies produced no "Last Quarter" column at all.
X_new_test = X_new_test.drop(columns=["moon_phase_Last Quarter"], errors="ignore")

In [None]:
# Put the forecast features in exactly the column order used for training.
# Selecting by the training column list replaces hand-maintained positional
# indices, which go stale whenever a feature is added or removed. pandas raises a
# KeyError naming any training column that the forecast frame lacks, and columns
# that were not training features are dropped.
columns = list(X_train.columns)

X_new_test = X_new_test[columns]

In [None]:
# Predict flood occurrence for each forecast row with the fitted Gradient Boosting model.
y_newtest_pred = gb_clf.predict(X_new_test)


In [None]:
# Display the raw predicted labels.
y_newtest_pred

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
# Wrap the predictions in a single-column DataFrame named "Forecast".

y_prediction_df = pd.DataFrame(y_newtest_pred, columns=["Forecast"])

In [None]:
# Fail with a clear message up front: previously a missing "datetime" column left
# date_column undefined and surfaced as a NameError on the concat line.
if "datetime" not in test_df.columns:
    raise KeyError("test_df has no 'datetime' column to report forecasts against")

# concat aligns on the index, so a row-count mismatch would silently pad with NaN.
if len(y_prediction_df) != len(test_df):
    raise ValueError(
        f"{len(y_prediction_df)} predictions for {len(test_df)} forecast rows"
    )

# Despite its name, date_column is the whole forecast frame with a fresh 0..n-1 index.
date_column = test_df.reset_index (drop = True)

# Combine the forecast rows and predictions into a single DataFrame
results_df = pd.concat([date_column, y_prediction_df], axis=1)


In [None]:
# Final result: the forecast date next to the predicted flood label.

results_df[["datetime", "Forecast"]]

Unnamed: 0,datetime,Forecast
0,2024-07-06,0.0
1,2024-07-07,0.0
2,2024-07-08,0.0
3,2024-07-09,0.0
4,2024-07-10,0.0
5,2024-07-11,0.0
6,2024-07-12,0.0
7,2024-07-13,0.0
8,2024-07-14,0.0
9,2024-07-15,0.0
