In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fullweatherdata/colombo_weather for 50 years.csv
/kaggle/input/weather-sl/colombo_weather.csv
/kaggle/input/weathersl-24/colombo_weatherfor 24 years.csv


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import pickle

In [3]:
# Load the dataset
file_path = '/kaggle/input/fullweatherdata/colombo_weather for 50 years.csv'  # Replace with your file path
data = pd.read_csv(file_path, parse_dates=['time'], index_col='time')

# Display the first few rows
print(data.head())

# Check for missing values
print("Missing Values:\n", data.isnull().sum())

# Dataset summary
print("Dataset Info:")
print(data.info())

            tavg  tmin  tmax  prcp  snow   wdir  wspd  wpgt    pres  tsun
time                                                                     
1944-05-14  28.7  26.3  30.7   NaN   NaN  289.0  12.1   NaN  1006.8   NaN
1944-05-15  29.4  26.3  31.8   NaN   NaN  266.0  11.5   NaN  1005.9   NaN
1944-05-16  29.0  28.0  31.3   NaN   NaN  276.0  16.2   NaN  1005.8   NaN
1944-05-17  28.5  27.4  30.2   NaN   NaN  268.0  15.0   NaN  1006.5   NaN
1944-05-18  27.9  25.2  30.7   NaN   NaN  271.0   7.0   NaN  1007.2   NaN
Missing Values:
 tavg    21151
tmin    22683
tmax    22110
prcp    23215
snow    29409
wdir    26034
wspd    25848
wpgt    29409
pres    25841
tsun    29409
dtype: int64
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 29409 entries, 1944-05-14 to 2024-11-18
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tavg    8258 non-null   float64
 1   tmin    6726 non-null   float64
 2   tmax    7299 non-nu

In [4]:
# Fill missing values with the column mean for numerical columns
data = data.fillna(data.mean())

# Verify there are no missing values
print("Missing Values After Cleaning:\n", data.isnull().sum())

Missing Values After Cleaning:
 tavg        0
tmin        0
tmax        0
prcp        0
snow    29409
wdir        0
wspd        0
wpgt    29409
pres        0
tsun    29409
dtype: int64


In [5]:
# Extract month and day from the datetime index
data['month'] = data.index.month
data['day'] = data.index.day

In [6]:
# Add a 'weather' category based on precipitation
def classify_weather(row):
    if row['prcp'] == 0:
        return 'clear'
    elif row['prcp'] < 5:
        return 'cloudy'
    else:
        return 'rainy'

# Apply the function to create a weather category
data['weather'] = data.apply(classify_weather, axis=1)

# Encode the weather category
label_encoder = LabelEncoder()
data['weather_encoded'] = label_encoder.fit_transform(data['weather'])

# Check the unique weather categories
print("Weather Categories:", label_encoder.classes_)

Weather Categories: ['clear' 'cloudy' 'rainy']


In [7]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score
import pickle

# Define models for each numerical feature
features = ['tavg', 'tmin', 'tmax', 'prcp', 'wspd', 'pres']
regressors = {}

# Hyperparameter grid for RandomForestRegressor
param_grid_regressor = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Train regression models for each numerical feature
for feature in features:
    # Prepare the data
    X = data[['month', 'day']]  # Use 'month' and 'day' as input features
    y = data[feature]
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize and tune Random Forest Regressor
    regressor = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid_regressor, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    
    # Save the best model
    best_regressor = grid_search.best_estimator_
    with open(f'{feature}_model.pkl', 'wb') as file:
        pickle.dump(best_regressor, file)
    
    # Evaluate the tuned model
    y_pred = best_regressor.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Feature: {feature}, Best Parameters: {grid_search.best_params_}, RMSE: {rmse}")
    
    # Store the model
    regressors[feature] = best_regressor

# Prepare the data for weather classification
X = data[['tavg', 'tmin', 'tmax', 'prcp', 'wspd', 'pres']]  # Use predicted features
y = data['weather_encoded']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter grid for RandomForestClassifier
param_grid_classifier = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# Initialize and tune Random Forest Classifier
classifier = RandomForestClassifier(random_state=42)
grid_search_classifier = GridSearchCV(estimator=classifier, param_grid=param_grid_classifier, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search_classifier.fit(X_train, y_train)

# Save the best model
best_classifier = grid_search_classifier.best_estimator_

# Save the weather classification model
with open('weather_model.pkl', 'wb') as file:
    pickle.dump(best_classifier, file)

# Save the LabelEncoder (ensure this was defined earlier during preprocessing)
with open('label_encoder.pkl', 'wb') as file:
    pickle.dump(label_encoder, file)

# Evaluate the tuned model
y_pred = best_classifier.predict(X_test)
print("Best Parameters for Weather Classification Model:", grid_search_classifier.best_params_)
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Feature: tavg, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}, RMSE: 0.5964604888803925
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   1.2s
[CV] END 

In [8]:
# Input date for prediction
input_date = pd.Timestamp('2024-11-18')  # Replace with the desired date
month = input_date.month
day = input_date.day

# Predict numerical features
predicted_values = {}
for feature in features:
    # Load the model
    with open(f'{feature}_model.pkl', 'rb') as file:
        model = pickle.load(file)
    
    # Predict the feature
    predicted_values[feature] = model.predict([[month, day]])[0]

# Display predicted numerical features
print("Predicted Values:")
for key, value in predicted_values.items():
    print(f"{key}: {value}")

# Predict weather category
weather_input = pd.DataFrame([predicted_values])  # Create input DataFrame for the classifier
with open('weather_model.pkl', 'rb') as file:
    weather_classifier = pickle.load(file)

predicted_weather = label_encoder.inverse_transform(weather_classifier.predict(weather_input))
print("Predicted Weather:", predicted_weather[0])

Predicted Values:
tavg: 27.456157608491516
tmin: 24.652575437801833
tmax: 30.885501047562325
prcp: 10.223888552733689
wspd: 6.539762681802986
pres: 1009.6449190929078
Predicted Weather: rainy




In [9]:
import pandas as pd
import pickle

# Input date for prediction
input_date = pd.Timestamp('2024-11-18')  # Replace with the desired date
month = input_date.month
day = input_date.day

# Define features to predict
features = ['tavg', 'tmin', 'tmax', 'prcp', 'wspd', 'pres']

# Predict numerical features
predicted_values = {}
for feature in features:
    # Load the model
    with open(f'{feature}_model.pkl', 'rb') as file:
        model = pickle.load(file)
    
    # Create a DataFrame with proper feature names for prediction
    input_features = pd.DataFrame([[month, day]], columns=['month', 'day'])
    
    # Predict the feature
    predicted_values[feature] = model.predict(input_features)[0]

# Display predicted numerical features
print("Predicted Numerical Features:")
for key, value in predicted_values.items():
    print(f"{key}: {value}")

# Convert predicted values to a DataFrame for classification
weather_input = pd.DataFrame([predicted_values])

# Load the weather classification model
with open('weather_model.pkl', 'rb') as file:
    weather_classifier = pickle.load(file)

# Load the label encoder
with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pickle.load(file)

# Predict and decode the weather category
predicted_weather = weather_classifier.predict(weather_input)
predicted_weather_decoded = label_encoder.inverse_transform(predicted_weather)

print("Predicted Weather Category:", predicted_weather_decoded[0])

Predicted Numerical Features:
tavg: 27.456157608491516
tmin: 24.652575437801833
tmax: 30.885501047562325
prcp: 10.223888552733689
wspd: 6.539762681802986
pres: 1009.6449190929078
Predicted Weather Category: rainy


In [10]:
import pandas as pd
import pickle
from datetime import datetime, timedelta

# Define features to predict
features = ['tavg', 'tmin', 'tmax', 'prcp', 'wspd', 'pres']

# Load all models
regressors = {}
for feature in features:
    with open(f'{feature}_model.pkl', 'rb') as file:
        regressors[feature] = pickle.load(file)

# Load the weather classification model
with open('weather_model.pkl', 'rb') as file:
    weather_classifier = pickle.load(file)

# Load the label encoder
with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pickle.load(file)

# Predict for the next week
next_week_predictions = []

# Start from today
today = datetime.now()
for i in range(7):  # Loop for the next 7 days
    prediction_date = today + timedelta(days=i)
    month = prediction_date.month
    day = prediction_date.day
    
    # Predict numerical features for the day
    predicted_values = {}
    for feature in features:
        # Create input DataFrame with proper feature names
        input_features = pd.DataFrame([[month, day]], columns=['month', 'day'])
        predicted_values[feature] = regressors[feature].predict(input_features)[0]
    
    # Add the predicted numerical features to a DataFrame for classification
    weather_input = pd.DataFrame([predicted_values])
    
    # Predict weather category
    predicted_weather = weather_classifier.predict(weather_input)
    predicted_weather_decoded = label_encoder.inverse_transform(predicted_weather)[0]
    
    # Append the results for the day
    next_week_predictions.append({
        'date': prediction_date.strftime('%Y-%m-%d'),
        'tavg': predicted_values['tavg'],
        'tmin': predicted_values['tmin'],
        'tmax': predicted_values['tmax'],
        'prcp': predicted_values['prcp'],
        'wspd': predicted_values['wspd'],
        'pres': predicted_values['pres'],
        'weather': predicted_weather_decoded
    })

# Convert to DataFrame for easy visualization
next_week_df = pd.DataFrame(next_week_predictions)

# Display predictions for the next week
print(next_week_df)

         date       tavg       tmin       tmax       prcp      wspd  \
0  2024-11-18  27.456158  24.652575  30.885501  10.223889  6.539763   
1  2024-11-19  27.297598  24.629844  30.866330   8.282752  6.596916   
2  2024-11-20  27.436502  24.630328  30.829743   7.967562  6.511697   
3  2024-11-21  27.428944  24.668483  30.854350   7.607910  6.601985   
4  2024-11-22  27.430395  24.666676  30.832577   7.675648  6.547792   
5  2024-11-23  27.380020  24.649663  30.893084   9.207778  6.610000   
6  2024-11-24  27.451826  24.754628  30.850702   8.521963  6.794136   

          pres weather  
0  1009.644919   rainy  
1  1009.660895   rainy  
2  1009.649521   rainy  
3  1009.651495   rainy  
4  1009.657120   rainy  
5  1009.661740   rainy  
6  1009.682804   rainy  
