In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the dataset from temperature.csv (path is relative to the working directory)
# and preview the first rows to check that the columns were parsed as expected.
data = pd.read_csv("temperature.csv")
data.head()

Unnamed: 0,station,Date,Present_Tmax,Present_Tmin,LDAPS_RHmin,LDAPS_RHmax,LDAPS_Tmax_lapse,LDAPS_Tmin_lapse,LDAPS_WS,LDAPS_LH,...,LDAPS_PPT2,LDAPS_PPT3,LDAPS_PPT4,lat,lon,DEM,Slope,Solar radiation,Next_Tmax,Next_Tmin
0,1.0,30-06-2013,28.7,21.4,58.255688,91.116364,28.074101,23.006936,6.818887,69.451805,...,0.0,0.0,0.0,37.6046,126.991,212.335,2.785,5992.895996,29.1,21.2
1,2.0,30-06-2013,31.9,21.6,52.263397,90.604721,29.850689,24.035009,5.69189,51.937448,...,0.0,0.0,0.0,37.6046,127.032,44.7624,0.5141,5869.3125,30.5,22.5
2,3.0,30-06-2013,31.6,23.3,48.690479,83.973587,30.091292,24.565633,6.138224,20.57305,...,0.0,0.0,0.0,37.5776,127.058,33.3068,0.2661,5863.555664,31.1,23.9
3,4.0,30-06-2013,32.0,23.4,58.239788,96.483688,29.704629,23.326177,5.65005,65.727144,...,0.0,0.0,0.0,37.645,127.022,45.716,2.5348,5856.964844,31.7,24.3
4,5.0,30-06-2013,31.4,21.9,56.174095,90.155128,29.113934,23.48648,5.735004,107.965535,...,0.0,0.0,0.0,37.5507,127.135,35.038,0.5055,5859.552246,31.2,22.5


### Data Preprocessing 

### 1. Data Cleaning

### 2. EDA (Analysis)

### 3. Finding and Treating Outliers (if any)

### 4. Feature Selection for model building

### 5. Model Building

### 6. Hyperparameter tuning



In [3]:
# Per-column count of missing values in the raw frame, before any cleaning.
data.isna().sum()

station              2
Date                 2
Present_Tmax        70
Present_Tmin        70
LDAPS_RHmin         75
LDAPS_RHmax         75
LDAPS_Tmax_lapse    75
LDAPS_Tmin_lapse    75
LDAPS_WS            75
LDAPS_LH            75
LDAPS_CC1           75
LDAPS_CC2           75
LDAPS_CC3           75
LDAPS_CC4           75
LDAPS_PPT1          75
LDAPS_PPT2          75
LDAPS_PPT3          75
LDAPS_PPT4          75
lat                  0
lon                  0
DEM                  0
Slope                0
Solar radiation      0
Next_Tmax           27
Next_Tmin           27
dtype: int64

In [4]:
# (rows, columns) of the frame before rows with missing values are dropped.
data.shape

(7752, 25)

In [5]:
# Keep only complete rows (any row with at least one missing value is removed).
# The result is rebound explicitly instead of using inplace=True, which keeps the
# data lineage visible in the cell and allows method chaining.
data = data.dropna()

In [6]:
# Re-count missing values after cleaning; every column is expected to report 0.
data.isna().sum()

station             0
Date                0
Present_Tmax        0
Present_Tmin        0
LDAPS_RHmin         0
LDAPS_RHmax         0
LDAPS_Tmax_lapse    0
LDAPS_Tmin_lapse    0
LDAPS_WS            0
LDAPS_LH            0
LDAPS_CC1           0
LDAPS_CC2           0
LDAPS_CC3           0
LDAPS_CC4           0
LDAPS_PPT1          0
LDAPS_PPT2          0
LDAPS_PPT3          0
LDAPS_PPT4          0
lat                 0
lon                 0
DEM                 0
Slope               0
Solar radiation     0
Next_Tmax           0
Next_Tmin           0
dtype: int64

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
# of the cleaned frame.
data.describe()

Unnamed: 0,station,Present_Tmax,Present_Tmin,LDAPS_RHmin,LDAPS_RHmax,LDAPS_Tmax_lapse,LDAPS_Tmin_lapse,LDAPS_WS,LDAPS_LH,LDAPS_CC1,...,LDAPS_PPT2,LDAPS_PPT3,LDAPS_PPT4,lat,lon,DEM,Slope,Solar radiation,Next_Tmax,Next_Tmin
count,7588.0,7588.0,7588.0,7588.0,7588.0,7588.0,7588.0,7588.0,7588.0,7588.0,...,7588.0,7588.0,7588.0,7588.0,7588.0,7588.0,7588.0,7588.0,7588.0,7588.0
mean,13.014101,29.748366,23.195809,56.724969,88.360823,29.620128,23.511786,7.094097,62.492606,0.36851,...,0.480738,0.275007,0.265373,37.544792,126.99142,61.918136,1.259755,5343.724208,30.241526,22.91082
std,7.217858,2.967401,2.40088,14.626559,7.199456,2.943496,2.342579,2.177034,33.686158,0.26226,...,1.743327,1.146087,1.179661,0.050428,0.07922,54.323529,1.372748,429.782561,3.111807,2.482256
min,1.0,20.0,11.3,19.794666,58.936283,17.624954,14.272646,2.88258,-13.603212,0.0,...,0.0,0.0,0.0,37.4562,126.826,12.37,0.0985,4329.520508,17.4,11.3
25%,7.0,27.8,21.6,45.960243,84.203724,27.673756,22.08682,5.675358,37.206201,0.146546,...,0.0,0.0,0.0,37.5102,126.937,28.7,0.2713,5001.485717,28.2,21.3
50%,13.0,29.9,23.4,55.023199,89.784122,29.709537,23.758249,6.547838,56.898324,0.315706,...,0.0,0.0,0.0,37.5507,126.995,45.716,0.618,5441.987305,30.4,23.1
75%,19.0,32.0,24.8,67.115099,93.742725,31.711109,25.15566,8.02896,84.235666,0.574174,...,0.017735,0.007855,1.7e-05,37.5776,127.042,59.8324,1.7678,5729.48584,32.6,24.6
max,25.0,37.6,29.9,98.524734,100.000153,38.542255,29.619342,21.857621,213.414006,0.967277,...,21.621661,15.841235,16.655469,37.645,127.135,212.335,5.1782,5992.895996,38.9,29.8


In [8]:
# Column dtypes. In the output shown, Date is the only non-numeric (object) column,
# which is why it is converted in the next step.
data.dtypes

station             float64
Date                 object
Present_Tmax        float64
Present_Tmin        float64
LDAPS_RHmin         float64
LDAPS_RHmax         float64
LDAPS_Tmax_lapse    float64
LDAPS_Tmin_lapse    float64
LDAPS_WS            float64
LDAPS_LH            float64
LDAPS_CC1           float64
LDAPS_CC2           float64
LDAPS_CC3           float64
LDAPS_CC4           float64
LDAPS_PPT1          float64
LDAPS_PPT2          float64
LDAPS_PPT3          float64
LDAPS_PPT4          float64
lat                 float64
lon                 float64
DEM                 float64
Slope               float64
Solar radiation     float64
Next_Tmax           float64
Next_Tmin           float64
dtype: object

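- The `Date` column is stored as text (`object`), so we convert it to datetime and derive numeric features (`Day_of_Year`, `Month`) that the models can use.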

In [9]:
# Parse Date into datetime. The strings are day-first (e.g. "30-06-2013"), so the
# format is pinned explicitly rather than inferred: without it, ambiguous values
# such as "01-07-2013" may be interpreted month-first depending on the pandas version.
# errors='coerce' still turns values that do not match the format into NaT.
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y', errors='coerce')

# Derive numeric calendar features that the regressors can consume.
data['Day_of_Year'] = data['Date'].dt.dayofyear
data['Month'] = data['Date'].dt.month

# The datetime column itself is not a numeric feature, so it is removed.
data = data.drop(columns=['Date'])

In [10]:
# Feature matrix: every column except the two forecast targets.
target_columns = ['Next_Tmax', 'Next_Tmin']
X = data.drop(columns=target_columns)

# Targets: y is Next_Tmax, z is Next_Tmin.
y, z = data['Next_Tmax'], data['Next_Tmin']

In [11]:
# Standardise every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# performed further down, so statistics of the held-out rows influence the scaling.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [12]:
# Inspect the scaled feature matrix (a NumPy array; its repr is abbreviated with "...").
X_scaled

array([[-1.66460654, -0.35331763, -0.74802891, ...,  1.51056517,
        -1.70742759, -2.77335883],
       [-1.52605214,  0.72513836, -0.66472063, ...,  1.22299734,
        -1.70742759, -2.77335883],
       [-1.38749775,  0.62403311,  0.0433998 , ...,  1.20960169,
        -1.70742759, -2.77335883],
       ...,
       [ 1.38359018, -2.17321212, -2.53915704, ..., -2.09517477,
         1.6965582 ,  1.01021665],
       [ 1.52214457, -2.17321212, -2.28923218, ..., -2.10667071,
         1.6965582 ,  1.01021665],
       [ 1.66069897, -2.20691387, -2.41419461, ..., -2.07648676,
         1.6965582 ,  1.01021665]])

In [13]:
# Hold out 20% of the rows for evaluating the Next_Tmax models.
# The unscaled features are split first and the scaler is then fitted on the
# training rows only, so no statistics from the held-out rows leak into preprocessing.
# random_state=42 keeps the row partition reproducible.
X_train_max_raw, X_test_max_raw, y_train_max, y_test_max = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler_max = StandardScaler().fit(X_train_max_raw)
X_train_max = scaler_max.transform(X_train_max_raw)
X_test_max = scaler_max.transform(X_test_max_raw)

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [15]:
# Candidate regressors compared in the experiments below.
# The two tree-based models receive a fixed random_state so repeated runs agree.
lr = LinearRegression()
dtr, rfr = (
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
)

In [16]:
# Baseline comparison for Next_Tmax: fit each candidate on the training split
# and report MSE and R-squared on the held-out split.
models = {
    'Linear Regression': lr,
    'Decision Tree Regressor': dtr,
    'Random Forest Regressor': rfr,
}

for name in models:
    model = models[name]
    # fit() returns the estimator itself, so the prediction can be chained onto it.
    y_pred = model.fit(X_train_max, y_train_max).predict(X_test_max)
    mse, r2 = mean_squared_error(y_test_max, y_pred), r2_score(y_test_max, y_pred)
    print(f'{name}:\nMSE: {mse:.2f}, R-squared: {r2:.2f}\n')

Linear Regression:
MSE: 2.14, R-squared: 0.77

Decision Tree Regressor:
MSE: 2.22, R-squared: 0.76

Random Forest Regressor:
MSE: 0.99, R-squared: 0.90



In [22]:
# Hyperparameter tuning of the Random Forest for Next_Tmax.
# Results are stored under Tmax-specific names: the Next_Tmin tuning cell writes to
# the generic names (grid_search, best_model, mse_best, ...), so with generic names
# only, whichever cell ran last would silently decide what those variables hold.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [5, 10]
}

# Exhaustive search over the grid with 3-fold cross-validation on the training split,
# ranking candidates by negative MSE.
grid_search_max = GridSearchCV(rfr, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_max.fit(X_train_max, y_train_max)

# Best parameters and model performance
print(f'Best parameters: {grid_search_max.best_params_}')
best_model_max = grid_search_max.best_estimator_

# Evaluate the tuned model on the held-out split
y_pred_best_max = best_model_max.predict(X_test_max)
mse_best_max = mean_squared_error(y_test_max, y_pred_best_max)
r2_best_max = r2_score(y_test_max, y_pred_best_max)

print(f'Best model MSE: {mse_best_max}')
print(f'Best model R2 Score: {r2_best_max}')

# Generic aliases are kept so any cell that still reads the old names keeps working.
grid_search, best_model = grid_search_max, best_model_max
y_pred_best, mse_best, r2_best = y_pred_best_max, mse_best_max, r2_best_max

Best parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
Best model MSE: 0.9806188772390811
Best model R2 Score: 0.8958723221217965


In [19]:
# Hold out 20% of the rows for evaluating the Next_Tmin models.
# The unscaled features are split first and the scaler is then fitted on the
# training rows only, so no statistics from the held-out rows leak into preprocessing.
# random_state=42 keeps the row partition reproducible.
X_train_min_raw, X_test_min_raw, y_train_min, y_test_min = train_test_split(
    X, z, test_size=0.2, random_state=42
)
scaler_min = StandardScaler().fit(X_train_min_raw)
X_train_min = scaler_min.transform(X_train_min_raw)
X_test_min = scaler_min.transform(X_test_min_raw)

In [20]:
# List of models
models = {'Linear Regression': lr, 'Decision Tree Regressor': dtr, 'Random Forest Regressor': rfr}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train_min, y_train_min)
    y_pred = model.predict(X_test_min)
    mse = mean_squared_error(y_test_min, y_pred)
    r2 = r2_score(y_test_min, y_pred)
    print(f'{name}:\nMSE: {mse:.2f}, R-squared: {r2:.2f}\n')

Linear Regression:
MSE: 1.01, R-squared: 0.84

Decision Tree Regressor:
MSE: 1.21, R-squared: 0.80

Random Forest Regressor:
MSE: 0.62, R-squared: 0.90



In [21]:
# Hyperparameter tuning of the Random Forest for Next_Tmin.
# Results are stored under Tmin-specific names: the Next_Tmax tuning cell writes to
# the generic names (grid_search, best_model, mse_best, ...), so with generic names
# only, whichever cell ran last would silently decide what those variables hold.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [5, 10]
}

# Exhaustive search over the grid with 3-fold cross-validation on the training split,
# ranking candidates by negative MSE.
grid_search_min = GridSearchCV(rfr, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_min.fit(X_train_min, y_train_min)

# Best parameters and model performance
print(f'Best parameters: {grid_search_min.best_params_}')
best_model_min = grid_search_min.best_estimator_

# Evaluate the tuned model on the held-out split
y_pred_best_min = best_model_min.predict(X_test_min)
mse_best_min = mean_squared_error(y_test_min, y_pred_best_min)
r2_best_min = r2_score(y_test_min, y_pred_best_min)

print(f'Best model MSE: {mse_best_min}')
print(f'Best model R2 Score: {r2_best_min}')

# Generic aliases are kept so any cell that still reads the old names keeps working.
grid_search, best_model = grid_search_min, best_model_min
y_pred_best, mse_best, r2_best = y_pred_best_min, mse_best_min, r2_best_min

Best parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
Best model MSE: 0.6121239374336801
Best model R2 Score: 0.9003572057979496


# Temperature Forecast Project

## Steps Involved:
1. **Data Preprocessing**  
   - Handling missing values and cleaning the dataset.
   - Converting the `Date` column into a more usable format for the model by extracting features like `Day_of_Year` and `Month`.

2. **Exploratory Data Analysis (EDA)**  
   - Understanding the distribution of key variables like `Next_Tmax` and `Next_Tmin` through summary statistics (`describe()`), column dtypes, and missing-value counts.

3. **Feature Engineering**  
   - Extracted features from the `Date` column to add temporal insights such as `Day_of_Year` and `Month`.
   - Dropped the original `Date` column after feature extraction.

4. **Model Building**  
   - Implemented regression models (Linear Regression, Decision Tree Regressor, Random Forest Regressor) to predict the target variables (`Next_Tmax` and `Next_Tmin`).

5. **Model Evaluation**  
   - Evaluated models using Mean Squared Error (MSE) and R-squared (R²) score for both maximum and minimum temperature predictions.

6. **Hyperparameter Tuning**  
   - Fine-tuned the Random Forest Regressor using GridSearchCV to optimize model performance by tuning parameters like `n_estimators`, `max_depth`, and `min_samples_split`.

---

## 1. Data Preprocessing
   - Dropped rows with missing values.
   - Converted the `Date` column to datetime format.
   - Extracted day of the year and month from the `Date` column to create the features `Day_of_Year` and `Month`.
   - Dropped the original `Date` column.

## 2. Feature Engineering
   - Features used for prediction include:
     - **Day_of_Year**: Numeric representation of the day in the year.
     - **Month**: Numeric representation of the month.
     - Additional meteorological features such as temperature, relative humidity, wind speed, and solar radiation.

## 3. Model Building
   - Target Variables:
     - `Next_Tmax`: Next day's maximum temperature.
     - `Next_Tmin`: Next day's minimum temperature.
   - **Regression Models** used:
     - Linear Regression
     - Decision Tree Regressor
     - Random Forest Regressor

## 4. Model Evaluation
   - Used Mean Squared Error (MSE) and R-squared (R²) as evaluation metrics for both maximum and minimum temperature models.
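   - Initial model results (held-out test split, before tuning):
     - For **Next_Tmax**:
       - Linear Regression, Decision Tree Regressor, and Random Forest Regressor were trained and evaluated.
       - Linear Regression: MSE 2.14, R² 0.77; Decision Tree Regressor: MSE 2.22, R² 0.76; Random Forest Regressor: MSE 0.99, R² 0.90.
     - For **Next_Tmin**:
       - The same models were trained and evaluated.
       - Linear Regression: MSE 1.01, R² 0.84; Decision Tree Regressor: MSE 1.21, R² 0.80; Random Forest Regressor: MSE 0.62, R² 0.90.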

## 5. Hyperparameter Tuning
   - **GridSearchCV** was applied to fine-tune the Random Forest Regressor:
     - Parameters tuned include `n_estimators`, `max_depth`, and `min_samples_split`.
   - After tuning, the best hyperparameters were used to refit the Random Forest Regressor.

## 6. Final Results:
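   - The best model for predicting `Next_Tmax` and `Next_Tmin` after hyperparameter tuning was the Random Forest Regressor with `max_depth=20`, `min_samples_split=5`, and `n_estimators=200`. It showed:
     - A marginal improvement over the untuned Random Forest on the test split: `Next_Tmax` MSE 0.99 → 0.98 (R² ≈ 0.90) and `Next_Tmin` MSE 0.62 → 0.61 (R² ≈ 0.90).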
