In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the CSV data into a DataFrame
data = pd.read_csv("weather_data_2024.csv")

In [3]:
# Check for missing values (optional)
print(data.isnull().sum())  # This will display the number of missing values in each column

Date                    0
Month                   0
Max_Temperature_(°F)    0
Avg_Temperature_(°F)    0
Low_Temperature_(°F)    0
Max_Dew Point_(°F)      0
Avg_Dew Point_(°F)      0
Low_Dew Point_(°F)      0
Max_Humidity_(%)        0
Avg_Humidity_(%)        0
Low_Humidity_(%)        0
Max_WindSpeed_(mph)     0
Avg_WindSpeed_(mph)     0
Low_WindSpeed_(mph)     0
Max_Pressure_(in)       0
Avg_Pressure_(in)       0
Low_Pressure_(in)       0
Precipitation_(in)      0
dtype: int64


In [4]:
data = pd.read_csv("weather_data_2024.csv")

In [5]:
print(data.columns)

Index(['Date', 'Month', 'Max_Temperature_(°F)', 'Avg_Temperature_(°F)',
       'Low_Temperature_(°F)', 'Max_Dew Point_(°F)', 'Avg_Dew Point_(°F)',
       'Low_Dew Point_(°F)', 'Max_Humidity_(%)', 'Avg_Humidity_(%)',
       'Low_Humidity_(%)', 'Max_WindSpeed_(mph)', 'Avg_WindSpeed_(mph)',
       'Low_WindSpeed_(mph)', 'Max_Pressure_(in)', 'Avg_Pressure_(in)',
       'Low_Pressure_(in)', 'Precipitation_(in)'],
      dtype='object')


In [6]:
# Select features (independent variables)
features = [
    "Max_Temperature_(°F)",
    "Avg_Temperature_(°F)",
    "Max_Humidity_(%)",
    "Avg_WindSpeed_(mph)"
]

In [7]:
# Target variable (dependent variable)
target = "Max_Temperature_(°F)"  # Replace with the variable to predict

In [8]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

In [9]:
# Model 1: Linear Regression
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

In [10]:
# Model 2: Random Forest Regression
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)  # Adjust n_estimators as needed
model_rf.fit(X_train, y_train)

In [11]:
# Make predictions on the testing set for both models
predictions_lr = model_lr.predict(X_test)
predictions_rf = model_rf.predict(X_test)

In [12]:
print("Models created and trained successfully!")

Models created and trained successfully!


In [13]:
from sklearn.metrics import mean_squared_error, r2_score

Calculate evaluation metrics:

In [14]:
# MSE (Mean Squared Error) for both models
mse_lr = mean_squared_error(y_test, predictions_lr)
mse_rf = mean_squared_error(y_test, predictions_rf)

In [15]:
# R-squared for both models
r2_lr = r2_score(y_test, predictions_lr)
r2_rf = r2_score(y_test, predictions_rf)

In [16]:
# Print the results
print("Linear Regression MSE:", mse_lr)
print("Linear Regression R-squared:", r2_lr)
print("Random Forest Regression MSE:", mse_rf)
print("Random Forest Regression R-squared:", r2_rf)

Linear Regression MSE: 0.0
Linear Regression R-squared: 1.0
Random Forest Regression MSE: 0.10207894736842078
Random Forest Regression R-squared: 0.9817209821428572


Compare Model Performance

In [17]:
# Sample new data (replace with actual values)
new_data = {
    "Max_Temperature_(°F)": [70],
    "Avg_Temperature_(°F)": [65],
    "Max_Humidity_(%)": [80],
    "Avg_WindSpeed_(mph)": [10]
}

In [18]:
# Convert the new data to a DataFrame
new_data_df = pd.DataFrame(new_data)

In [19]:
# Make predictions using the chosen model
predictions = model_lr.predict(new_data_df)  # Replace model_lr with the chosen model

In [20]:
# Print the prediction
print("Predicted Precipitation:", predictions[0])

Predicted Precipitation: 70.0


The new results obtained are more realistic for weather prediction. Here's a breakdown of what they indicate:

Linear Regression:

MSE: 0.0: This is still an unusually low value, suggesting a near-perfect fit between predictions and actual values. It might be due to:
Overfitting: The model might be memorizing the training data too well, leading to poor performance on unseen data.
Limited data complexity: If the data doesn't have a lot of variation, a simple model like linear regression might achieve a perfect fit, but it might not generalize well.
Random Forest Regression:

MSE: 0.102: This is a more reasonable value for weather prediction, indicating some error between predictions and actual values.
R-squared: 0.982: This is a high R-squared value, indicating the model explains a large proportion of the variance in the target variable.
Recommendations:

Investigate Overfitting (if concerned about Linear Regression):

Use validation set evaluation (as discussed previously) to assess overfitting.
Consider regularization techniques (L1 or L2) to prevent overfitting.
Choose the Model Based on the Needs:

Random Forest Regression: Due to the high R-squared and a more realistic MSE, Random Forest Regression seems to be performing better in this case.
However, if interpretability is crucial, Linear Regression might be preferred even with a risk of overfitting (as it's easier to understand the relationships between features and predictions).
Further Exploration:

Feature Engineering: Experiment with creating new features from existing ones. This might improve model performance.
Hyperparameter Tuning: Consider tuning hyperparameters (e.g., number of trees in Random Forest) to potentially improve model accuracy.
More Complex Models: Explore other machine learning models like Gradient Boosting Regression or Support Vector Regression to see if they outperform the current choices.
Overall, the new results are a positive sign. We have a more realistic assessment of the model's performance. By addressing overfitting concerns (if applicable) and exploring further improvements, we can continue to refine the weather prediction models.

Gradient Boosting Regression

In [21]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [22]:
# Load the CSV data into a DataFrame
data = pd.read_csv("weather_data_2024.csv")

In [23]:
# Print the first 5 rows of the DataFrame
print(data.head())

       Date  Month  Max_Temperature_(°F)  Avg_Temperature_(°F)  \
0  1/1/2024      1                    52                  47.3   
1  1/2/2024      1                    51                  49.6   
2  1/3/2024      1                    50                  48.2   
3  1/4/2024      1                    48                  46.7   
4  1/5/2024      1                    46                  45.1   

   Low_Temperature_(°F)  Max_Dew Point_(°F)  Avg_Dew Point_(°F)  \
0                    43                  49                41.8   
1                    47                  49                47.0   
2                    46                  47                45.2   
3                    45                  44                41.0   
4                    43                  41                39.3   

   Low_Dew Point_(°F)  Max_Humidity_(%)  Avg_Humidity_(%)  Low_Humidity_(%)  \
0                  34                98              81.9                62   
1                  45                95   

In [24]:
# Print the last 5 rows of the DataFrame
print(data.tail())

         Date  Month  Max_Temperature_(°F)  Avg_Temperature_(°F)  \
86  3/27/2024      3                    45                  42.6   
87  3/28/2024      3                    49                  42.2   
88  3/29/2024      3                    52                  46.8   
89  3/30/2024      3                    54                  47.8   
90  3/31/2024      3                    46                  46.0   

    Low_Temperature_(°F)  Max_Dew Point_(°F)  Avg_Dew Point_(°F)  \
86                    41                  41                39.0   
87                    34                  37                34.8   
88                    42                  45                41.6   
89                    44                  42                40.6   
90                    46                  42                42.0   

    Low_Dew Point_(°F)  Max_Humidity_(%)  Avg_Humidity_(%)  Low_Humidity_(%)  \
86                  36                95              87.3                78   
87                  32

In [25]:
data['Date'] = pd.to_datetime(data['Date'])  # Convert to datetime
# Extract features like year, month, day, etc.
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
# Or create a numerical representation
data['Days_Since_Ref'] = (data['Date'] - pd.to_datetime('2024-01-01')).dt.days

In [44]:
# Assuming 'Max_Temperature_(°F)' is the target variable 
if pd.api.types.is_string_dtype(data['Max_Temperature_(°F)']):
  data['Max_Temperature_(°F)'] = pd.to_numeric(data['Max_Temperature_(°F)'], errors='coerce')  # Convert to numeric, handling errors

In [27]:
# Separate features (X) and target variable (y) from the DataFrame
# Replace "target_column" with the actual name of the target variable (e.g., 'Precipitation_(in)')
target_column = "Max_Temperature_(°F)"  # Replace with the target variable
X = data.drop(target_column, axis=1)
y = data[target_column]

In [28]:
# Split data into training and testing sets (adjust test_size as needed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Define the model (consider feature scaling if necessary)
model_gbr = GradientBoostingRegressor()

In [30]:
# Define hyperparameter grid for GridSearchCV (adjust parameters as needed)
param_grid = {'learning_rate': [0.01, 0.1, 1],  # Learning rate
              'n_estimators': [100, 200, 500],  # Number of boosting stages
              'max_depth': [3, 5, 8]}  # Maximum depth of individual regression trees

In [31]:
# Create the GridSearchCV object (consider adding validation data for cv)
grid_gbr = GridSearchCV(model_gbr, param_grid, cv=5, scoring='neg_mean_squared_error')

In [32]:
# Check data types of all features (including converted dates)
print(data.dtypes)

Date                    datetime64[ns]
Month                            int32
Max_Temperature_(°F)             int64
Avg_Temperature_(°F)           float64
Low_Temperature_(°F)             int64
Max_Dew Point_(°F)               int64
Avg_Dew Point_(°F)             float64
Low_Dew Point_(°F)               int64
Max_Humidity_(%)                 int64
Avg_Humidity_(%)               float64
Low_Humidity_(%)                 int64
Max_WindSpeed_(mph)              int64
Avg_WindSpeed_(mph)            float64
Low_WindSpeed_(mph)              int64
Max_Pressure_(in)              float64
Avg_Pressure_(in)              float64
Low_Pressure_(in)              float64
Precipitation_(in)               int64
Year                             int32
Day                              int32
Days_Since_Ref                   int64
dtype: object


In [33]:
# Ensure target variable is numeric (assuming 'Max_Temperature_(°F)')
if pd.api.types.is_string_dtype(data['Max_Temperature_(°F)']):
  data['Max_Temperature_(°F)'] = pd.to_numeric(data['Max_Temperature_(°F)'], errors='coerce')  # Convert to numeric, handling errors

In [34]:
# Separate features (X) and target variable (y) again after potential changes
X = data.drop(target_column, axis=1)
y = data[target_column]

In [35]:
# Examine first few rows after data preprocessing
print(data.head())

        Date  Month  Max_Temperature_(°F)  Avg_Temperature_(°F)  \
0 2024-01-01      1                    52                  47.3   
1 2024-01-02      1                    51                  49.6   
2 2024-01-03      1                    50                  48.2   
3 2024-01-04      1                    48                  46.7   
4 2024-01-05      1                    46                  45.1   

   Low_Temperature_(°F)  Max_Dew Point_(°F)  Avg_Dew Point_(°F)  \
0                    43                  49                41.8   
1                    47                  49                47.0   
2                    46                  47                45.2   
3                    45                  44                41.0   
4                    43                  41                39.3   

   Low_Dew Point_(°F)  Max_Humidity_(%)  Avg_Humidity_(%)  ...  \
0                  34                98              81.9  ...   
1                  45                95              90.7  ...

In [36]:
from sklearn.preprocessing import StandardScaler  # Import StandardScaler

In [37]:
# Feature scaling (optional)
scaler = StandardScaler()  # Or MinMaxScaler()
X = scaler.fit_transform(data.drop(target_column, axis=1))
y = data[target_column]

DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int64DType'>)

In [38]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
# Define the model
model_gbr = GradientBoostingRegressor()

# Define hyperparameter grid (adjust parameters as needed)
param_grid = {'learning_rate': [0.01, 0.1, 1],  # Learning rate
              'n_estimators': [100, 200, 500],  # Number of boosting stages
              'max_depth': [3, 5, 8]}  # Maximum depth of individual regression trees

In [40]:
# Create the GridSearchCV object
grid_gbr = GridSearchCV(model_gbr, param_grid, cv=5, scoring='neg_mean_squared_error')  # Use negative MSE for regression

In [41]:
# Fit the GridSearchCV object (consider setting error_score='raise' for detailed debugging)
grid_gbr.fit(X_train, y_train)

ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\I346462\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\I346462\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\I346462\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\ensemble\_gb.py", line 659, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\I346462\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\I346462\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py", line 1263, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "C:\Users\I346462\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py", line 879, in check_array
    dtype_orig = np.result_type(*dtypes_orig)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
numpy.exceptions.DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int64DType'>)


In [42]:
# Best model selection
best_model_gbr = grid_gbr.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [43]:
# Evaluate on test set (optional)
predictions_gbr_test = best_model_gbr.predict(X_test)
mse_gbr_test = mean_squared_error(y_test, predictions_gbr_test)
r2_gbr_test = r2_score(y_test, predictions_gbr_test)
print("Gradient Boosting MSE (test):", mse_gbr_test)
print("Gradient Boosting R-squared (test):", r2_gbr_test)

NameError: name 'best_model_gbr' is not defined