In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

# Load the dataset by uploading it manually in Colab.
from google.colab import files
uploaded = files.upload()  # dict of the uploaded files: name -> raw bytes

# Read the bytes that were just uploaded instead of a hard-coded path: when a
# file with the same name already exists, Colab saves the new upload under a
# different name (e.g. 'dataset (2).csv'), so reading 'dataset.csv' from disk
# can silently load an older copy.
import io
if uploaded:
    uploaded_name = next(iter(uploaded))
    dataset = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded in this run; fall back to a file already on disk.
    dataset = pd.read_csv('dataset.csv')

# Step 1: Preprocessing
# Encode the categorical 'District' column BEFORE the numeric coercion below.
# pd.to_numeric(errors='coerce') turns every non-numeric value into NaN, so
# coercing first would erase non-numeric district names and leave the encoder
# with only NaN to encode.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# astype(str) gives the encoder uniformly typed labels even if some are missing.
district_codes = label_encoder.fit_transform(dataset['District'].astype(str))

# Convert all columns to numeric where possible (unparseable cells become NaN)
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset['District'] = district_codes

# Fill remaining missing values with the column mean
dataset = dataset.fillna(dataset.mean())

# Create the target ('Total Cases') by summing the 12 columns at positions
# 37-48 (the slice 37:49 excludes position 49).
# NOTE(review): presumably the monthly case counts - confirm against the CSV layout.
case_columns = dataset.columns[37:49]
dataset['Total Cases'] = dataset.iloc[:, 37:49].sum(axis=1)

# Drop the columns that were summed into the target, and the 'Year' column
dataset = dataset.drop(case_columns, axis=1)
dataset = dataset.drop(columns=['Year'])

# Step 2: Splitting Data
# The target ('Total Cases') is the last column; everything before it is a feature.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X = feature_frame.values
Y = target_series.values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.20, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

# Step 3: Polynomial Regression
# Two pipeline stages: expand the features to degree-2 polynomial terms, then
# fit an ordinary least-squares model on the expanded features.
poly_steps = [
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression()),
]
poly_reg = Pipeline(poly_steps)

# Fit the whole pipeline on the training split
poly_reg.fit(X_train, Y_train)

# Step 4: Evaluate the model on the held-out test split
Y_pred = poly_reg.predict(X_test)

# Mean Squared Error (MSE) and R-squared (R²) of the test predictions
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

# Report both metrics rounded to two decimals
for metric_label, metric_value in (
    ("Mean Squared Error (MSE)", mse),
    ("R-squared Score (R²)", r2),
):
    print(f"{metric_label}: {metric_value:.2f}")



Saving dataset.csv to dataset.csv
Mean Squared Error (MSE): 34.69
R-squared Score (R²): -1.64


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score


# Load the dataset by uploading it manually in Colab.
from google.colab import files
uploaded = files.upload()  # dict of the uploaded files: name -> raw bytes

# Read the bytes that were just uploaded instead of a hard-coded path: when a
# file with the same name already exists, Colab saves the new upload under a
# different name (e.g. 'dataset (2).csv'), so reading 'dataset.csv' from disk
# can silently load an older copy.
import io
if uploaded:
    uploaded_name = next(iter(uploaded))
    dataset = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded in this run; fall back to a file already on disk.
    dataset = pd.read_csv('dataset.csv')

# Step 1: Preprocessing
# Encode the categorical 'District' column BEFORE the numeric coercion below.
# pd.to_numeric(errors='coerce') turns every non-numeric value into NaN, so
# coercing first would erase non-numeric district names and leave the encoder
# with only NaN to encode.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# astype(str) gives the encoder uniformly typed labels even if some are missing.
district_codes = label_encoder.fit_transform(dataset['District'].astype(str))

# Convert all columns to numeric where possible (unparseable cells become NaN)
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset['District'] = district_codes

# Fill remaining missing values with the column mean (the author reports that
# mean worked better than mode here)
dataset = dataset.fillna(dataset.mean())

# Create the target ('Total Cases') by summing the 12 columns at positions
# 37-48 (the slice 37:49 excludes position 49).
# NOTE(review): presumably the monthly case counts - confirm against the CSV layout.
case_columns = dataset.columns[37:49]
dataset['Total Cases'] = dataset.iloc[:, 37:49].sum(axis=1)

# Drop the columns that were summed into the target, and the 'Year' column
dataset = dataset.drop(case_columns, axis=1)
dataset = dataset.drop(columns=['Year'])

# Step 2: Splitting Data
# The target ('Total Cases') is the last column; everything before it is a feature.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X = feature_frame.values
Y = target_series.values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.20, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

# Step 3: Polynomial Regression with Feature Scaling
# Stages run in order: standardise the features, expand them into polynomial
# terms, then fit a linear regression on the expanded features.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('linear', LinearRegression()),
]
pipeline = Pipeline(pipeline_steps)

# Step 4: Hyperparameter Tuning for Polynomial Degree (1 to 5)
# 'poly__degree' addresses the `degree` parameter of the 'poly' stage.
param_grid = {'poly__degree': [1, 2, 3, 4, 5]}

# 5-fold cross-validated search scored by R², using all available cores.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='r2',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

# Keep the winning degree and the estimator the search selected
best_model = grid_search.best_estimator_
best_degree = grid_search.best_params_['poly__degree']

# Step 5: Evaluate the Best Model on the held-out test split
Y_pred = best_model.predict(X_test)

# Mean Squared Error (MSE) and R-squared (R²) of the test predictions
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

# Report the selected degree and both metrics
print(f"Best Polynomial Degree: {best_degree}")
for metric_label, metric_value in (
    ("Mean Squared Error (MSE)", mse),
    ("R-squared Score (R²)", r2),
):
    print(f"{metric_label}: {metric_value:.2f}")

# Optional: Cross-Validation Results
# Note: this scores the selected pipeline with 5-fold CV over the full X and Y
# (training and test rows together), not just the training split.
cv_scores = cross_val_score(best_model, X, Y, cv=5, scoring='r2')
print(f"Cross-Validation R² Scores: {cv_scores}")
print(f"Average Cross-Validation R²: {cv_scores.mean():.2f}")


Saving dataset.csv to dataset (2).csv
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Polynomial Degree: 1
Mean Squared Error (MSE): 5.66
R-squared Score (R²): 0.57
Cross-Validation R² Scores: [0.69672467 0.71580399 0.47378571 0.26712643 0.3942082 ]
Average Cross-Validation R²: 0.51
