<a href="https://colab.research.google.com/github/harshavaka04/ev-vehicle-forecasting-internship_harshavaka/blob/main/evforecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset (update DATA_PATH as needed)
DATA_PATH = "3ae033f50fa345051652.csv"
df = pd.read_csv(DATA_PATH)

# Handle commas in the numeric column if present (e.g. "1,234" -> 1234.0).
# Series.replace with regex=True strips the separator inside string cells and
# leaves cells that are already numeric untouched.
df["Electric Vehicle (EV) Total"] = (
    df["Electric Vehicle (EV) Total"].replace(",", "", regex=True).astype(float)
)

# Check initial rows
print(df.head())

# Basic info
print("Initial Shape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
df.info()
print("Null Values:\n", df.isnull().sum())

# Handle missing values: unknown locations get a placeholder label, while rows
# without an EV total are dropped.
df["County"] = df["County"].fillna("Unknown")
df["State"] = df["State"].fillna("Unknown")
# .copy() detaches the filtered frame from the original so that the column
# assignments below (and in later cells) do not raise SettingWithCopyWarning.
df = df[df["Electric Vehicle (EV) Total"].notnull()].copy()

# Convert Date column to datetime; values that cannot be parsed become NaT
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df = df[df["Date"].notnull()].copy()  # Remove invalid dates

# -----------------------------
# Outlier Capping (NOT removal)
# -----------------------------
# Compute the IQR for 'Percent Electric Vehicles'. Values further than
# 1.5 * IQR outside the quartiles are treated as outliers and pulled back to
# the nearest bound instead of being dropped, so no rows are lost.
pct_ev = df['Percent Electric Vehicles']
Q1 = pct_ev.quantile(0.25)
Q3 = pct_ev.quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print('Lower Bound:', lower_bound)
print('Upper Bound:', upper_bound)

# Cap outliers to bounds. Series.clip clamps into [lower_bound, upper_bound]
# in one pass and leaves missing values as they are.
df['Percent Electric Vehicles'] = pct_ev.clip(lower=lower_bound, upper=upper_bound)

# Re-check outliers after capping (re-read the column: it was just replaced)
pct_ev = df['Percent Electric Vehicles']
remaining_outliers = df[(pct_ev < lower_bound) | (pct_ev > upper_bound)]
print("✅ Outliers remaining after capping:", remaining_outliers.shape[0])  # This should be 0

# Final data check
print("✅ Final cleaned dataset shape:", df.shape)
print(df.head())


                Date          County State Vehicle Primary Use  \
0  September 30 2022       Riverside    CA           Passenger   
1   December 31 2022  Prince William    VA           Passenger   
2    January 31 2020          Dakota    MN           Passenger   
3       June 30 2022           Ferry    WA               Truck   
4       July 31 2021         Douglas    CO           Passenger   

  Battery Electric Vehicles (BEVs) Plug-In Hybrid Electric Vehicles (PHEVs)  \
0                                7                                        0   
1                                1                                        2   
2                                0                                        1   
3                                0                                        0   
4                                0                                        1   

   Electric Vehicle (EV) Total Non-Electric Vehicle Total Total Vehicles  \
0                          7.0                      