In [1]:
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans


## 1. Preprocess the data by selecting the relevant features and normalizing the numerical features

In [2]:

df = pd.read_csv('Electric_Vehicle_Population_Data.csv', index_col=False)

### rename the columns and select relevant features

In [3]:
new_column_names = {
    'VIN': 'vin_number',
    'County': 'county_name',
    'City': 'city_name',
    'State': 'state_name',
    'Postal Code': 'postal_code',
    'Model Year': 'model_year',
    'Make': 'car_make',
    'Model': 'car_model',
    'Electric Vehicle Type': 'ev_type',
    'Clean Alternative Fuel Vehicle (CAFV) Eligibility': 'cafve',
    'Electric Range': 'ev_range',
    'Base MSRP': 'msrp',
    'Legislative District': 'legislative_district',
    'DOL Vehicle ID': 'dol_vehicle_id',
    'Vehicle Location': 'location',
    'Electric Utility': 'electric_utility',
    '2020 Census Tract': 'census_tract'
}

# Use the rename() method to rename the column names
df = df.rename(columns=new_column_names)


### *Convert the location feature to a Point type and split into latitude and longitude*

In [4]:
df['long'] = df['location'].apply(lambda x: float(
    x.split()[1].strip("()")) if isinstance(x, str) else None)
df['lat'] = df['location'].apply(lambda x: float(
    x.split()[2].strip("()")) if isinstance(x, str) else None)
df = df.drop('location', axis=1)

In [5]:
selected_features = ['model_year', 'car_make', 'car_model', 'ev_type', 'ev_range', 'long','lat']

### Create a new DataFrame with only the selected features

In [6]:
df_selected = df[selected_features].copy()

### Normalize the electric range feature

In [7]:
electric_range_mean = df_selected['ev_range'].mean()
electric_range_std = df_selected['ev_range'].std()
df_selected['ev_range'] = (df_selected['ev_range'] - electric_range_mean) / electric_range_std

In [10]:
df_selected.tail(10)

Unnamed: 0,model_year,car_make,car_model,ev_type,ev_range,long,lat
124706,2022,VOLVO,C40,Battery Electric Vehicle (BEV),-0.79209,-122.41666,47.30682
124707,2022,JEEP,GRAND CHEROKEE,Plug-in Hybrid Electric Vehicle (PHEV),-0.542917,-119.55125,47.31867
124708,2016,BMW,I3,Plug-in Hybrid Electric Vehicle (PHEV),-0.074472,-122.30866,47.57874
124709,2021,AUDI,E-TRON,Battery Electric Vehicle (BEV),-0.79209,-122.45516,48.74487
124710,2021,TESLA,MODEL 3,Battery Electric Vehicle (BEV),-0.79209,-122.27734,47.83785
124711,2022,TESLA,MODEL 3,Battery Electric Vehicle (BEV),-0.79209,-121.98087,47.8526
124712,2020,KIA,NIRO,Plug-in Hybrid Electric Vehicle (PHEV),-0.53295,-122.52054,47.26887
124713,2023,TESLA,MODEL Y,Battery Electric Vehicle (BEV),-0.79209,-122.49756,48.7999
124714,2018,CHEVROLET,BOLT EV,Battery Electric Vehicle (BEV),1.580035,-122.46495,47.16778
124715,2015,CHEVROLET,VOLT,Plug-in Hybrid Electric Vehicle (PHEV),-0.413347,-119.48756,46.26543


In [19]:
df_selected['lat'].iloc== null

0         False
1         False
2         False
3         False
4         False
          ...  
124711    False
124712    False
124713    False
124714    False
124715    False
Name: lat, Length: 124716, dtype: bool

## 2. Use clustering techniques to group the vehicles based on their features
.  using  `K-means` clustering

In [11]:
features_for_clustering = ['ev_range', 'long','lat']


In [12]:
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(df_selected[features_for_clustering])
cluster_labels = kmeans.labels_

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [13]:
df_selected['Cluster'] = cluster_labels

NameError: name 'cluster_labels' is not defined