In [None]:
# import  libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the EV dataset from a CSV in the working directory and preview the first rows.
data = pd.read_csv('EV_cars.csv')
data.head()

In [None]:
# (rows, columns) of the frame as loaded.
data.shape

In [None]:
# Summary statistics using pandas' default column selection.
data.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Column names as loaded from the CSV.
data.columns

In [None]:
# Missing-value count per column.
data.isnull().sum()

In [None]:
# Missing-value count for the Price.DE. column only.
data[['Price.DE.']].isnull().sum()

In [None]:
# Last rows of the frame.
data.tail()

In [None]:
# Remove the link column. Rebinding with errors='ignore' keeps this cell
# idempotent: the original in-place drop raised KeyError when the cell was
# executed a second time because the column was already gone.
data = data.drop(columns='Car_name_link', errors='ignore')

In [None]:
# Preview the frame after the Car_name_link column has been dropped.
data.head()

In [None]:
# Bar chart of the 20 car names with the highest mean Price.DE.
f = (
    data.groupby('Car_name')['Price.DE.']
    .mean()
    .nlargest(20)
)
f.plot.bar()

In [None]:
# Bar chart of the 30 car names with the highest mean Fast_charge.
f = (
    data.groupby('Car_name')['Fast_charge']
    .mean()
    .nlargest(30)
)
f.plot.bar()

In [None]:
# Bar chart of the 10 car names with the highest mean Range.
f = (
    data.groupby('Car_name')['Range']
    .mean()
    .nlargest(10)
)
f.plot.bar()

In [None]:
# Summary statistics, transposed so each described column becomes a row.
data.describe().T

In [None]:
# Count identical rows within each Car_name group and show the first five entries.
data.groupby(['Car_name']).value_counts().head()

In [None]:
# Five most frequent Car_name values (value_counts sorts by count, descending).
data['Car_name'].value_counts()[:5]

In [None]:
# Boolean mask: True for rows whose Car_name already appeared in an earlier row.
data.duplicated('Car_name')

In [None]:
# Column names at this point in the notebook.
data.columns

In [None]:
# Forward-fill gaps in the price and fast-charge columns.
# The columns still carry their raw CSV names here ('Price.DE.', 'Fast_charge');
# they are only renamed to 'Price' / 'Fast_Charge' further down, so using the
# renamed labels at this point raised KeyError on a fresh Restart & Run All.
# Assigning the result back (instead of chained inplace fillna) also avoids
# pandas' chained-assignment and fillna(method=...) deprecation warnings.
data['Price.DE.'] = data['Price.DE.'].ffill()
data['Fast_charge'] = data['Fast_charge'].ffill()

In [None]:
# Keep only the first row seen for every Car_name; the frame is modified in place.
data.drop_duplicates(subset=['Car_name'], keep='first', inplace=True)

In [None]:
# (rows, columns) after duplicate car names were removed.
data.shape

In [None]:
# Preview the de-duplicated frame.
data.head()

In [None]:
# Five car names with the largest summed Efficiency
# (nlargest defaults to n=5, so a trailing head() adds nothing).
data.groupby('Car_name')['Efficiency'].sum().nlargest(5)

In [None]:
# Pie chart of the ten car names with the largest summed Efficiency.
# The original chained nlargest().head(), which both default to 5, so the chart
# showed five slices under a "Top Ten" title; n is now explicit and matches it.
# NOTE(review): slices are Efficiency sums per Car_name, while the title speaks of
# "Car Company Contribution in the market" -- confirm the intended wording.
efficiency_1 = data.groupby(['Car_name'])['Efficiency'].sum().nlargest(10)
efficiency_1.plot(kind='pie',autopct="%0.1f%%").set_title('Top Ten Car Company Contribution in the market')

In [None]:
# Column names available for the plots that follow.
data.columns

In [None]:
# Distribution of the Range column.
sns.histplot(data=data, x='Range')

In [None]:
# Distribution of the Top_speed column.
sns.histplot(data=data, x='Top_speed')

In [None]:
# Distribution of the Price.DE. column.
sns.histplot(data=data, x='Price.DE.')

In [None]:
# Number of distinct Battery values.
data['Battery'].nunique()

In [None]:
# Battery against price; points are additionally coloured by their Battery value.
sns.scatterplot(
    data=data,
    x='Battery',
    y='Price.DE.',
    hue='Battery',
)

In [None]:
# Mean Battery per car name, drawn as a line over the grouped index.
y = (
    data.groupby('Car_name')['Battery']
    .mean()
)
plt.plot(y)

In [None]:
# Annotated correlation heatmap of the numeric columns.
# numeric_only=True is required because the frame still contains the string
# column Car_name, which makes a bare data.corr() fail on pandas 2.x.
# seaborn is already imported as `sns` in the first cell, so it is not re-imported.
sns.heatmap(data.corr(numeric_only=True), cmap='coolwarm', annot=True)

In [None]:
# Pearson correlation matrix restricted to numeric columns.
data.corr(method='pearson', numeric_only=True)

In [None]:
# Pairwise plots of the frame with histograms on the diagonal and no hue grouping.
sns.pairplot(data, hue=None,diag_kind="hist")

In [None]:
# Give the raw CSV columns cleaner names; the frame is renamed in place.
column_renames = {
    'Fast_charge': 'Fast_Charge',
    'Price.DE.': 'Price',
    'Top_speed': 'Top_Speed',
    'acceleration..0.100.': 'Acceleration_Time',
}
data.rename(columns=column_renames, inplace=True)

In [None]:
# Preview the frame with the renamed columns.
data.head()

In [None]:
# Correlation of every numeric column with Price.
# Car_name (a string column) is still part of the frame at this point, so the
# computation must be limited to numeric columns or pandas 2.x raises.
corr = data.corr(numeric_only=True)['Price']
print(corr)

In [None]:
# Preview the frame before the Car_name column is removed.
data.head()

In [None]:
# Remove the Car_name column before modelling. Rebinding with errors='ignore'
# keeps this cell idempotent: the original in-place drop raised KeyError when the
# cell was executed a second time because the column was already gone.
data = data.drop(columns='Car_name', errors='ignore')

In [None]:
#scoring and tuning 
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
#models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [None]:
# Price is the regression target; every other remaining column is a feature.
y = data['Price']
X = data.drop(columns='Price')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Linear Regression: ordinary least squares fitted on the training split.

Model = LinearRegression().fit(X_train, y_train)
y_pred = Model.predict(X_test)

# Hold-out error metrics for the fitted model.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for metric_label, metric_value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R2)", r2),
):
    print(f"{metric_label}: {metric_value:.2f}")



In [None]:
#Decision Tree Regression

# Depth-limited tree with a fixed seed so the fit is repeatable.
Dtree = DecisionTreeRegressor(max_depth=3, random_state=42)
Dtree.fit(X_train,y_train)

y_pred = Dtree.predict(X_test)

D_mae = mean_absolute_error(y_test, y_pred)
D_mse = mean_squared_error(y_test, y_pred)
# RMSE must be derived from this model's own MSE. The original used `mse`,
# the value left over from the linear-regression cell, so every model
# reported the same (wrong) RMSE.
D_rmse = np.sqrt(D_mse)
D_r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {D_mae:.2f}")
print(f"Mean Squared Error (MSE): {D_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {D_rmse:.2f}")
print(f"R-squared (R2): {D_r2:.2f}")

In [None]:
# Random forest regression: 100 depth-limited trees, fixed seed for repeatability.
rforest = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=42)

# Fit the model on the training data
rforest.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rforest.predict(X_test)

r_mae = mean_absolute_error(y_test, y_pred)
r_mse = mean_squared_error(y_test, y_pred)
# RMSE must be derived from this model's own MSE. The original used `mse`,
# the value left over from the linear-regression cell.
r_rmse = np.sqrt(r_mse)
r_r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {r_mae:.2f}")
print(f"Mean Squared Error (MSE): {r_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {r_rmse:.2f}")
print(f"R-squared (R2): {r_r2:.2f}")

In [None]:
# Polynomial regression: degree-2 feature expansion followed by linear regression.

poly_features = PolynomialFeatures(degree=2)

# Fit the expansion on the training split only, then apply the same
# transformation to the test split.
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

# Create a linear regression model
model = LinearRegression()

# Fit the model on the transformed training data
model.fit(X_train_poly, y_train)

# Make predictions on the transformed testing data
y_pred = model.predict(X_test_poly)

p_mae = mean_absolute_error(y_test, y_pred)
p_mse = mean_squared_error(y_test, y_pred)
# RMSE must be derived from this model's own MSE. The original used `mse`,
# the value left over from the linear-regression cell.
p_rmse = np.sqrt(p_mse)
p_r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {p_mae:.2f}")
print(f"Mean Squared Error (MSE): {p_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {p_rmse:.2f}")
print(f"R-squared (R2): {p_r2:.2f}")

In [None]:
# Regularised linear models (Ridge and Lasso), both with alpha=1,
# fitted on the training split.
ridge = Ridge(alpha=1).fit(X_train, y_train)
lasso = Lasso(alpha=1).fit(X_train, y_train)

# Predictions on the held-out split
y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)

# Hold-out MSE and R-squared per model.
# NOTE(review): `r2` here rebinds the name the linear-regression cell used
# for its own R-squared.
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r1 = r2_score(y_test, y_pred_ridge)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2 = r2_score(y_test, y_pred_lasso)

print(f"Mean Squared Error (Ridge): {mse_ridge:.2f}")
print(f"Mean Squared Error (Lasso): {mse_lasso: .2f}")
print(f"R-squared (R2) (Ridge): {r1: .2f}")
print(f"R-squared (R2) (Lasso): {r2: .2f}")

In [None]:
# Candidate hyperparameter values for the random forest (4 x 4 x 3 combinations).
param_grid = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[5, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# Base estimator; the seed keeps each candidate fit repeatable.
rforest = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold cross-validated search. The scorer is negated MSE,
# so a larger score means a smaller error.
grid_search = GridSearchCV(
    estimator=rforest,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination; the sign is flipped back to a plain MSE.
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", -grid_search.best_score_)

In [None]:
%%time
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Instantiate the best model from Step 4 (e.g., Random Forests)
best_model = RandomForestClassifier(n_estimators=100, max_depth=5,min_samples_split= 10)

# Create the RFECV object and fit it to the training data
selector = RFECV(best_model, step=1, cv=3, scoring='accuracy')
selector.fit(X_train, y_train)

# Get the selected features and their ranks
selected_features = X_train.columns[selector.support_]
feature_ranks = selector.ranking_

print(f"Selected features: {selected_features}")
print(f"Feature ranks: {feature_ranks}")

In [None]:
# Convert selected_features to a list
selected_features_list = selected_features.tolist()

# Remove target variable from the list of selected features if it's present
if 'Price' in selected_features_list:
    selected_features_list.remove('Price')

# Create new dataframes with only the selected features
X_train_selected = X_train[selected_features_list]
X_test_selected = X_test[selected_features_list]

In [None]:
best_model=best_model.fit(X_train_selected, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test_selected)

# Evaluate the model using accuracy_score
from sklearn.metrics import accuracy_score

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy with selected features: {test_accuracy}")

In [None]:
import json

# Persist the names of the selected feature columns as a JSON array.
selected_features_list = selected_features.tolist()

with open("selected_features.json", "w") as features_file:
    json.dump(obj=selected_features_list, fp=features_file)

In [None]:
import joblib

# Save the best model to a file
#joblib.dump(best_model, "best_model.pkl")

In [None]:
import pickle

# Serialise the fitted model. The context manager guarantees the file handle
# is flushed and closed; the original passed a bare open() to pickle.dump and
# never closed it.
with open('best_model1.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)