**Importing necessary libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation/future warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Apply seaborn's default plot theme to all subsequent matplotlib figures.
sns.set()

In [65]:
# Load the dataset; the relative path is resolved against the notebook's working directory.
data = pd.read_csv('Vehicle_Performance.csv')

In [66]:
data.shape

(398, 9)

In [67]:
data.describe()

Unnamed: 0,origin,cylinders,displacement,weight,acceleration,year,Kilometer_per_liter
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,1.572864,5.454774,193.425879,2970.424623,15.56809,1976.01005,9.997073
std,0.802055,1.701004,104.269838,846.841774,2.757689,3.697627,3.322917
min,1.0,3.0,68.0,1613.0,8.0,1970.0,3.826293
25%,1.0,4.0,104.25,2223.75,13.825,1973.0,7.440015
50%,1.0,4.0,148.5,2803.5,15.5,1976.0,9.778305
75%,2.0,8.0,262.0,3608.0,17.175,1979.0,12.329168
max,3.0,8.0,455.0,5140.0,24.8,1982.0,19.811697


In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   origin               398 non-null    int64  
 1   cylinders            398 non-null    int64  
 2   displacement         398 non-null    float64
 3   horsepower           398 non-null    object 
 4   weight               398 non-null    int64  
 5   acceleration         398 non-null    float64
 6   year                 398 non-null    int64  
 7   name                 398 non-null    object 
 8   Kilometer_per_liter  398 non-null    float64
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


**Type casting necessary attributes**

In [69]:
data['origin'].unique()

array([1, 3, 2], dtype=int64)

In [70]:
# Origin contains only 3 unique values (1, 2, 3) representing 3 countries, so it is a
# categorical label rather than a quantity: cast it to object dtype.
data['origin'] = data['origin'].astype('object')

In [71]:
data['cylinders'].unique()

array([8, 4, 6, 3, 5], dtype=int64)

In [72]:
# Cylinders also contains only 5 unique values (3, 4, 5, 6, 8), so it's better to type cast it as an object.
data['cylinders'] = data['cylinders'].astype('object')

In [73]:
data['horsepower'].head()

0    130
1    165
2    150
3    150
4    140
Name: horsepower, dtype: object

In [74]:
# Horsepower was read as object dtype, so at least some entries are not plain numbers.
# Try the integer cast; if it fails, print each distinct non-numeric entry (in order of
# first appearance) so we know what has to be cleaned before casting.
try:
    data['horsepower'] = data['horsepower'].astype('int')
except (ValueError, TypeError):
    # Only conversion failures are expected here; anything else should surface.
    # Compare on the string form so that non-string entries (e.g. NaN) cannot
    # break the digit check.
    horsepower_text = data['horsepower'].astype(str)
    anomaly = horsepower_text[~horsepower_text.str.isdigit()].unique().tolist()
    print(anomaly)

['?']


In [75]:
# Treat the '?' placeholder as missing (NaN) and impute it with the mean of the
# observed horsepower values. Going through NaN instead of a -1 sentinel keeps
# the placeholder rows from dragging the mean down, and assigning the result
# back to the column avoids an in-place replace on a column selection.
# The cell is safe to re-run: an already-numeric column passes through unchanged.
horsepower_numeric = pd.to_numeric(data['horsepower'].mask(data['horsepower'] == '?'))
data['horsepower'] = horsepower_numeric.fillna(horsepower_numeric.mean())

In [76]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   origin               398 non-null    object 
 1   cylinders            398 non-null    object 
 2   displacement         398 non-null    float64
 3   horsepower           398 non-null    float64
 4   weight               398 non-null    int64  
 5   acceleration         398 non-null    float64
 6   year                 398 non-null    int64  
 7   name                 398 non-null    object 
 8   Kilometer_per_liter  398 non-null    float64
dtypes: float64(4), int64(2), object(3)
memory usage: 28.1+ KB


**Exploratory Data Analysis (EDA)**