In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

# car_price_train

In [2]:
df_train = pd.read_csv('car_price_train.csv')
df_train.head()
print(df_train.iloc[104, :]) #Kms_Driver is CNG
df_train = df_train.drop([104])
# df_train.columns
# df_train.shape
# df_train['Transmission'].value_counts().count()

Car_Name            NaN
Year               2.95
Selling_Price      7.74
Present_Price     49998
Kms_Driven          CNG
Fuel_Type        Dealer
Seller_Type      Manual
Transmission          0
Owner               NaN
Name: 104, dtype: object


In [3]:
feature_columns = ['Car_Name', 'Year', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']
label_columns = 'Selling_Price'

features = df_train.loc[:, feature_columns]
label = df_train.loc[:, label_columns]

In [4]:
X_train = features.values
y_train = label.values

# Missing value

In [5]:
df_train.isnull().sum()

Car_Name         4
Year             0
Selling_Price    0
Present_Price    3
Kms_Driven       8
Fuel_Type        3
Seller_Type      3
Transmission     1
Owner            0
dtype: int64

In [6]:
car_name_impute = SimpleImputer(np.nan, strategy = 'most_frequent')
X_train[:, 0] = car_name_impute.fit_transform(X_train[:, 0].reshape(-1, 1)).ravel()

In [7]:
present_price_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
X_train[:, 2] = present_price_imputer.fit_transform(X_train[:, 2].reshape(-1, 1)).ravel()

In [8]:
kms_driven_imputer = SimpleImputer(np.nan, strategy = 'mean')
X_train[:, 3] = present_price_imputer.fit_transform(X_train[:, 3].reshape(-1, 1)).ravel()

In [9]:
value_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
X_train[:, 4:8] = value_imputer.fit_transform(X_train[:, 4:8])

In [10]:
features = pd.DataFrame.from_records(X_train, columns = feature_columns)
print(features.isnull().sum())
features.head()

Car_Name         0
Year             0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64


Unnamed: 0,Car_Name,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,Bajaj Pulsar 150,2008.0,13.6,15000,Petrol,Dealer,Manual,1.0
1,Bajaj Avenger 220,2017.0,0.95,3500,Petrol,Individual,Manual,0.0
2,Hero CBZ Xtreme,2008.0,0.787,15000,Petrol,Individual,Manual,0.0
3,etios g,2015.0,6.8,36000,Petrol,Dealer,Manual,0.0
4,Royal Enfield Thunder 350,2016.0,1.5,8700,Petrol,Individual,Manual,0.0


## Normalize

###  Year, Kms_Driven

In [11]:
minmax_scaler = MinMaxScaler()
X_train[:, 1] = minmax_scaler.fit_transform(X_train[:, 1].reshape(-1, 1)).ravel()
X_train[:, 3] = minmax_scaler.fit_transform(X_train[:, 3].reshape(-1, 1)).ravel()

### Seller_Type

In [12]:
# print(df_train['Seller_Type'].value_counts().count())
label_encoder = LabelEncoder()
X_train[:, 5] = label_encoder.fit_transform(X_train[:, 5])

### Transmission

In [13]:
print(df_train['Transmission'].value_counts().count())
label_encoder = LabelEncoder()
X_train[:, -2] = label_encoder.fit_transform(X_train[:, -2])

# one_hot = OneHotEncoder()
# fuel_type = one_hot.fit_transform(X_train[:, -2].reshape(-1, 1)).toarray()
# X_train = np.delete(X_train, -2, axis = 1)
# X_train = np.concatenate((X_train[:, :-2], fuel_type, X_train[:, -2:]), axis = 1)

2


###  Fuel_Type

In [14]:
#  print(df_train['Fuel_Type'].value_counts().count())
# label_encoder = LabelEncoder()
# X_train[:, 4] = label_encoder.fit_transform(X_train[:, 4])
one_hot = OneHotEncoder()
fuel_type = one_hot.fit_transform(X_train[:, 4].reshape(-1, 1)).toarray()
X_train = np.delete(X_train, 4, axis = 1)
X_train = np.concatenate((X_train[:, :4], fuel_type, X_train[:, 4:]), axis = 1)

In [15]:
X_train = np.delete(X_train, 0, axis = 1)

# car_price_test

In [16]:
df_test = pd.read_csv('car_price_test.csv')
df_test.head()
df_test.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [17]:
features_test = df_train.loc[:, feature_columns]
label_test = df_train.loc[:, label_columns]

X_test = features.values
y_test = label.values

## Normalize

In [18]:
minmax_scaler = MinMaxScaler()
X_test[:, 1] = minmax_scaler.fit_transform(X_test[:, 1].reshape(-1, 1)).ravel()
X_test[:, 3] = minmax_scaler.fit_transform(X_test[:, 3].reshape(-1, 1)).ravel()

In [19]:
label_encoder = LabelEncoder()
X_test[:, 5] = label_encoder.fit_transform(X_test[:, 5])

In [20]:
label_encoder = LabelEncoder()
X_test[:, -2] = label_encoder.fit_transform(X_test[:, -2])

# one_hot = OneHotEncoder()
# fuel_type = one_hot.fit_transform(X_test[:, -2].reshape(-1, 1)).toarray()
# X_test = np.delete(X_test, -2, axis = 1)
# X_test = np.concatenate((X_test[:, :-2], fuel_type, X_test[:, -2:]), axis = 1)

In [21]:
# label_encoder = LabelEncoder()
# X_test[:, 4] = label_encoder.fit_transform(X_test[:, 4])

one_hot = OneHotEncoder()
fuel_type = one_hot.fit_transform(X_test[:, 4].reshape(-1, 1)).toarray()
X_test = np.delete(X_test, 4, axis = 1)
X_test = np.concatenate((X_test[:, :4], fuel_type, X_test[:, 4:]), axis = 1)

In [22]:
X_test = np.delete(X_test, 0, axis = 1)
X_test

array([[0.3571428571428612, 13.6, 0.029029029029029027, ..., 0, 1, 1.0],
       [1.0, 0.95, 0.006006006006006006, ..., 1, 1, 0.0],
       [0.3571428571428612, 0.787, 0.029029029029029027, ..., 1, 1, 0.0],
       ...,
       [0.785714285714306, 3.45, 0.03203203203203203, ..., 1, 1, 1.0],
       [0.5714285714285836, 10.0, 0.1378198198198198, ..., 0, 1, 0.0],
       [1.0, 1.78, 0.007007007007007007, ..., 1, 1, 0.0]], dtype=object)

# Train model

## Linear

In [23]:
linear = LinearRegression()
linear.fit(X_train, y_train)

predict_train = linear.predict(X_train)
print("Root Mean Square Error Train: {}".format(np.sqrt(mean_squared_error(y_train, predict_train))))
predict_test = linear.predict(X_test)
print("Root Mean Square Error Test: {}".format(np.sqrt(mean_squared_error(y_test, predict_test))))

Root Mean Square Error Train: 1.7409686778109805
Root Mean Square Error Test: 1.7409686778109805


## Rigde

In [24]:
ridge = Ridge(alpha = 0.01)
ridge.fit(X_train, y_train)

predict_train = ridge.predict(X_train)
print("Root Mean Square Error Train: {}".format(np.sqrt(mean_squared_error(y_train, predict_train))))

predict_test = ridge.predict(X_test)
print("Root Mean Square Error Test: {}".format(np.sqrt(mean_squared_error(y_test, predict_test))))

Root Mean Square Error Train: 1.7409694726810852
Root Mean Square Error Test: 1.7409694726810852


## Lasso

In [25]:
lasso = Lasso(alpha = 0.001)
lasso.fit(X_train, y_train)

predict_train = lasso.predict(X_train)
predict_test = lasso.predict(X_test)

print("Root Mean Square Error: {}".format(np.sqrt(mean_squared_error(y_train, predict_train))))
print("Root Mean Square Error: {}".format(np.sqrt(mean_squared_error(y_test, predict_test))))

Root Mean Square Error: 1.7410972321928218
Root Mean Square Error: 1.7410972321928218
