In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

In [2]:
def DataframetoArray(dataframe):
    feature_columns = ['Car_Name', 'Year', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']
    label_columns = 'Selling_Price'

    features = dataframe.loc[:, feature_columns]
    label = dataframe.loc[:, label_columns]
    X = features.values
    y = label.values
    return X, y

# car_price_train

In [3]:
df_train = pd.read_csv('car_price_train.csv')
df_train.head()
# print(df_train.iloc[104, :]) #Kms_Driver is CNG
df_train = df_train.drop([104])
# df_train.columns
# df_train.shape
print(df_train.isnull().sum())

Car_Name         4
Year             0
Selling_Price    0
Present_Price    3
Kms_Driven       8
Fuel_Type        3
Seller_Type      3
Transmission     1
Owner            0
dtype: int64


# car_price_test

In [4]:
df_test = pd.read_csv('car_price_test.csv')
print(df_test.shape)
df_test.isnull().sum()

(61, 9)


Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [5]:
X_train, y_train =  DataframetoArray(df_train)
X_test, y_test =  DataframetoArray(df_test)

# Missing value train

In [6]:
df_train.isnull().sum()

Car_Name         4
Year             0
Selling_Price    0
Present_Price    3
Kms_Driven       8
Fuel_Type        3
Seller_Type      3
Transmission     1
Owner            0
dtype: int64

In [7]:
car_name_impute = SimpleImputer(np.nan, strategy = 'most_frequent')
X_train[:, 0] = car_name_impute.fit_transform(X_train[:, 0].reshape(-1, 1)).ravel()

In [8]:
present_price_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
X_train[:, 2] = present_price_imputer.fit_transform(X_train[:, 2].reshape(-1, 1)).ravel()

In [9]:
kms_driven_imputer = SimpleImputer(np.nan, strategy = 'mean')
X_train[:, 3] = present_price_imputer.fit_transform(X_train[:, 3].reshape(-1, 1)).ravel()

In [10]:
value_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
X_train[:, 4:8] = value_imputer.fit_transform(X_train[:, 4:8])

In [11]:
new_df = pd.DataFrame.from_records(X_train, columns = ['Car_Name', 'Year', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'])
print(new_df.isnull().sum())
new_df.head()

Car_Name         0
Year             0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64


Unnamed: 0,Car_Name,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,Bajaj Pulsar 150,2008.0,13.6,15000,Petrol,Dealer,Manual,1.0
1,Bajaj Avenger 220,2017.0,0.95,3500,Petrol,Individual,Manual,0.0
2,Hero CBZ Xtreme,2008.0,0.787,15000,Petrol,Individual,Manual,0.0
3,etios g,2015.0,6.8,36000,Petrol,Dealer,Manual,0.0
4,Royal Enfield Thunder 350,2016.0,1.5,8700,Petrol,Individual,Manual,0.0


# Normalize

In [12]:
def Normalize(X, y):
    #Year
    minmax_scaler = MinMaxScaler()
    X[:, 1] = minmax_scaler.fit_transform(X[:, 1].reshape(-1, 1)).ravel()
    # Kms_Driven
    X[:, 3] = minmax_scaler.fit_transform(X[:, 3].reshape(-1, 1)).ravel()
    
    #Seller_Type
    label_encoder = LabelEncoder()
    X[:, 5] = label_encoder.fit_transform(X[:, 5])
    
    #Transmission
    X[:, -2] = label_encoder.fit_transform(X[:, -2])
    
    #Fuel_Type
    X[:, 4] = label_encoder.fit_transform(X[:, 4])
    
    X = np.delete(X, 0, axis = 1)
    return X, y
    

In [13]:
X_train, y_train = Normalize(X_train, y_train)
X_test, y_test = Normalize(X_test, y_test)

# Train model

## Linear

In [14]:
linear = LinearRegression()
linear.fit(X_train, y_train)

predict_train = linear.predict(X_train)
print("Root Mean Square Error Train: {}".format(np.sqrt(mean_squared_error(y_train, predict_train))))
predict_test = linear.predict(X_test)
print("Root Mean Square Error Test: {}".format(np.sqrt(mean_squared_error(y_test, predict_test))))

Root Mean Square Error Train: 1.7620326008720275
Root Mean Square Error Test: 2.0374249389015016


## Rigde

In [15]:
ridge = Ridge(alpha = 0.0001)
ridge.fit(X_train, y_train)

predict_train = ridge.predict(X_train)
print("Root Mean Square Error Train: {}".format(np.sqrt(mean_squared_error(y_train, predict_train))))

# predict_test = ridge.predict(X_test)
print("Root Mean Square Error Test: {}".format(np.sqrt(mean_squared_error(y_test, predict_test))))

Root Mean Square Error Train: 1.762032600931133
Root Mean Square Error Test: 2.0374249389015016


## Lasso

In [16]:
lasso = Lasso(alpha = 0.001)
lasso.fit(X_train, y_train)

predict_train = lasso.predict(X_train)
predict_test = lasso.predict(X_test)

print("Root Mean Square Error: {}".format(np.sqrt(mean_squared_error(y_train, predict_train))))
print("Root Mean Square Error: {}".format(np.sqrt(mean_squared_error(y_test, predict_test))))

Root Mean Square Error: 1.7620919756982845
Root Mean Square Error: 2.0511870258402873
