In [37]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid')


data_train_path = ("Data_Train.xlsx")
data_test_path = ("Data_Test.xlsx")

data_train = pd.read_excel(data_train_path)
data_test = pd.read_excel(data_test_path)
data_train.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


## The Data-Preprocessing part

In [38]:
df1 = data_train[data_train.isna().any(axis=1)]
df2 = data_test[data_test.isna().any(axis=1)]

# Null present as string in the data 
data_test_null_string = data_test[data_test['Power'].str.contains('null' , na = True)]


columns_to_be_filled_with_NaN = ["Power"]

def fill_columns_with_NaN(columns_to_be_filled_with_NaN):
    for i in range (len(columns_to_be_filled_with_NaN)):    
        data_train[columns_to_be_filled_with_NaN[i]] = data_train[columns_to_be_filled_with_NaN[i]].replace(to_replace="[null]" , value = np.NaN , regex = True)
        data_test[columns_to_be_filled_with_NaN[i]] = data_test[columns_to_be_filled_with_NaN[i]].replace(to_replace="[null]" , value = np.NaN , regex = True)

fill_columns_with_NaN(columns_to_be_filled_with_NaN)

empty_cols_list_train = ["Mileage","Engine","Power","Seats"] 
empty_cols_list_test = ["Engine","Power","Seats"] 


def my_Nan_filling_function(dataset , empty_cols_list):
    for i in range(len(empty_cols_list)):
        
        # Replacing Nan with mean values
        # dataset[empty_cols_list[i]].fillna(dataset[empty_cols_list[i]].mean , inplace = True)
        
        # Replacing Nan with mode values
        dataset[empty_cols_list[i]].fillna(dataset[empty_cols_list[i]].mode()[0] , inplace = True)

    print(dataset)

my_Nan_filling_function(data_train , empty_cols_list_train)

my_Nan_filling_function(data_test , empty_cols_list_test)

list_of_columns_with_units = ["Mileage" , "Engine" , "Power"] 

def remove_units_from_columns(list_of_columns_with_units):
    for i in range(len(list_of_columns_with_units)):
        data_train[list_of_columns_with_units[i]] = data_train[list_of_columns_with_units[i]].str.split(" " , expand = True)
        data_train[list_of_columns_with_units[i]] = data_train[list_of_columns_with_units[i]].astype("float")
        data_test[list_of_columns_with_units[i]] = data_test[list_of_columns_with_units[i]].str.split(" " , expand = True)
        data_test[list_of_columns_with_units[i]] = data_test[list_of_columns_with_units[i]].astype("float")

remove_units_from_columns(list_of_columns_with_units)

data_train["Year"] = data_train["Year"].astype("int32" , copy = False)
data_test["Year"] = data_test["Year"].astype("int32" , copy = False)

                                  Name    Location  Year  Kilometers_Driven  \
0               Maruti Wagon R LXI CNG      Mumbai  2010              72000   
1     Hyundai Creta 1.6 CRDi SX Option        Pune  2015              41000   
2                         Honda Jazz V     Chennai  2011              46000   
3                    Maruti Ertiga VDI     Chennai  2012              87000   
4      Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013              40670   
...                                ...         ...   ...                ...   
6014                  Maruti Swift VDI       Delhi  2014              27365   
6015          Hyundai Xcent 1.1 CRDi S      Jaipur  2015             100000   
6016             Mahindra Xylo D4 BSIV      Jaipur  2012              55000   
6017                Maruti Wagon R VXI     Kolkata  2013              46000   
6018             Chevrolet Beat Diesel   Hyderabad  2011              47000   

     Fuel_Type Transmission Owner_Type     Mileage 

## Removing the outliers part

In [39]:
def get_numerical_features(dataset):
    df = dataset
    df_numerics_only = df.select_dtypes(include=np.number)
    # print(df_numerics_only)
    colnames_numerics_only = df.select_dtypes(include=np.number).columns.tolist()
    # print(colnames_numerics_only)
    return colnames_numerics_only

df_train = get_numerical_features(data_train)
print(df_train)

df_test = get_numerical_features(data_test)
print(df_test)

print(data_train.shape)

remove_train_outliers_list = df_train
remove_test_outliers_list = df_test

def remove_outliers(dataset , remove_outliers_list):
    # Iterating across the list of numerical categories for which the outliers have to be removed
    for i in range(len(remove_outliers_list)):
        # Change the value of the outlier criterion for better results
        dataset.drop(dataset[dataset[remove_outliers_list[i]] >=( 0.99999 * dataset[remove_outliers_list[i]].max()) ].index, inplace = True )

remove_outliers(data_train , remove_train_outliers_list)
remove_outliers(data_test , remove_test_outliers_list)

print(data_train.shape)

['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats', 'Price']
['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats']
(6019, 12)
(5903, 12)


## Skipping the EDA part (in the other notebook)

In [6]:
"""
 Summary of some of the vairables
"""

'''Actual train data '''
# data_train  
'''Actual train data xlsx file '''
# data_train_path   
'''Actual train data csv file '''
# data_train_csv_path 
''' Only Numerical Features of train data  '''
# df_train  


'''Actual test data '''
# data_test  
'''Actual test data xlsx file '''
# data_test_path   
'''Actual test data csv file '''
# data_test_csv_path 
''' Only Numerical Features of test data  '''
# df_test  


' Only Numerical Features of test data  '

# Feature Engineering and Feature Selection

### Univariate Feature Selection

- Constant, Quasi-Constant and Duplicate Feature Removal


In [7]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold

In [8]:
data_train.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


In [9]:
'''
CONSTANT FEATURES REMOVAL
 Dropping Categorical variables Temporarily
 This is necessary for this particaular feature selection method
'''
X = data_train.drop(["Name","Location","Fuel_Type", "Transmission" , "Owner_Type" ,  "Price"] , axis = 1)
y = data_train["Price"]

X.shape , y.shape

((5903, 6), (5903,))

In [10]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 0 )
X_train.shape , X_test.shape , y_train.shape , y_test.shape 

((4722, 6), (1181, 6), (4722,), (1181,))

In [12]:
# Removing the constant features
constant_filter.get_support().sum()

6

In [13]:
# inverting the filter list
constant_list = [not temp for temp in constant_filter.get_support()]
print(constant_list)

[False, False, False, False, False, False]


In [14]:
# Printing the list of constant features
X.columns[constant_list]

Index([], dtype='object')

In [15]:
'''
 Transforming the dataset into non-constant feaure space
 Bascially removing the constant features
 done to prevent overfitting of the model
 '''
X_train_filter = constant_filter.transform(X_train)
X_test_filter = constant_filter.transform(X_test)

In [16]:
X_train_filter.shape, X_test_filter.shape, X_train.shape , X_test.shape

((4722, 6), (1181, 6), (4722, 6), (1181, 6))

In [17]:
'''
QUASI CONSTANT FEATURE REMOVAL
Removing feature almost constant or somewhat near to constant(Quasi Constant)
'''
# This method removes features with variation below a certain cutoff.
quasi_constant_filter = VarianceThreshold(threshold = 0.01)

In [18]:
quasi_constant_filter.fit(X_train_filter)

VarianceThreshold(threshold=0.01)

In [19]:
quasi_constant_filter.get_support().sum()

6

In [20]:
X_train_quasi_filter = quasi_constant_filter.transform(X_train_filter)
X_test_quasi_filter = quasi_constant_filter.transform(X_test_filter)
X_train.shape,X_test.shape,X_train_filter.shape,X_test_filter.shape, X_train_quasi_filter.shape,X_test_quasi_filter.shape

((4722, 6), (1181, 6), (4722, 6), (1181, 6), (4722, 6), (1181, 6))

In [43]:
X_train_T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4712,4713,4714,4715,4716,4717,4718,4719,4720,4721
0,2011.0,2015.0,2007.0,2012.0,2009.0,2017.0,2014.0,2010.0,2017.0,2014.0,...,2008.0,2015.0,2013.0,2002.0,2012.0,2013.0,2013.0,2017.0,2012.0,2014.0
1,68000.0,39208.0,79000.0,47629.0,42000.0,35000.0,71649.0,38000.0,38056.0,34233.0,...,86000.0,57266.0,69000.0,75000.0,52000.0,30852.0,43000.0,47357.0,60000.0,62497.0
2,17.0,17.57,17.0,15.1,13.0,18.19,25.1,18.5,23.9,14.21,...,13.93,20.5,22.7,0.0,15.26,22.74,20.5,18.49,19.81,15.8
3,1497.0,1193.0,1086.0,2179.0,2987.0,1968.0,1498.0,1197.0,1582.0,2143.0,...,2179.0,1248.0,1498.0,2112.0,2143.0,796.0,1598.0,1493.0,1086.0,1499.0
4,118.0,88.7,74.0,140.0,210.0,174.5,98.6,80.0,126.2,203.0,...,138.0,91.72,89.84,74.0,203.2,47.3,105.0,100.0,68.05,110.0
5,5.0,5.0,5.0,7.0,5.0,5.0,5.0,5.0,5.0,5.0,...,7.0,5.0,5.0,6.0,5.0,5.0,5.0,7.0,5.0,5.0


In [21]:
'''
DUPLICATE FEATURE REMOVAL
'''
X_train_T = X_train_quasi_filter.T
X_test_T = X_test_quasi_filter.T
type(X_train_T),type(X_test_T)

(numpy.ndarray, numpy.ndarray)

In [22]:
# Changing it back to Pandas Dataframe
X_train_T = pd.DataFrame(X_train_T)
X_test_T = pd.DataFrame(X_test_T)
X_train_T.shape , X_test_T.shape

((6, 4722), (6, 1181))

In [23]:
X_train_T.duplicated().sum()

0

In [24]:
# Finding the duplicated features
duplicated_features = X_train_T.duplicated()
print(duplicated_features)

0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool


In [25]:
# Inverting the dupkicated list to find the features to keep
features_to_keep = [not index for index in duplicated_features]
print(features_to_keep)

[True, True, True, True, True, True]


In [26]:
# After removing constant , quasi-constant and duplicate features
X_train_unique = X_train_T[features_to_keep].T
X_test_unique = X_test_T[features_to_keep].T
X_train_unique.shape,X_train.shape

((4722, 6), (4722, 6))

In [27]:
'''
UNFORTUNATELY,there are:
0 constant
0 quasi-constant 
0 duplicate features
'''

'\nUNFORTUNATELY,there are:\n0 constant\n0 quasi-constant \n0 duplicate features\n'

In [28]:
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error , mean_squared_error , r2_score

In [29]:
X_train

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats
1862,2011,68000,17.00,1497.0,118.00,5.0
5181,2015,39208,17.57,1193.0,88.70,5.0
5861,2007,79000,17.00,1086.0,74.00,5.0
4805,2012,47629,15.10,2179.0,140.00,7.0
1477,2009,42000,13.00,2987.0,210.00,5.0
...,...,...,...,...,...,...
5023,2013,30852,22.74,796.0,47.30,5.0
3330,2013,43000,20.50,1598.0,105.00,5.0
1687,2017,47357,18.49,1493.0,100.00,7.0
2657,2012,60000,19.81,1086.0,68.05,5.0


In [30]:
def run_Linear_Regression(X_train, X_test, y_train , y_test):
    print("The Results of Linear Regression Model")
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    print("The R2 score is:" , r2_score(y_test, y_predict))
    print("The RMSE is:" , np.sqrt(mean_squared_error(y_test , y_predict)))
    print("The Standard deviation of y is ",np.std(y))
    print()
    
run_Linear_Regression(X_train , X_test, y_train , y_test)
run_Linear_Regression(X_train_filter , X_test_filter, y_train , y_test)
run_Linear_Regression(X_train_quasi_filter , X_test_quasi_filter, y_train , y_test)
run_Linear_Regression(X_train_unique , X_test_unique, y_train , y_test)

The Results of Linear Regression Model
The R2 score is: 0.6943010059050618
The RMSE is: 5.7615207169111855
The Standard deviation of y is  10.63176352420769

The Results of Linear Regression Model
The R2 score is: 0.6943010059050618
The RMSE is: 5.7615207169111855
The Standard deviation of y is  10.63176352420769

The Results of Linear Regression Model
The R2 score is: 0.6943010059050618
The RMSE is: 5.7615207169111855
The Standard deviation of y is  10.63176352420769

The Results of Linear Regression Model
The R2 score is: 0.6943010059050618
The RMSE is: 5.7615207169111855
The Standard deviation of y is  10.63176352420769

