In [2]:
import pandas as pd
import numpy as np
# Load the raw car fuel-efficiency dataset; the path is relative to the
# notebook's working directory.
df=pd.read_csv("car_fuel_efficiency.csv")
# Last expression in the cell, so the notebook renders the first rows as a
# quick sanity check of the columns and of the missing values.
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [3]:

def data_prep(seed_value,df):
    """Shuffle the rows of ``df`` and split them 60/20/20 into train/val/test.

    Only ``engine_displacement``, ``horsepower``, ``vehicle_weight``,
    ``model_year`` and the target ``fuel_efficiency_mpg`` are kept. The seed
    is printed and then used to seed NumPy's global random state, so the same
    seed always yields the same split.

    Parameters
    ----------
    seed_value : seed passed to ``np.random.seed``.
    df : DataFrame containing at least the five columns listed above.

    Returns
    -------
    tuple
        ``(df_train, df_val, df_test, y_train, y_val, y_test)``. The frames
        hold the four feature columns with a fresh 0..n-1 index; the ``y_*``
        arrays hold the matching target values on their original scale
        (no log transform is applied).
    """
    print(seed_value)
    np.random.seed(seed_value)

    target = 'fuel_efficiency_mpg'
    selected = df[['engine_displacement',
                   'horsepower',
                   'vehicle_weight',
                   'model_year',
                   target]]

    # A single shuffled row order drives all three partitions.
    total = len(selected)
    order = np.arange(total)
    np.random.shuffle(order)

    # Validation and test each take int(20%) of the rows; training keeps
    # whatever remains, so rounding never drops a record.
    holdout = int(total * 0.2)
    train_end = total - 2 * holdout
    val_end = train_end + holdout

    frames = []
    targets = []
    for rows in (order[:train_end], order[train_end:val_end], order[val_end:]):
        # Reset the index so each partition is numbered from zero.
        part = selected.iloc[rows].reset_index(drop=True)
        targets.append(part[target].values)
        # The target must not remain in the feature matrix.
        frames.append(part.drop(columns=target))

    return (*frames, *targets)

In [4]:
def prepare_X(df,mean):
    """Return the feature matrix of ``df`` with missing horsepower filled in.

    Parameters
    ----------
    df : DataFrame with a ``horsepower`` column; it is not modified.
    mean : when truthy, missing horsepower values are replaced with the mean
        horsepower of ``df`` itself; otherwise they are replaced with 0.

    Returns
    -------
    numpy.ndarray
        ``.values`` of the filled frame. Only ``horsepower`` is filled;
        missing values in other columns pass through unchanged.
    """
    fill_value = df.horsepower.mean() if mean else 0
    # assign() works on a copy, so the caller's frame keeps its NaNs.
    filled = df.assign(horsepower=df.horsepower.fillna(fill_value))
    return filled.values

def train_linear_regression(X,y):
    """Fit ordinary least squares with the normal equation.

    Parameters
    ----------
    X : array whose rows are samples and whose columns are features.
    y : target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the bias term and ``w`` holds one weight
        per feature column.

    Notes
    -----
    ``np.linalg.inv`` raises ``LinAlgError`` when ``X^T X`` is singular.
    """
    # Prepend a column of ones so the first weight acts as the bias.
    design = np.column_stack((np.ones(X.shape[0]), X))

    # w = (X^T X)^(-1) X^T y
    gram_inv = np.linalg.inv(design.T.dot(design))
    weights = gram_inv.dot(design.T).dot(y)

    bias, coefficients = weights[0], weights[1:]
    return bias, coefficients

def train_linear_regression_regularised(X,y,r=0.001):
    """Fit linear regression with the normal equation plus a diagonal penalty.

    Parameters
    ----------
    X : array whose rows are samples and whose columns are features.
    y : target values, one per row of ``X``.
    r : amount added to every diagonal entry of ``X^T X`` before inversion
        (the bias entry included).

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the bias term and ``w`` holds one weight
        per feature column.
    """
    n_rows = X.shape[0]
    # The leading column of ones carries the bias term.
    with_bias = np.column_stack((np.ones(n_rows), X))

    gram = with_bias.T.dot(with_bias)
    # Regularisation step: shift the diagonal by r before inverting.
    penalised = gram + r * np.eye(gram.shape[0])

    # w = (X^T X + r I)^(-1) X^T y
    solution = np.linalg.inv(penalised).dot(with_bias.T).dot(y)
    return solution[0], solution[1:]

def rmse(y,y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Both arguments are expected to be array-like objects that support
    element-wise subtraction and whose squared difference has ``.mean()``.
    """
    return np.sqrt(((y - y_pred) ** 2).mean())

In [5]:
# Final evaluation: split with seed 9, then fold the train and validation
# partitions back together so the model is fitted on both of them.
df_train,df_val,df_test,y_train,y_val,y_test = data_prep(9,df)
# concat keeps each partition's own 0-based index, so labels repeat in df_full;
# that is harmless here because prepare_X only returns the underlying values.
df_full=pd.concat([df_train,df_val])
y_full=np.concatenate([y_train,y_val])

# mean=False fills missing horsepower with 0.
# NOTE: despite its name, x_train holds the combined train+validation features.
x_train = prepare_X(df=df_full, mean=False)

# Regularised normal-equation fit; w0 is the bias, w the per-feature weights.
w0,w = train_linear_regression_regularised(x_train, y_full,r=0.001)

# Score on the held-out test partition, prepared the same way as the training data.
X_test = prepare_X(df_test, mean=False)
y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)
# Last expression in the cell, so the notebook displays the test RMSE.
score

9


np.float64(0.5156261299185628)