In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor

2024-07-17 09:58:22.012562: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-17 09:58:22.107041: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-17 09:58:22.862229: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load datasets

train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [3]:
# make copy to add price back to the newly encoded df

train_df_copy = train_df.copy()
test_df_copy = test_df.copy()

# Inspect the data

In [4]:
train_df.shape, test_df.shape

((54273, 13), (36183, 12))

In [5]:
train_df.head(3)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000


In [6]:
test_df.head(3)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,54273,Mercedes-Benz,E-Class E 350,2014,73000,Gasoline,302.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,White,Beige,None reported,Yes
1,54274,Lexus,RX 350 Base,2015,128032,Gasoline,275.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,Silver,Black,None reported,Yes
2,54275,Mercedes-Benz,C-Class C 300,2015,51983,Gasoline,241.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Blue,White,None reported,Yes


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54273 entries, 0 to 54272
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            54273 non-null  int64 
 1   brand         54273 non-null  object
 2   model         54273 non-null  object
 3   model_year    54273 non-null  int64 
 4   milage        54273 non-null  int64 
 5   fuel_type     54273 non-null  object
 6   engine        54273 non-null  object
 7   transmission  54273 non-null  object
 8   ext_col       54273 non-null  object
 9   int_col       54273 non-null  object
 10  accident      54273 non-null  object
 11  clean_title   54273 non-null  object
 12  price         54273 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 5.4+ MB


# Drop columns you don't want to be encoded

In [8]:
# dropping the price so that train_df & test_df both have the same cols

train_df.drop(columns= ['id', 'price'], inplace=True)
test_df.drop(columns=['id'], inplace=True)

In [9]:
train_df.shape, test_df.shape

((54273, 11), (36183, 11))

# Identify both object and non-object columns

In [10]:
# Identify object columns and non-object columns

object_cols = train_df.select_dtypes(include=['object']).columns
non_object_cols = train_df.select_dtypes(exclude=['object']).columns

In [11]:
object_cols, non_object_cols

(Index(['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col',
        'int_col', 'accident', 'clean_title'],
       dtype='object'),
 Index(['model_year', 'milage'], dtype='object'))

In [12]:
# Separate non-object columns

train_non_object = train_df[non_object_cols]
test_non_object = test_df[non_object_cols]

In [13]:
train_non_object.head(3)

Unnamed: 0,model_year,milage
0,2018,74349
1,2007,80000
2,2009,91491


In [14]:
test_non_object.head(3)

Unnamed: 0,model_year,milage
0,2014,73000
1,2015,128032
2,2015,51983


In [15]:
# Ensure both datasets have the same columns

missing_cols = set(train_df.columns) - set(test_df.columns)
for col in missing_cols:
    test_df[col] = 0

# Start encoding process
- Encoding all object columns in both training and testing data

In [16]:
# Initialize the OneHotEncoder with sparse output

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)

In [17]:
# fit the data

encoder.fit(train_df[object_cols])

In [18]:
# transform the data

train_encoded = encoder.transform(train_df[object_cols])
test_encoded = encoder.transform(test_df[object_cols])

In [19]:
# check the types, ensure that they are sparse matrices

type(train_encoded), type(test_encoded)

(scipy.sparse._csr.csr_matrix, scipy.sparse._csr.csr_matrix)

In [20]:
# check a few of the new encoded features

encoder.get_feature_names_out()

array(['brand_Acura', 'brand_Alfa', 'brand_Aston', ...,
       'accident_At least 1 accident or damage reported',
       'accident_None reported', 'clean_title_Yes'], dtype=object)

In [21]:
# new df containing the encoded object cols for the train_df

train_one_hot_encoded_frame = pd.DataFrame.sparse.from_spmatrix(train_encoded, columns=encoder.get_feature_names_out())

In [22]:
# new df containing the encoded object cols for the test_df

test_one_hot_encoded_frame = pd.DataFrame.sparse.from_spmatrix(test_encoded, columns=encoder.get_feature_names_out())

In [23]:
train_one_hot_encoded_frame.head(3)

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,int_col_Very Light Cashmere,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [24]:
test_one_hot_encoded_frame.head(3)

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,int_col_Very Light Cashmere,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [25]:
# ensure that the both newly created object cols (encoded) dfs have the same rows

train_one_hot_encoded_frame.shape, test_one_hot_encoded_frame.shape

((54273, 3381), (36183, 3381))

## Combine both dfs

In [26]:
# Concatenate the non-object columns with the encoded object columns

train_data_final = pd.concat([train_non_object.reset_index(drop=True), train_one_hot_encoded_frame], axis=1)
test_data_final = pd.concat([test_non_object.reset_index(drop=True), test_one_hot_encoded_frame], axis=1)

In [27]:
train_data_final.shape, test_data_final.shape

((54273, 3383), (36183, 3383))

In [28]:
train_data_final.head(3)

Unnamed: 0,model_year,milage,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,...,int_col_Very Light Cashmere,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes
0,2018,74349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,2007,80000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,2009,91491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [29]:
test_data_final.head(3)

Unnamed: 0,model_year,milage,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,...,int_col_Very Light Cashmere,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes
0,2014,73000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,2015,128032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,2015,51983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


# test/train split and setup

In [30]:
# need to add 'Price' back to the train_data_final

train_data_final = pd.concat([train_data_final.reset_index(drop=True), train_df_copy['price']], axis=1)

In [31]:
train_data_final.shape

(54273, 3384)

In [32]:
train_data_final.head(3)

Unnamed: 0,model_year,milage,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,...,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes,price
0,2018,74349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,11000
1,2007,80000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,8250
2,2009,91491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,15000


In [33]:
# test/train split

X = train_data_final.drop(columns= ['price'])
y = train_data_final['price']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

In [35]:
X.shape, y.shape

((54273, 3383), (54273,))

In [36]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((43418, 3383), (10855, 3383), (43418,), (10855,))

# Helper functions to print out R2, MSE and MAE scores from below models
- make_results => takes in a model name and returns metrics
- get_test_scores => creates a nice table with my results/metrics

## Mean Absolute Error (MAE) - The average of the absolute differences between predicted values and actual values.
- Useful for understanding the average error in predictions.
## Mean Squared Error (MSE) - The average of the squared differences between predicted values and actual values.
- Useful for penalizing larger errors more heavily. Sensitive to outliers.
## R-Squared (R2) - Measures how well the regression line from the model approximates the real data points.
- Measures how well the model explains variability of the dependent variable.

## function: make_results

In [37]:
def make_results(model_name:str, model_object, metric:str):
    '''
    Arguments:
    model_name (string): user labeled the model
    model_object: a fit GridSearchCV object
    metric (string): neg_mean_absolute_error, neg_mean_squared_error or r2

    Returns a pandas df with the neg_mean_absolute_error, neg_mean_squared_error and r2 scores
    for the model with the best mean 'metric' score across all validation folds.
    '''

    # Create dictionary that maps input metric to actual metric name in GridSearchCV
    metric_dict = {'neg_mean_absolute_error': 'mean_test_neg_mean_absolute_error',
                 'neg_mean_squared_error': 'mean_test_neg_mean_squared_error',
                 'r2': 'mean_test_r2',
                 }

    # Get all the results from the CV and put them in a df
    cv_results = pd.DataFrame(model_object.cv_results_)

    # Isolate the row of the df with the max(metric) score
    best_estimator_results = cv_results.iloc[cv_results[metric_dict[metric]].idxmax(), :]

    # Extract neg_mean_absolute_error, neg_mean_squared_error, and r2 score from that row
    neg_mean_absolute_error = best_estimator_results.mean_test_neg_mean_absolute_error
    neg_mean_squared_error = best_estimator_results.mean_test_neg_mean_squared_error
    r2 = best_estimator_results.mean_test_r2

    # Create table of results
    table = pd.DataFrame({'model': [model_name],
                        'neg_mean_absolute_error': [neg_mean_absolute_error],
                        'neg_mean_squared_error': [neg_mean_squared_error],
                        'r2': [r2],
                        },
                       )

    return table

## function: get_test_scores

In [38]:
def get_test_scores(model_name:str, preds, y_test_data):
    '''
    Generate a table of test scores.

    In:
    model_name (string): how the model will be named in the output table
    preds: numpy array of test predictions
    y_test_data: numpy array of y_test data

    Out:
    table: a pandas df of neg_mean_absolute_error, neg_mean_squared_error and r2 scores for your model
    '''
    negative_mean_absolute_error = mean_absolute_error(y_test_data, preds)
    negative_mean_squared_error = mean_squared_error(y_test_data, preds)
    r2 = r2_score(y_test_data, preds)

    table = pd.DataFrame({'model': [model_name],
                        'neg_mean_absolute_error': [negative_mean_absolute_error],
                        'neg_mean_squared_error': [negative_mean_squared_error],
                        'r2': [r2]
                        })

    return table

# LinearRegression and GridSearchCV

In [39]:
# Instantiate linear regression model
lr = LinearRegression()

# Create a dictionary of hyperparameters to tune
cv_params = {'fit_intercept': [False],
             'positive': [True]             
}

# Define a set of scoring metrics to capture
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

lr1 = GridSearchCV(lr, cv_params, scoring=scoring, cv=4, refit='neg_mean_absolute_error', n_jobs=-1, verbose=1)

In [40]:
%%time

lr1.fit(X_train, y_train)

Fitting 4 folds for each of 1 candidates, totalling 4 fits




CPU times: user 8min 10s, sys: 2.82 s, total: 8min 12s
Wall time: 26min 39s


In [41]:
# Examine best score

lr1.best_score_

-19924.29408944604

In [42]:
# Obtain best parameters

lr1.best_params_

{'fit_intercept': False, 'positive': True}

In [43]:
# Call 'make_results()' on the GridSearch object

results = make_results('LR CV', lr1, 'neg_mean_absolute_error')
# results = pd.concat([results, results], axis=0)
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942


In [44]:
# Get scores on test data

lr_preds = lr1.best_estimator_.predict(X_test)



In [45]:
# Get scores on test data

lr_test_scores = get_test_scores('LR test', lr_preds, y_test)
results = pd.concat([results, lr_test_scores], axis=0)
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942
0,LR test,20031.098488,5680592000.0,0.040122


# RandomForestRegressor and GridSearchCV

In [46]:
# Instantiate random forest classifier
rfr = RandomForestRegressor()

# Create a dictionary of hyperparameters to tune
cv_params = {'max_depth': [5],
             'min_samples_split': [10],
             'n_estimators': [50]
}

# Define a set of scoring metrics to capture
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

# Instantiate the GridSearchCV object
rf1 = GridSearchCV(rfr, cv_params, scoring=scoring, cv=4, refit='neg_mean_absolute_error', n_jobs=-1, verbose=1)

In [47]:
%%time

rf1.fit(X_train, y_train)

Fitting 4 folds for each of 1 candidates, totalling 4 fits




CPU times: user 1min 7s, sys: 586 ms, total: 1min 8s
Wall time: 2min 17s


In [48]:
# Examine best score

rf1.best_score_

-19412.829282254956

In [49]:
# Obtain best parameters

rf1.best_params_

{'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 50}

In [50]:
# Call 'make_results()' on the GridSearch object

rf_cv_results = make_results('RF CV', rf1, 'neg_mean_absolute_error')
results = pd.concat([results, rf_cv_results], axis=0)
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942
0,LR test,20031.098488,5680592000.0,0.040122
0,RF CV,-19412.829282,-4900808000.0,0.048282


In [51]:
# Get scores on test data

rf_preds = rf1.best_estimator_.predict(X_test)



In [52]:
# Get scores on test data

rf_test_scores = get_test_scores('RF test', rf_preds, y_test)
results = pd.concat([results, rf_test_scores], axis=0)
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942
0,LR test,20031.098488,5680592000.0,0.040122
0,RF CV,-19412.829282,-4900808000.0,0.048282
0,RF test,19291.191762,5535498000.0,0.064639


# XGBoost and GridSearchCV

In [53]:
# Instantiate the XGBoost classifier
xgb = xgb.XGBRegressor(objective='reg:squarederror', random_state=25)

# Create a dictionary of hyperparameters to tune
cv_params = {'learning_rate': [0.2],
             'max_depth': [3],
             'n_estimators': [50]
             }

# Define a set of scoring metrics to capture
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

# Instantiate the GridSearchCV object
xgb1 = GridSearchCV(xgb, cv_params, scoring=scoring, cv=4, refit='neg_mean_absolute_error', n_jobs=-1, verbose=1)

In [54]:
%%time

xgb1.fit(X_train, y_train)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
CPU times: user 48.1 s, sys: 1.96 s, total: 50 s
Wall time: 29.5 s


In [55]:
# Examine best score
xgb1.best_score_

-18400.682969972717

In [56]:
# Examine best parameters
xgb1.best_params_

{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50}

In [57]:
# Call make_results() on the GridSearch object

xgb1_cv_results = make_results('XGB CV', xgb1, 'neg_mean_absolute_error')
results = pd.concat([results, xgb1_cv_results], axis=0)
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942
0,LR test,20031.098488,5680592000.0,0.040122
0,RF CV,-19412.829282,-4900808000.0,0.048282
0,RF test,19291.191762,5535498000.0,0.064639
0,XGB CV,-18400.68297,-4886125000.0,0.044883


In [58]:
# Get scores on test data

xgb_preds = xgb1.best_estimator_.predict(X_test)

In [59]:
# Get scores on test data

xgb_test_scores = get_test_scores('XGB test', xgb_preds, y_test)
results = pd.concat([results, xgb_test_scores], axis=0)
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942
0,LR test,20031.098488,5680592000.0,0.040122
0,RF CV,-19412.829282,-4900808000.0,0.048282
0,RF test,19291.191762,5535498000.0,0.064639
0,XGB CV,-18400.68297,-4886125000.0,0.044883
0,XGB test,18564.635809,5531917000.0,0.065244


# DecisionTreeClassifier and GridSearchCV

In [60]:
# Instantiate random forest classifier
dtc = DecisionTreeRegressor()

# Create a dictionary of hyperparameters to tune
cv_params = {'max_depth': [5],
             'min_samples_split': [10],
             'min_samples_leaf': [10]
             }

# Define a set of scoring metrics to capture
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

# Instantiate the GridSearchCV object
dtc1 = GridSearchCV(dtc, cv_params, scoring=scoring, cv=4, refit='neg_mean_absolute_error', n_jobs=-1, verbose=1)

In [61]:
%%time

dtc1.fit(X_train, y_train)

Fitting 4 folds for each of 1 candidates, totalling 4 fits




CPU times: user 2.63 s, sys: 241 ms, total: 2.87 s
Wall time: 8.83 s


In [62]:
# Examine best score

dtc1.best_score_

-19348.23736822779

In [63]:
# Examine best parameters

dtc1.best_params_

{'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 10}

In [64]:
# Call 'make_results()' on the GridSearch object

dtv_cv_results = make_results('DTC CV', dtc1, 'neg_mean_absolute_error')
results = pd.concat([results, dtv_cv_results], axis=0)
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942
0,LR test,20031.098488,5680592000.0,0.040122
0,RF CV,-19412.829282,-4900808000.0,0.048282
0,RF test,19291.191762,5535498000.0,0.064639
0,XGB CV,-18400.68297,-4886125000.0,0.044883
0,XGB test,18564.635809,5531917000.0,0.065244
0,DTC CV,-19348.237368,-4829772000.0,0.060186


In [65]:
# Get scores on test data

dtc_preds = dtc1.best_estimator_.predict(X_test)



In [66]:
# Get scores on test data

dtc_test_scores = get_test_scores('DTC test', dtc_preds, y_test)
results = pd.concat([results, dtc_test_scores], axis=0)
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942
0,LR test,20031.098488,5680592000.0,0.040122
0,RF CV,-19412.829282,-4900808000.0,0.048282
0,RF test,19291.191762,5535498000.0,0.064639
0,XGB CV,-18400.68297,-4886125000.0,0.044883
0,XGB test,18564.635809,5531917000.0,0.065244
0,DTC CV,-19348.237368,-4829772000.0,0.060186
0,DTC test,19451.246515,5511835000.0,0.068638


# Tensorflow and GridSearchCV

## NeuralNetwork Setup

In [67]:
# Convert sparse matrices to dense tensors

X_train_array = X_train.values
X_test_array = X_test.values
y_train_array = y_train.values
y_test_array = y_test.values

In [68]:
# Get the number of features

input_dim = X_train_array.shape[1]
input_dim

3383

In [69]:
# Define the model

def create_model(optimizer='adam', learning_rate=1e-3):
    model = Sequential()
    model.add(Dense(10, input_dim=input_dim, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mae'])
    return model

In [70]:
# Wrap the model using the KerasRegressor
# Needed to allow compatibility between the Tensorflow object and scikit-learn's GridSearchCV

tf_model = KerasRegressor(build_fn=create_model, verbose=1)

In [71]:
# Create a dictionary of hyperparameters to tune
cv_params = {'batch_size': [32],
             'epochs': [200, 300]
            }

# Define a set of scoring metrics to capture
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

# Instantiate the GridSearchCV object
tf_model1 = GridSearchCV(tf_model, cv_params, scoring=scoring, cv=4, refit='neg_mean_absolute_error', n_jobs=-1, verbose=1)

In [72]:
%%time

tf_model1.fit(X_train_array, y_train_array)

Fitting 4 folds for each of 2 candidates, totalling 8 fits


2024-07-17 10:28:20.117908: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-17 10:28:20.117908: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-17 10:28:20.117909: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-17 10:28:20.117911: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-17 10:28:20.120942: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-17 10:28:20.120944: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-17 10:28:20.120944: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not b

Epoch 1/300
Epoch 1/300
Epoch 1/300
Epoch 1/300
[1m 541/1018[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m1s[0m 3ms/step - loss: 7673294336.0000 - mae: 35248.5977

2024-07-17 10:28:29.706738: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 440642516 exceeds 10% of free system memory.
2024-07-17 10:28:29.865575: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 440656048 exceeds 10% of free system memory.


[1m 849/1018[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - loss: 6805466112.0000 - mae: 35114.5938Epoch 1/200
[1m 851/1018[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 3ms/step - loss: 7402331136.0000 - mae: 34596.7930

2024-07-17 10:28:30.389258: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 440656048 exceeds 10% of free system memory.
2024-07-17 10:28:30.427077: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 440642516 exceeds 10% of free system memory.


[1m 871/1018[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 3ms/step - loss: 7380293632.0000 - mae: 34559.0312Epoch 1/200
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 5468774912.0000 - mae: 34310.6172
Epoch 2/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 6157713920.0000 - mae: 34014.8711
Epoch 2/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 6627950080.0000 - mae: 34764.5312
Epoch 2/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 7224775168.0000 - mae: 34327.4688
Epoch 2/300
[1m  46/1018[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 1ms/step - loss: 5191720448.0000 - mae: 31615.9219  04.47Epoch 1/200
Epoch 1/200
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 6782364672.0000 - mae: 27036.158284
Epoch 3/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/s

2024-07-17 10:38:36.188395: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 146889860 exceeds 10% of free system memory.


[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/stepep - loss: 5496364544.0000 - mae: 20240.04
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 5193255936.0000 - mae: 19830.4375
Epoch 197/200
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 4066668288.0000 - mae: 18733.2227
[1m 787/1018[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 2ms/step - loss: 4903704064.0000 - mae: 20315.0410Epoch 185/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 6246808576.0000 - mae: 38565.9180
Epoch 199/200
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 4878098944.0000 - mae: 20151.6504
Epoch 190/200
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 3937093632.0000 - mae: 19508.8438
Epoch 202/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 5500168192.0000

2024-07-17 10:38:43.185405: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 146889860 exceeds 10% of free system memory.


[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 5921467904.0000 - mae: 20378.7012
Epoch 199/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 4805985280.0000 - mae: 19490.2910
[1m136/340[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m0s[0m 1ms/stepEpoch 208/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 3480597248.0000 - mae: 18965.3633
Epoch 204/300
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/stepep - loss: 5340717568.0000 - mae: 20325.9398
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 5235129856.0000 - mae: 20215.4043
Epoch 200/200
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 5402716672.0000 - mae: 19996.0664
Epoch 188/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 3240549376.0000 - mae: 18670.9473
Epoch 200/300
[1m1018/

2024-07-17 10:38:49.318461: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 146876328 exceeds 10% of free system memory.


[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 5626278912.0000 - mae: 19752.4258
Epoch 207/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 6720172032.0000 - mae: 20930.7871
Epoch 211/300
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 866us/step - loss: 3801767168.0000 - mae: 19594.9097
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 5850526208.0000 - mae: 20075.4453
Epoch 195/200
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 4099487232.0000 - mae: 19507.5273
Epoch 204/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 4148085504.0000 - mae: 19638.4355
Epoch 191/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 4429996544.0000 - mae: 18994.2480
Epoch 208/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss

2024-07-17 10:39:00.599953: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 146876328 exceeds 10% of free system memory.


[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 839us/step - loss: 5635175936.0000 - mae: 19720.5801
Epoch 199/300
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 981us/step - loss: 4452677632.0000 - mae: 20705.2994
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 5796135936.0000 - mae: 19815.0625
Epoch 209/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 982us/step - loss: 6151159808.0000 - mae: 20346.9375
Epoch 200/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 5470539776.0000 - mae: 20192.9902
[1m 370/1018[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m0s[0m 819us/step - loss: 4965432320.0000 - mae: 19077.1133Epoch 214/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 4851638784.0000 - mae: 20452.2949
Epoch 219/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 834us/step - loss: 49270

2024-07-17 10:40:46.802014: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 146889860 exceeds 10% of free system memory.


[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 5353538048.0000 - mae: 19837.83201
Epoch 279/300
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 571us/step - loss: 10697401344.0000 - mae: 23737.87
[1m 138/1018[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 736us/step - loss: 4287959808.0000 - mae: 19429.5781

2024-07-17 10:40:47.179207: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 146889860 exceeds 10% of free system memory.


[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 613us/stepep - loss: 4724122624.0000 - mae: 19322.42
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 863us/step - loss: 7220253184.0000 - mae: 21257.1191
Epoch 295/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 724us/step - loss: 4635955712.0000 - mae: 19431.2969
Epoch 280/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 695us/step - loss: 3638231808.0000 - mae: 18966.9023
Epoch 296/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 714us/step - loss: 4020799488.0000 - mae: 18889.7480
Epoch 281/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 685us/step - loss: 4705466368.0000 - mae: 19388.3613
Epoch 297/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 635us/step - loss: 3270156288.0000 - mae: 19021.9238
Epoch 282/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 673

2024-07-17 10:40:52.044127: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 146876328 exceeds 10% of free system memory.


[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 490us/stepep - loss: 5501355520.0000 - mae: 18927.75
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 663us/step - loss: 5233849856.0000 - mae: 19055.7324
Epoch 287/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 663us/step - loss: 4829892096.0000 - mae: 19279.0391
Epoch 288/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 654us/step - loss: 4656275968.0000 - mae: 19070.8496
Epoch 289/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 627us/step - loss: 3837089280.0000 - mae: 18877.6133
Epoch 290/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 635us/step - loss: 6019085312.0000 - mae: 20111.0762
Epoch 291/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 621us/step - loss: 3866378496.0000 - mae: 19041.8047
Epoch 292/300
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 612

2024-07-17 10:41:02.031943: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 146876328 exceeds 10% of free system memory.


[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 439us/step


  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/300
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 631us/step - loss: 5443931136.0000 - mae: 33271.0039
Epoch 2/300
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 684us/step - loss: 4142927872.0000 - mae: 20063.2227
Epoch 3/300
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 625us/step - loss: 5547051008.0000 - mae: 19777.1738
Epoch 4/300
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 643us/step - loss: 4610949632.0000 - mae: 19912.8633
Epoch 5/300
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 615us/step - loss: 4511784448.0000 - mae: 19516.8711
Epoch 6/300
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 632us/step - loss: 3812902912.0000 - mae: 19199.9648
Epoch 7/300
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 618us/step - loss: 5652322304.0000 - mae: 20094.5645
Epoch 8/300
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [73]:
# Examine best score

tf_model1.best_score_

-20667.467647854428

In [74]:
# Examine best parameters

tf_model1.best_params_

{'batch_size': 32, 'epochs': 300}

In [75]:
# Call 'make_results()' on the GridSearch object

tf_cv_results = make_results('TF CV', tf_model1, 'neg_mean_absolute_error')
results = pd.concat([results, tf_cv_results], axis=0)
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942
0,LR test,20031.098488,5680592000.0,0.040122
0,RF CV,-19412.829282,-4900808000.0,0.048282
0,RF test,19291.191762,5535498000.0,0.064639
0,XGB CV,-18400.68297,-4886125000.0,0.044883
0,XGB test,18564.635809,5531917000.0,0.065244
0,DTC CV,-19348.237368,-4829772000.0,0.060186
0,DTC test,19451.246515,5511835000.0,0.068638
0,TF CV,-20667.467648,-4686534000.0,0.09472


In [76]:
# Get scores on test data

tf_preds = tf_model1.best_estimator_.predict(X_test_array)

[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 476us/step


In [77]:
# Get scores on test data

tf_test_scores = get_test_scores('TF test', tf_preds, y_test_array)
results = pd.concat([results, tf_test_scores], axis=0)
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942
0,LR test,20031.098488,5680592000.0,0.040122
0,RF CV,-19412.829282,-4900808000.0,0.048282
0,RF test,19291.191762,5535498000.0,0.064639
0,XGB CV,-18400.68297,-4886125000.0,0.044883
0,XGB test,18564.635809,5531917000.0,0.065244
0,DTC CV,-19348.237368,-4829772000.0,0.060186
0,DTC test,19451.246515,5511835000.0,0.068638
0,TF CV,-20667.467648,-4686534000.0,0.09472
0,TF test,17489.360555,5341247000.0,0.097463


# Selecting the appropriate model

Looking at the scores above it is clear that the Tensorflow model has the best fit/predictions against both the training and test sets. As such, the object is to predict the prices using this Tensorflow model. Below is the code to do just that

# Now re-run to get a new prediction on the test data set
- previously we used predicted on the train data set

In [78]:
# Convert the test data to a dense tensor for processing

test_final_array = test_data_final.values

In [79]:
%%time

# Running Tensorflow model on the final dataset 
final_tf_prediction = tf_model1.predict(test_final_array)

[1m1131/1131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 398us/step
CPU times: user 1.54 s, sys: 437 ms, total: 1.98 s
Wall time: 1.54 s


# Output File

In [80]:
submission = test_df_copy[['id']]

In [81]:
submission["price"] = final_tf_prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission["price"] = final_tf_prediction


In [82]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36183 entries, 0 to 36182
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      36183 non-null  int64  
 1   price   36183 non-null  float32
dtypes: float32(1), int64(1)
memory usage: 424.1 KB


In [83]:
# Format the output

pd.options.display.float_format = '{:.3f}'.format

In [84]:
submission

Unnamed: 0,id,price
0,54273,30676.840
1,54274,19012.117
2,54275,42983.664
3,54276,55154.234
4,54277,23065.941
...,...,...
36178,90451,88641.492
36179,90452,12851.711
36180,90453,12501.518
36181,90454,56224.906


In [85]:
submission.to_csv("./modelSubmission1.csv", index=None)