<a href="https://colab.research.google.com/github/ixcel87/machineLearning/blob/main/combined2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [2]:
train_df = pd.read_csv('/content/sample_data/train.csv')
test_df = pd.read_csv('/content/sample_data/test.csv')

In [3]:
# make copy to add price back to the newly encoded df

train_df_copy = train_df.copy()
test_df_copy = test_df.copy()

In [4]:
train_df.shape, test_df.shape

((54273, 13), (36183, 12))

In [5]:
train_df.head(3)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000


In [6]:
test_df.head(3)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,54273,Mercedes-Benz,E-Class E 350,2014,73000,Gasoline,302.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,White,Beige,None reported,Yes
1,54274,Lexus,RX 350 Base,2015,128032,Gasoline,275.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,Silver,Black,None reported,Yes
2,54275,Mercedes-Benz,C-Class C 300,2015,51983,Gasoline,241.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Blue,White,None reported,Yes


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54273 entries, 0 to 54272
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            54273 non-null  int64 
 1   brand         54273 non-null  object
 2   model         54273 non-null  object
 3   model_year    54273 non-null  int64 
 4   milage        54273 non-null  int64 
 5   fuel_type     54273 non-null  object
 6   engine        54273 non-null  object
 7   transmission  54273 non-null  object
 8   ext_col       54273 non-null  object
 9   int_col       54273 non-null  object
 10  accident      54273 non-null  object
 11  clean_title   54273 non-null  object
 12  price         54273 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 5.4+ MB


# Drop columns you don't want to be encoded

In [8]:
# dropping the price so that train_df & test_df both have the same cols

train_df.drop(columns= ['id', 'price'], inplace=True)
test_df.drop(columns=['id'], inplace=True)

In [9]:
train_df.shape, test_df.shape

((54273, 11), (36183, 11))

# Identify both object and non-object columns

In [10]:
# Identify object columns and non-object columns

object_cols = train_df.select_dtypes(include=['object']).columns
non_object_cols = train_df.select_dtypes(exclude=['object']).columns

In [11]:
object_cols, non_object_cols

(Index(['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col',
        'int_col', 'accident', 'clean_title'],
       dtype='object'),
 Index(['model_year', 'milage'], dtype='object'))

In [12]:
# Separate non-object columns

train_non_object = train_df[non_object_cols]
test_non_object = test_df[non_object_cols]

In [13]:
train_non_object.head(3)

Unnamed: 0,model_year,milage
0,2018,74349
1,2007,80000
2,2009,91491


In [14]:
test_non_object.head(3)

Unnamed: 0,model_year,milage
0,2014,73000
1,2015,128032
2,2015,51983


In [15]:
# Ensure both datasets have the same columns

missing_cols = set(train_df.columns) - set(test_df.columns)
for col in missing_cols:
    test_df[col] = 0

# Start encoding process

In [16]:
# Initialize the OneHotEncoder with sparse output

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)

In [17]:
encoder.fit(train_df[object_cols])

In [18]:
train_encoded = encoder.transform(train_df[object_cols])
test_encoded = encoder.transform(test_df[object_cols])

In [19]:
type(train_encoded), type(test_encoded)

(scipy.sparse._csr.csr_matrix, scipy.sparse._csr.csr_matrix)

In [20]:
encoder.get_feature_names_out()

array(['brand_Acura', 'brand_Alfa', 'brand_Aston', ...,
       'accident_At least 1 accident or damage reported',
       'accident_None reported', 'clean_title_Yes'], dtype=object)

In [21]:
# new df containing the encoded object cols for the train_df

train_one_hot_encoded_frame = pd.DataFrame.sparse.from_spmatrix(train_encoded, columns=encoder.get_feature_names_out())

In [22]:
# new df containing the encoded object cols for the test_df

test_one_hot_encoded_frame = pd.DataFrame.sparse.from_spmatrix(test_encoded, columns=encoder.get_feature_names_out())

In [23]:
train_one_hot_encoded_frame.head(3)

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,int_col_Very Light Cashmere,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [24]:
test_one_hot_encoded_frame.head(3)

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,int_col_Very Light Cashmere,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [25]:
# ensure that the both newly created object cols (encoded) dfs have the same rows

train_one_hot_encoded_frame.shape, test_one_hot_encoded_frame.shape

((54273, 3381), (36183, 3381))

## Combine both dfs

In [26]:
# Concatenate the non-object columns with the encoded object columns

train_data_final = pd.concat([train_non_object.reset_index(drop=True), train_one_hot_encoded_frame], axis=1)
test_data_final = pd.concat([test_non_object.reset_index(drop=True), test_one_hot_encoded_frame], axis=1)

In [27]:
train_data_final.shape, test_data_final.shape

((54273, 3383), (36183, 3383))

In [28]:
train_data_final.head(3)

Unnamed: 0,model_year,milage,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,...,int_col_Very Light Cashmere,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes
0,2018,74349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,2007,80000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,2009,91491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [29]:
test_data_final.head(3)

Unnamed: 0,model_year,milage,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,...,int_col_Very Light Cashmere,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes
0,2014,73000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,2015,128032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,2015,51983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


# test/train split and setup

In [30]:
# need to add 'Price' back to the train_data_final

train_data_final = pd.concat([train_data_final.reset_index(drop=True), train_df_copy['price']], axis=1)

In [31]:
train_data_final.shape

(54273, 3384)

In [32]:
train_data_final.head(3)

Unnamed: 0,model_year,milage,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,...,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes,price
0,2018,74349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,11000
1,2007,80000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,8250
2,2009,91491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,15000


In [33]:
# test/train split

X = train_data_final.drop(columns= ['price'])
y = train_data_final['price']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

In [35]:
X.shape, y.shape

((54273, 3383), (54273,))

In [36]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((43418, 3383), (10855, 3383), (43418,), (10855,))

# Helper functions to print out accuracy/loss from below models

- make_results => takes in a model name and returns metrics
- get_test_scores => creates a nice table with my results/metrics

## function: make_results

In [37]:
def make_results(model_name:str, model_object, metric:str):
    '''
    Arguments:
    model_name (string): user labeled the model
    model_object: a fit GridSearchCV object
    metric (string): neg_mean_absolute_error, neg_mean_squared_error or r2

    Returns a pandas df with the neg_mean_absolute_error, neg_mean_squared_error and r2 scores
    for the model with the best mean 'metric' score across all validation folds.
    '''

    # Create dictionary that maps input metric to actual metric name in GridSearchCV

    metric_dict = {'neg_mean_absolute_error': 'mean_test_neg_mean_absolute_error',
                 'neg_mean_squared_error': 'mean_test_neg_mean_squared_error',
                 'r2': 'mean_test_r2',
                 }

    # Get all the results from the CV and put them in a df
    cv_results = pd.DataFrame(model_object.cv_results_)

    # Isolate the row of the df with the max(metric) score
    best_estimator_results = cv_results.iloc[cv_results[metric_dict[metric]].idxmax(), :]

    # Extract neg_mean_absolute_error, neg_mean_squared_error, and r2 score from that row

    neg_mean_absolute_error = best_estimator_results.mean_test_neg_mean_absolute_error
    neg_mean_squared_error = best_estimator_results.mean_test_neg_mean_squared_error
    r2 = best_estimator_results.mean_test_r2

    # Create table of results
    table = pd.DataFrame({'model': [model_name],
                        'neg_mean_absolute_error': [neg_mean_absolute_error],
                        'neg_mean_squared_error': [neg_mean_squared_error],
                        'r2': [r2],
                        },
                       )

    return table

## function: get_test_scores

In [38]:
def get_test_scores(model_name:str, preds, y_test_data):
    '''
    Generate a table of test scores.

    In:
    model_name (string): how the model will be named in the output table
    preds: numpy array of test predictions
    y_test_data: numpy array of y_test data

    Out:
    table: a pandas df of neg_mean_absolute_error, neg_mean_squared_error and r2 scores for your model
    '''
    negative_mean_absolute_error = mean_absolute_error(y_test_data, preds)
    negative_mean_squared_error = mean_squared_error(y_test_data, preds)
    r2 = r2_score(y_test_data, preds)

    table = pd.DataFrame({'model': [model_name],
                        'neg_mean_absolute_error': [negative_mean_absolute_error],
                        'neg_mean_squared_error': [negative_mean_squared_error],
                        'r2': [r2]
                        })

    return table

# LinearRegression and GridSearchCV

In [39]:
# Instantiate linear regression model
lr = LinearRegression()

# Create a dictionary of hyperparameters to tune
cv_params = {'fit_intercept': [True, False],
             'positive': [True, False]
}

# Define a set of scoring metrics to capture
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

lr1 = GridSearchCV(lr, cv_params, scoring=scoring, cv=4, refit='neg_mean_absolute_error', n_jobs=-1, verbose=1)

In [40]:
%%time

lr1.fit(X_train, y_train)

Fitting 4 folds for each of 4 candidates, totalling 16 fits




CPU times: user 14min 40s, sys: 6.17 s, total: 14min 47s
Wall time: 27min 36s


In [41]:
# Examine best score

lr1.best_score_

-19924.29408944604

In [42]:
# Obtain best parameters

lr1.best_params_

{'fit_intercept': False, 'positive': True}

In [43]:
# Call 'make_results()' on the GridSearch object

results = make_results('LR CV', lr1, 'neg_mean_absolute_error')
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942


In [44]:
# Get scores on test data

lr_preds = lr1.best_estimator_.predict(X_test)



In [45]:
# Get scores on test data

lr_test_scores = get_test_scores('LR test', lr_preds, y_test)
results = pd.concat([results, lr_test_scores], axis=0)
results

Unnamed: 0,model,neg_mean_absolute_error,neg_mean_squared_error,r2
0,LR CV,-19924.294089,-5036322000.0,0.012942
0,LR test,20031.098488,5680592000.0,0.040122


# RandomForestRegressor and GridSearchCV

In [46]:
# Instantiate random forest classifier
rfr = RandomForestRegressor()

# Create a dictionary of hyperparameters to tune
cv_params = {'max_depth': [5, None],
             'max_features': [1.0, 2.0],
             'max_samples': [None, 0.7],
             'min_samples_leaf': [1, 2],
             'min_samples_split': [2, 3],
             'n_estimators': [200]
}

# Define a set of scoring metrics to capture
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

# Instantiate the GridSearchCV object
rf1 = GridSearchCV(rfr, cv_params, scoring=scoring, cv=4, refit='neg_mean_absolute_error', n_jobs=-1)

In [None]:
%%time

rf1.fit(X_train, y_train)



In [None]:
# Examine best score

rf1.best_score_

In [None]:
# Obtain best parameters

rf1.best_params_

In [None]:
# Call 'make_results()' on the GridSearch object

results = make_results('RF CV', rf1, 'neg_mean_absolute_error')
results

In [None]:
# Get scores on test data

rf_preds = rf1.best_estimator_.predict(X_test)

In [None]:
# Get scores on test data

rf_test_scores = get_test_scores('RF test', rf_preds, y_test)
results = pd.concat([results, rf_test_scores], axis=0)
results

# XGBoost and GridSearchCV

In [None]:
# Instantiate the XGBoost classifier
xgb = xgb.XGBRegressor(objective='reg:squarederror', random_state=25)

# Create a dictionary of hyperparameters to tune
cv_params = {'learning_rate': [0.1, 0.01],
             'max_depth': [3, 4],
             'min_child_weight': [2, 5],
             'n_estimators': [100, 200]
             }

# Define a set of scoring metrics to capture
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

# Instantiate the GridSearchCV object
xgb1 = GridSearchCV(xgb, cv_params, scoring=scoring, cv=4, refit='neg_mean_absolute_error', n_jobs=-1)

In [None]:
%%time

xgb1.fit(X_train, y_train)

In [None]:
# Examine best score
xgb1.best_score_

In [None]:
# Examine best parameters
xgb1.best_params_

In [None]:
# Call make_results() on the GridSearch object

xgb1_cv_results = make_results('XGB CV', xgb1, 'neg_mean_absolute_error')
results = pd.concat([results, xgb1_cv_results], axis=0)
results

In [None]:
# Get scores on test data

xgb_preds = xgb1.best_estimator_.predict(X_test)

In [None]:
# Get scores on test data

xgb_test_scores = get_test_scores('XGB test', xgb_preds, y_test)
results = pd.concat([results, xgb_test_scores], axis=0)
results

# DecisionTreeClassifier and GridSearchCV

In [None]:
# Instantiate random forest classifier
dtc = DecisionTreeRegressor()

# Create a dictionary of hyperparameters to tune
cv_params = {'max_depth': [None, 10, 20],
             'max_features': ['auto', 'sqrt'],
             'max_samples': [None, 0.7],
             'min_samples_leaf': [2, 5],
             'min_samples_split': [1, 4],
             'max_leaf_nodes': [None, 10, 20],
             'n_estimators': [100, 200]
             }

# Define a set of scoring metrics to capture
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

# Instantiate the GridSearchCV object
dtc1 = GridSearchCV(dtc, cv_params, scoring=scoring, cv=4, refit='neg_mean_absolute_error')

In [None]:
%%time

dtc1.fit(X_train, y_train)

In [None]:
# Examine best score

dtc1.best_score_

In [None]:
# Examine best parameters

dtc1.best_params_

In [None]:
# Call 'make_results()' on the GridSearch object

results = make_results('DTC CV', dtc1, 'neg_mean_absolute_error')
results

In [None]:
# Get scores on test data

dtc_preds = dtc1.best_estimator_.predict(X_test)

In [None]:
# Get scores on test data

dtc_test_scores = get_test_scores('DTC test', dtc_preds, y_test)
results = pd.concat([results, dtc_test_scores], axis=0)
results

# SupportVectorRegressor and GridSearchCV

In [None]:
# Instantiate random forest classifier
svr = SVR()

# Create a dictionary of hyperparameters to tune
cv_params = {'C': [1, 10],
             'epsilon': [0.01],
             'kernel': ['linear', 'rbf']
            }

# Define a set of scoring metrics to capture
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

# Instantiate the GridSearchCV object
svr1 = GridSearchCV(dtc, cv_params, scoring=scoring, cv=4, refit='neg_mean_absolute_error', n_jobs=-1)

In [None]:
%%time

svr1.fit(X_train, y_train)

In [None]:
# Examine best score

svr1.best_score_

In [None]:
# Examine best parameters

svr1.best_params_

In [None]:
# Call 'make_results()' on the GridSearch object

results = make_results('SVR CV', svr1, 'neg_mean_absolute_error')
results

In [None]:
# Get scores on test data

svr_preds = svr1.best_estimator_.predict(X_test)

In [None]:
svr_test_scores = get_test_scores('SVR test', svr_preds, y_test)
results = pd.concat([results, svr_test_scores], axis=0)
results

# Now re-run to get a new prediction on the test data set
- previously we used predicted on the train data set

In [None]:
# %%time

# lr = LinearRegression()
# lr.fit(X, y)
# lr_prediction = lr.predict(test_data_final)

In [None]:
# lr_prediction.shape

# Submission File

In [None]:
# submission = test_df_copy[['id']]

In [None]:
# submission["price"] = lr_prediction

In [None]:
# submission.info()

In [None]:
# pd.options.display.float_format = '{:.3f}'.format

In [None]:
# submission

In [None]:
# submission.to_csv("output/linearRegression/submission.csv", index=None)