In [30]:
# imports
from __future__ import absolute_import, division, print_function, unicode_literals
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

import plotly.express as px

# Tensorflow
import tensorflow.compat.v2.feature_column as fc
import tensorflow as tf

# Sci-kit learn
from sklearn.model_selection import train_test_split


### Exploring and Cleaning Data

Using some built-in tools to explore the titanic dataset we will be focusing on here is the  dataset. 

In [125]:
# Importing the dataset
all_cars_df = pd.read_csv(os.path.join('1b_data','car_data.csv')) 

#Data Cleaning
# Dropping any rows with nans 
all_cars_df.dropna(inplace=True)
all_cars_df = all_cars_df.rename(columns=str.lower)

# remove whitespace
all_cars_df.columns = all_cars_df.columns.str.replace(' ', '_') 

all_cars_df.sample(5) #previewing dataset randomly



Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
7822,Audi,Q5,2017,premium unleaded (required),272.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Luxury",Midsize,4dr SUV,26,18,3105,53200
6007,Volkswagen,Jetta GLI,2013,premium unleaded (recommended),210.0,4.0,MANUAL,front wheel drive,4.0,Performance,Midsize,Sedan,33,22,873,24170
11468,Subaru,WRX,2017,premium unleaded (required),305.0,4.0,MANUAL,all wheel drive,4.0,"Factory Tuner,High-Performance",Compact,Sedan,23,17,640,39995
84,Chrysler,200,2017,flex-fuel (unleaded/E85),184.0,4.0,AUTOMATIC,front wheel drive,4.0,Flex Fuel,Midsize,Sedan,36,23,1013,27795
4695,Pontiac,Firebird,2001,premium unleaded (required),310.0,8.0,AUTOMATIC,rear wheel drive,2.0,"Hatchback,Performance",Midsize,2dr Hatchback,23,16,210,24035


In [128]:
# Print top 5 most common car makes in the dataset
print(all_cars_df['make'].value_counts()[:5]) 

Chevrolet        614
Volkswagen       581
Ford             492
Cadillac         397
Mercedes-Benz    352
Name: make, dtype: int64


In [129]:
# split the dataset into testing and evaluation
dftrain, dfeval = train_test_split(all_cars_df)


In [131]:
# MSRP is what we car about predicting, so get that parameter for y_train and y_eval
y_train = dftrain.pop('msrp')
y_eval = dfeval.pop('msrp')

In [132]:
# Make and input function

def make_input_fn(data_df, label_df, num_epochs=100, shuffle=True, batch_size=32):
    """Returns a function object of input function for use"""
    def input_function():  # inner function, this will be returned
        """Takes  Pandas dataframe and its label and creates a tf.data.Dataset object"""
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df)) 
        if shuffle:
            ds = ds.shuffle(1000)  # randomize order of data
        ds = ds.batch(batch_size).repeat(num_epochs)  ## split dataset into batches of 32 and repeat process for number of epochs
        return ds  # return a batch of the dataset
    return input_function  # return a function object for use


In [133]:
# Using make_input_fn to create a train input function
# which can be called to create a dataset object we can feed to the model
train_input_fn = make_input_fn(dftrain, y_train)  

# Using make_input_fn to create an evaluation input function
eval_input_fn = make_input_fn(dfeval, y_eval, num_epochs=1, shuffle=False)

In [136]:
dfeval.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity'],
      dtype='object')

In [140]:
# Identifying non-numeric columns 
CATEGORICAL_COLUMNS = ['make', 'model', 'year', 'engine_fuel_type', 'transmission_type', 'driven_wheels', 'market_category', 'vehicle_size', 'vehicle_style']

# Identifying numeric columns (whatever is remaining)
NUMERIC_COLUMNS = list(set(dfeval.columns) - set(CATEGORICAL_COLUMNS))

# make empty list for feature cols
feature_columns = []

# loop the cat cols to 
for feature_name in CATEGORICAL_COLUMNS:
  vocabulary = dftrain[feature_name].unique()  # gets a list of all unique values from given feature column
    # use tf.feature_column to get VocabularyListCategoricalColumn item in list
  feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
    # use tf.feature_column to get representation of numeric columns as NumericColumn 
  feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

print(feature_columns)

[VocabularyListCategoricalColumn(key='make', vocabulary_list=('Mazda', 'Buick', 'Mercedes-Benz', 'McLaren', 'Chevrolet', 'Lincoln', 'Honda', 'Aston Martin', 'Subaru', 'Infiniti', 'Kia', 'Cadillac', 'Volkswagen', 'BMW', 'Land Rover', 'GMC', 'Saab', 'Dodge', 'Mitsubishi', 'Volvo', 'Hyundai', 'Ford', 'Toyota', 'FIAT', 'Rolls-Royce', 'Lexus', 'Nissan', 'Audi', 'Scion', 'Chrysler', 'Porsche', 'Suzuki', 'Ferrari', 'Acura', 'Lotus', 'Pontiac', 'Bentley', 'Maserati', 'Plymouth', 'Lamborghini', 'Spyker', 'Maybach', 'HUMMER', 'Oldsmobile', 'Bugatti', 'Genesis', 'Alfa Romeo'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='model', vocabulary_list=('MX-5 Miata', 'Envision', '560-Class', 'MP4-12C', 'TrailBlazer', 'Mark LT', 'Prelude', 'V8 Vantage', 'B9 Tribeca', 'Q60 Coupe', 'Cadenza', 'CTS', 'New Beetle', 'X4', 'CR-V', 'LR2', 'Sierra 1500 Classic', '3', 'Z4', 'Crosstour', '9-2X', 'Neon', 'Outlander Sport', 'Traverse', 'SLR McLaren', 'Regal', '500-Class'

### Creating the Model

Use a linear estimator to utilize the linear regression algorithm. 




In [141]:
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
# We create a linear estimtor by passing the feature columns we created earlier

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpukcpiinm', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### Training the Model
Training the model is as easy as passing the input functions that we created earlier.

In [142]:
# train the linear estimator
linear_est.train(train_input_fn)  

# Evaluate the model with evaluate method
# get model metrics/stats by testing on tetsing data
result = linear_est.evaluate(eval_input_fn)  

clear_output()  # clears console output
print(result['accuracy'])  # the result variable is simply a dict of stats about our model

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpukcpiinm/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...


InvalidArgumentError: assertion failed: [Labels must be <= n_classes - 1] [Condition x <= y did not hold element-wise:] [x (head/losses/Cast:0) = ] [[219900][28060][77200]...] [y (head/losses/check_label_range/Const:0) = ] [1]
	 [[{{node Assert}}]]

And we now we have a model with a 74% accuracy (this will change each time)! Not crazy impressive but decent for our first try.

Now let's see how we can actually use this model to make predicitons.

We can use the ```.predict()``` method to get survival probabilities from the model. This method will return a list of dicts that store a predicition for each of the entries in our testing data set. Below we've used some pandas magic to plot a nice graph of the predictions.

As you can see the survival rate is not very high :/

In [161]:
pred_dicts = list(linear_est.predict(eval_input_fn))


INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpsrzhz_3c/model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'logits': array([-2.5388134], dtype=float32),
  'logistic': array([0.07318163], dtype=float32),
  'probabilities': array([0.9268184 , 0.07318162], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'all_class_ids': array([0, 1], dtype=int32),
  'all_classes': array([b'0', b'1'], dtype=object)},
 {'logits': array([0.17630282], dtype=float32),
  'logistic': array([0.5439619], dtype=float32),
  'probabilities': array([0.45603812, 0.54396194], dtype=float32),
  'class_ids': array([1]),
  'classes': array([b'1'], dtype=object),
  'all_class_ids': array([0, 1], dtype=int32),
  'all_classes': array([b'0', b'1'], dtype=object)},
 {'logits': array([1.192032], dtype=float32),
  'logistic': array([0.76710427], dtype=float32),
  'probabilities': array([0.23289572, 0.7671043 ], dtype=float32),
  'class_ids': array([1]),
  'classes': array([b'1'], dtype=object),
  'all_class_ids': array([0, 1], dtype=int32),
  'all_classes': array([b'0', b'1'], dtype=object)},
 

In [181]:
# Extract the probabilities 
probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])
probs.rename('probabilities',inplace=True)

0      0.073182
1      0.543962
2      0.767104
3      0.615301
4      0.237978
         ...   
259    0.827902
260    0.078051
261    0.294093
262    0.176253
263    0.442519
Name: probabilities, Length: 264, dtype: float64

In [182]:
fig = px.histogram(probs, nbins=20, 
                   x='probabilities',
                   title='Predicted Probabilities of Survival')
fig.show()