In [1]:
import pandas as pd
import pyforest
import warnings
warnings.filterwarnings("ignore")
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from sklearn.linear_model import LogisticRegression

In [2]:
# Path to the raw dataset file (relative path).
car_data = 'data/car.data'

# Column labels applied when the file is loaded; 'class' is the last column.
headers = [
    'buying',
    'maint',
    'doors',
    'persons',
    'lug_boot',
    'safety',
    'class',
]

In [3]:
# The column labels come from `headers`; header=None spells out what pandas
# already does when `names` is given: the first line of the file is read as data.
df = pd.read_csv(car_data, header=None, names=headers)

## High-level exploratory data analysis

In [4]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
# Per-column summary of the loaded frame (count / unique / top / freq in the
# output below).
df.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,low,low,4,4,small,low,unacc
freq,432,432,432,576,576,576,1210


Check the distribution of records across the values of the 'buying' column to confirm that the target classes are balanced.

In [6]:
# Number of non-null entries in every other column, per 'buying' value.
df.groupby(by='buying').count()

Unnamed: 0_level_0,maint,doors,persons,lug_boot,safety,class
buying,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
high,432,432,432,432,432,432
low,432,432,432,432,432,432
med,432,432,432,432,432,432
vhigh,432,432,432,432,432,432


## Encode categorical data to numerical values

In [7]:
# Encoder for the feature columns. Categories that were not present when the
# encoder was fitted are mapped to -1 instead of raising, so a new record with a
# blank/unseen value can still be scored.
# NOTE: handle_unknown="use_encoded_value" requires scikit-learn >= 0.24.
input_oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Encoder for the target column; also used to map predictions back to labels.
target_oe = OrdinalEncoder()


def _encode(encoder, data, fit):
    """Encode `data` with `encoder`, fitting it first only when required.

    Parameters
    ----------
    encoder : OrdinalEncoder
        The encoder to apply.
    data : 2-D array-like or DataFrame
        Categorical values to encode.
    fit : bool or None
        True  -> (re)fit the encoder on `data`, then transform.
        False -> transform with the already-fitted encoder.
        None  -> fit only if the encoder has not been fitted yet.

    Returns
    -------
    The encoded values returned by the encoder.
    """
    if fit is None:
        # A fitted OrdinalEncoder exposes `categories_`; its absence means the
        # encoder has never been fitted.
        fit = not hasattr(encoder, "categories_")
    if fit:
        return encoder.fit_transform(data)
    return encoder.transform(data)


def preprocess_input(X, fit=None):
    """Ordinal-encode the feature columns with the shared `input_oe` encoder.

    The first call fits `input_oe`; later calls reuse the fitted mapping so that
    new records are encoded with the same category -> integer mapping as the
    data the encoder was fitted on. Re-fitting on every call would encode a
    single new row as all zeros, regardless of its values.

    Parameters
    ----------
    X : DataFrame or 2-D array-like
        Feature values, columns in the same order as used for fitting.
    fit : bool or None, default None
        Force (True) or forbid (False) fitting; None fits only on first use.

    Returns
    -------
    The encoded features.
    """
    return _encode(input_oe, X, fit)


def preprocess_target(y, fit=None):
    """Ordinal-encode the target column with the shared `target_oe` encoder.

    Parameters
    ----------
    y : DataFrame or 2-D array-like
        Target values as a single column (the encoder expects 2-D input).
    fit : bool or None, default None
        Force (True) or forbid (False) fitting; None fits only on first use.

    Returns
    -------
    The encoded target.
    """
    return _encode(target_oe, y, fit)

In [8]:
# Features: every column except the one being predicted.
X = df.drop(columns=['buying'])

# Target: kept as a one-column DataFrame (2-D), not a Series.
y = df[['buying']]

In [9]:
# Encode features and target with their respective helpers.
X_enc, y_enc = preprocess_input(X), preprocess_target(y)

## Split into train and test sets

In [10]:
# Fixed seed so the split, and every metric computed from it, is reproducible
# when the notebook is re-run from a fresh kernel.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X_enc,
    y_enc,
    test_size=0.2,            # hold out 20% of the rows for evaluation
    stratify=y_enc,           # keep the target proportions equal in both splits
    random_state=SPLIT_SEED,
)

In [11]:
# Confirm the final shapes of the training and test sets
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(1382, 6)
(1382, 1)
(346, 6)
(346, 1)


## Fit logistic regression model

In [12]:
model = LogisticRegression()

# y_train is an (n, 1) column (see the shapes printed above); flatten it to 1-D,
# which is the target shape classifiers expect. Passing the column vector only
# worked because the resulting DataConversionWarning was globally silenced.
model.fit(X_train, y_train.ravel())

LogisticRegression()

## Make predictions

In [13]:
# One unseen record to score, keyed by feature column.
# NOTE(review): 'persons' is an empty string here, while the rows shown by
# df.head() carry values such as '2' in that column - confirm the intended value.
new_input = {
    'maint': 'high',
    'doors': '4',
    'persons': '',
    'lug_boot': 'big',
    'safety': 'high',
    'class': 'good',
}

# Wrapping the dict in a list yields a single-row frame whose only index label is 0.
df_new = pd.DataFrame([new_input])
df_new

Unnamed: 0,maint,doors,persons,lug_boot,safety,class
0,high,4,,big,high,good


In [14]:
# Encode the new record with the same helper used for the training features.
df_new_enc = preprocess_input(X=df_new)

In [15]:
results = model.predict(df_new_enc)
print('Before reshape: ', results.shape)

# predict() returns a 1-D array; the encoder's inverse_transform needs a 2-D
# column. Using -1 lets NumPy infer the row count, so this also works when more
# than one record is scored (a hard-coded (1, 1) would raise in that case).
results = results.reshape(-1, 1)
print('After reshape: ', results.shape)

Before reshape:  (1,)
After reshape:  (1, 1)


In [16]:
# Map the encoded prediction back to its original label.
r = target_oe.inverse_transform(results)
predicted_price = r[0, 0]  # first row, only column
print('Buying price prediction = ', predicted_price)

Buying price prediction =  high


___

##### Trying other models

In [17]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

In [18]:
# Fit a suite of classifiers on the same split and collect their scores.
clf = LazyClassifier(verbose=0, ignore_warnings=True)
models, predictions = clf.fit(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
models

100%|██████████| 29/29 [00:01<00:00, 18.03it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoostClassifier,0.3,0.3,,0.29,0.07
Perceptron,0.29,0.29,,0.23,0.01
GaussianNB,0.29,0.29,,0.21,0.01
CalibratedClassifierCV,0.29,0.29,,0.27,0.22
SGDClassifier,0.27,0.27,,0.21,0.04
BernoulliNB,0.26,0.26,,0.18,0.01
LogisticRegression,0.25,0.25,,0.23,0.01
DummyClassifier,0.25,0.25,,0.25,0.01
RidgeClassifierCV,0.25,0.25,,0.23,0.01
RidgeClassifier,0.25,0.25,,0.23,0.01
