In [1]:
import pandas as pd
# Load the Cars93 dataset. The path is relative, so it resolves against the
# kernel's current working directory.
A=pd.read_csv("Desktop/ds/Cars93.csv")

In [2]:
# Rename every column positionally. The list has 28 names and must match the
# number of columns currently in A, otherwise pandas raises a length mismatch.
A.columns=['id', 'Manufacturer', 'Model', 'Type', 'MinPrice', 'Price',
       'MaxPrice', 'MPGcity', 'MPGhighway', 'AirBags', 'DriveTrain',
       'Cylinders', 'EngineSize', 'Horsepower', 'RPM', 'Revpermile',
       'Mantransavail', 'Fueltankcapacity', 'Passengers', 'Length',
       'Wheelbase', 'Width', 'Turncircle', 'Rearseatroom', 'Luggageroom',
       'Weight', 'Origin', 'Make']

In [3]:
# Preview the first three rows to check the renamed columns.
A.head(3)

Unnamed: 0,id,Manufacturer,Model,Type,MinPrice,Price,MaxPrice,MPGcity,MPGhighway,AirBags,...,Passengers,Length,Wheelbase,Width,Turncircle,Rearseatroom,Luggageroom,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90


# Missing Data Treatment

In [4]:
# NOTE(review): with no category or module filter, this suppresses every
# warning raised for the rest of the session, including ones that would flag
# problems in the later modelling cells.
from warnings import filterwarnings
filterwarnings("ignore")

In [5]:
# replacer comes from the project-local PM8wd module (not shown here).
# Its return value is not assigned, so presumably it fills the missing
# values of A in place — TODO confirm against PM8wd.
from PM8wd import replacer
replacer(A)

# Drop Unwanted Columns

In [6]:
# Remove the identifier-like columns; they are not used as predictors.
A = A.drop(columns=["id", "Make", "Model"])

# Define X and Y

In [7]:
# Target: the Type column, kept as a one-column DataFrame.
Y = A[["Type"]]
# Predictors: every remaining column.
X = A.drop(columns=["Type"])

# Feature selection based on statistical tests (ANOVA and chi-square)

In [8]:
def ANOVA(df,cat,con):
    """Return the ANOVA p-value of a continuous column against a grouping column.

    Fits the OLS model ``con ~ cat`` through the statsmodels formula API and
    reads the ``PR(>F)`` entry for ``cat`` from the ANOVA table.

    Parameters
    ----------
    df : DataFrame containing both columns.
    cat : str
        Name of the grouping column (right-hand side of the formula).
    con : str
        Name of the continuous column (left-hand side of the formula).

    Returns
    -------
    The p-value rounded to 3 decimal places, so any value below 0.0005 is
    reported as 0.0.
    """
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm

    # Both names are spliced into the formula verbatim, so they must be
    # valid identifiers in the formula language.
    formula = con + " ~ " + cat
    fitted = ols(formula, df).fit()
    # anova_lm already returns a DataFrame indexed by term name.
    table = anova_lm(fitted)
    return round(table.loc[cat, 'PR(>F)'], 3)

In [9]:
# Collect the numeric features whose ANOVA p-value against Type is below 0.05.
imp_cols = []
numeric_features = [name for name in X.columns if X[name].dtypes != "object"]
for name in numeric_features:
    p_value = ANOVA(A, "Type", name)
    if not (p_value < 0.05):
        continue
    print("Type vs ", name, "-->", p_value)
    imp_cols.append(name)

Type vs  MinPrice --> 0.0
Type vs  Price --> 0.0
Type vs  MaxPrice --> 0.0
Type vs  MPGcity --> 0.0
Type vs  MPGhighway --> 0.0
Type vs  EngineSize --> 0.0
Type vs  Horsepower --> 0.0
Type vs  RPM --> 0.0
Type vs  Revpermile --> 0.0
Type vs  Fueltankcapacity --> 0.0
Type vs  Passengers --> 0.0
Type vs  Length --> 0.0
Type vs  Wheelbase --> 0.0
Type vs  Width --> 0.0
Type vs  Turncircle --> 0.0
Type vs  Rearseatroom --> 0.0
Type vs  Luggageroom --> 0.0
Type vs  Weight --> 0.0


In [10]:
# Numeric features that passed the ANOVA threshold.
imp_cols

['MinPrice',
 'Price',
 'MaxPrice',
 'MPGcity',
 'MPGhighway',
 'EngineSize',
 'Horsepower',
 'RPM',
 'Revpermile',
 'Fueltankcapacity',
 'Passengers',
 'Length',
 'Wheelbase',
 'Width',
 'Turncircle',
 'Rearseatroom',
 'Luggageroom',
 'Weight']

In [11]:
from scipy.stats import chi2_contingency
def chisquare(df,cat1,cat2):
    """Return the chi-square independence-test p-value for two columns of ``df``.

    A contingency table of ``df[cat1]`` against ``df[cat2]`` is built and
    passed to ``scipy.stats.chi2_contingency``; only the p-value is returned.
    """
    from pandas import crosstab

    contingency = crosstab(df[cat1], df[cat2])
    # The result is ordered (statistic, p-value, dof, expected frequencies).
    outcome = chi2_contingency(contingency)
    return outcome[1]

In [12]:
# Chi-square test of every categorical (object-dtype) feature against Type;
# features with a p-value below 0.05 are added to imp_cols.
for i in X.columns:
    if X[i].dtypes != "object":
        continue
    x = chisquare(A, "Type", i)
    if x < 0.05:
        print("Type vs ",i,"-->",x)
        # imp_cols is not reset in this cell, so guard against re-runs
        # appending the same feature name twice.
        if i not in imp_cols:
            imp_cols.append(i)

Type vs  AirBags --> 0.00022621310210534361
Type vs  DriveTrain --> 0.00012480179759447616
Type vs  Cylinders --> 1.674244316924938e-07
Type vs  Mantransavail --> 3.661469229213669e-10
Type vs  Origin --> 0.015110051037674484


# Preprocessing


In [13]:
# Full selected feature list: numeric features followed by categorical ones.
imp_cols

['MinPrice',
 'Price',
 'MaxPrice',
 'MPGcity',
 'MPGhighway',
 'EngineSize',
 'Horsepower',
 'RPM',
 'Revpermile',
 'Fueltankcapacity',
 'Passengers',
 'Length',
 'Wheelbase',
 'Width',
 'Turncircle',
 'Rearseatroom',
 'Luggageroom',
 'Weight',
 'AirBags',
 'DriveTrain',
 'Cylinders',
 'Mantransavail',
 'Origin']

In [15]:
# preprocessing comes from the project-local PM8wd module (not shown here);
# presumably it scales numeric columns and encodes categorical ones — TODO confirm.
# Only the columns listed in imp_cols are passed to it.
from PM8wd import preprocessing
Xnew = preprocessing(X[imp_cols])

# Split the data into training and testing sets

In [16]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

# Create Logistic Regression model

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic regression classifier with default settings.
lr = LogisticRegression()
# ytrain is a one-column DataFrame; pass the Type column as a 1-D Series,
# which is the target shape fit() expects.
model = lr.fit(xtrain, ytrain["Type"])

# Predict on both splits so train and test accuracy can be compared.
pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

tr_acc = accuracy_score(ytrain,pred_tr)
ts_acc = accuracy_score(ytest,pred_ts)

In [18]:
# Accuracy on the training split.
tr_acc

1.0

In [19]:
# Accuracy on the held-out test split.
ts_acc

0.9473684210526315

In [20]:
# Confusion matrix for the test split: true labels (ytest) are passed first,
# predicted labels (pred_ts) second.
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest,pred_ts)


array([[1, 0, 0, 0, 0, 0],
       [0, 3, 0, 0, 0, 0],
       [0, 0, 4, 0, 0, 0],
       [1, 0, 0, 3, 0, 0],
       [0, 0, 0, 0, 5, 0],
       [0, 0, 0, 0, 0, 2]], dtype=int64)

In [21]:
# Attach the test predictions next to the true labels. assign() builds a new
# frame rather than writing a column into the frame returned by
# train_test_split, which avoids the chained-assignment
# (SettingWithCopy) pattern.
ytest = ytest.assign(pred=pred_ts)

In [22]:
# List true and predicted Type side by side, ordered by true class and then
# by prediction, so misclassified rows are easy to spot.
ytest.sort_values(by=["Type","pred"])

Unnamed: 0,Type,pred
91,Compact,Compact
21,Large,Large
17,Large,Large
19,Large,Large
10,Midsize,Midsize
3,Midsize,Midsize
1,Midsize,Midsize
36,Midsize,Midsize
23,Small,Compact
82,Small,Small
