In [1]:
import warnings
warnings.filterwarnings("ignore")

# Import Libraries

In [2]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from missingpy import KNNImputer
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GMM
from sklearn.cluster import AgglomerativeClustering
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN
from sklearn.cluster import MiniBatchKMeans



# Load Data

In [3]:
veh = pd.read_csv("https://raw.githubusercontent.com/justicejanak/Stat-517/master/vehicles.csv")
veh.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [4]:
veh.shape

(39588, 83)

# Preprocessing the Dataset

In [5]:
veh = veh.drop([u'cityUF', u'co2',
       u'co2A', u'co2TailpipeAGpm', u'co2TailpipeGpm', u'comb08', u'comb08U',
       u'combA08', u'combA08U', u'combE', u'combinedCD', u'combinedUF', u'cylinders',
       u'displ', u'drive', u'engId', u'eng_dscr', u'feScore', u'fuelCost08',
       u'fuelCostA08', u'fuelType', u'fuelType1', u'ghgScore', u'ghgScoreA',
       u'highway08', u'highway08U', u'highwayA08', u'highwayA08U', u'highwayCD',
       u'highwayE', u'highwayUF', u'hlv', u'hpv', u'id', u'lv2', u'lv4',
       u'model', u'mpgData', u'phevBlended', u'pv2', u'pv4', u'range', u'rangeCity',
       u'rangeCityA', u'rangeHwy', u'rangeHwyA', u'trany', u'UCity', u'UCityA',
       u'UHighway', u'UHighwayA', u'VClass', u'year', u'youSaveSpend', u'guzzler',
       u'trans_dscr', u'tCharger', u'sCharger', u'atvType', u'fuelType2', u'rangeA',
       u'evMotor'],axis =1)
veh.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
#Converting createdOn and modifiedOn into vectors
vector = CountVectorizer()
X = vector.fit_transform(veh['createdOn'].values.astype('str'))
createdOn = pd.DataFrame(X.toarray(), columns=vector.get_feature_names())
X = vector.fit_transform(veh['modifiedOn'].values.astype('str'))
modifiedOn = pd.DataFrame(X.toarray(), columns=vector.get_feature_names())

In [16]:
#Merging the variables
veh_mer = pd.merge(veh, createdOn, left_index=True, right_index=True)
veh_merg = pd.merge(veh_mer, modifiedOn, left_index=True, right_index=True)
veh_merg_dr = veh_merg.drop(['createdOn','modifiedOn'], axis = 1)

In [17]:
veh_dum = pd.get_dummies(veh_merg_dr, columns =[u'mfrCode',
       u'c240Dscr', u'c240bDscr', u'startStop'])
veh_dum.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,c240Dscr_6.6 kW charger,c240Dscr_7.2 kW charger,c240Dscr_single charger,c240Dscr_standard charger,c240bDscr_3.6 kW charger,c240bDscr_6.6 kW charger,c240bDscr_80 amp dual charger,c240bDscr_dual charger,startStop_N,startStop_Y
0,15.695714,0.0,0,0.0,19,0.0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,29.964545,0.0,0,0.0,9,0.0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,12.207778,0.0,0,0.0,23,0.0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,29.964545,0.0,0,0.0,10,0.0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,17.347895,0.0,0,0.0,17,0.0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
#Association was performed here for groupings
apriori(veh, min_support=0.25, use_colnames=True)

TypeError: '<' not supported between instances of 'int' and 'str'

In [14]:
#Imputation was the focus here to replace missing values
Y_veh = veh[u'make']
X_veh = veh.drop(u'make', axis=1)
X_veh.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0,0.0,19,0.0,0,0.0,0.0,0.0,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0,0.0,9,0.0,0,0.0,0.0,0.0,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0,0.0,23,0.0,0,0.0,0.0,0.0,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0,0.0,10,0.0,0,0.0,0.0,0.0,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0,0.0,17,0.0,0,0.0,0.0,0.0,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [7]:
X_veh= X_veh.select_dtypes(include=['uint','int8','int64','float64']).copy()
X_veh.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,charge240b,phevCity,phevHwy,phevComb
0,15.695714,0.0,0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,0,0,0
1,29.964545,0.0,0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,0,0,0
2,12.207778,0.0,0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,0,0,0
3,29.964545,0.0,0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,0,0,0
4,17.347895,0.0,0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,0,0,0


In [8]:
#The actual imputation was done here
imputer = KNNImputer(n_neighbors=1)
X_imputed = imputer.fit_transform(X_veh)
X_imputed = pd.DataFrame(X_imputed) 
X_imputed.columns = X_veh.columns

In [9]:
#Number of missing values after imputation was zero(0)
X_imputed.isna().sum().sum()

0

In [10]:
X_veh = X_imputed
X_veh.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,charge240b,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,29.964545,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12.207778,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,29.964545,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17.347895,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
#Association was performed here for groupings
apriori(X_veh, min_support=0.25, use_colnames=True)

ValueError: The allowed values for a DataFrame are True, False, 0, 1. Found value 0.06