## Different Models, Same Data

### Import Required Libraries

In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

### Load Data

In [16]:
df = pd.read_csv("../../data/mobile_classification/train.csv")
df.sample(3)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
1149,1396,0,1.5,1,0,1,44,0.1,118,6,...,173,1843,1386,7,1,8,1,1,1,1
1957,1589,0,0.6,0,0,0,5,0.8,112,5,...,776,1397,3430,7,5,19,1,1,1,3
1917,1802,0,2.7,1,0,1,10,0.6,101,7,...,253,590,696,14,7,3,1,1,0,0


### Split input and output variable

In [17]:
x = df.iloc[:, :-1]
y = df.iloc[:, -1]

print(x.shape)
print(y.shape)

(2000, 20)
(2000,)


### Individual Models

In [18]:
# initialize classifiers
dt = DecisionTreeClassifier()
lr = LogisticRegression()
knn = KNeighborsClassifier()
rf = RandomForestClassifier()

# estimators for cross_val_score
estimators = [('dt', dt), ('lr', lr), ('knn', knn), ('rf', rf)]

# apply cross validation to individual models
for est in estimators:
        acc = cross_val_score(est[1], x, y, cv=10, scoring='accuracy', n_jobs=-1)
        print(est[0], np.round(np.mean(acc), 2))

dt 0.83
lr 0.64
knn 0.92
rf 0.88


### Voting Classifier

#### Hard Voting

In [19]:

vc = VotingClassifier(estimators=estimators)
acc = cross_val_score(vc, x, y, cv=10, scoring='accuracy', n_jobs=-1)
print("vc", np.round(np.mean(acc), 2))

vc 0.89


#### Soft Voting

In [20]:
vc = VotingClassifier(estimators=estimators, voting='soft')
acc = cross_val_score(vc, x, y, cv=10, scoring='accuracy', n_jobs=-1)
print("vc", np.round(np.mean(acc), 2))

vc 0.88


## Same Models, Different Configuration

### Load Data

In [21]:
df = pd.read_csv("../../data/titanic/train.csv")
df.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
265,266,0,2,"Reeves, Mr. David",male,36.0,0,0,C.A. 17248,10.5,,S
148,149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26.0,F2,S
352,353,0,3,"Elias, Mr. Tannous",male,15.0,1,1,2695,7.2292,,C


### Preprocessing

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [23]:
# drop unwanted column
df.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)

# drop null values
df.dropna(inplace=True)

In [24]:
# label encode categorical columns
def cton(df):

    for col in df.columns:
        if df[col].dtype == 'O':
            encoder=LabelEncoder()
            df[col]=encoder.fit_transform(df[col])

    return df

df = cton(df)
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.2500,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.9250,2
3,4,1,1,0,35.0,1,0,53.1000,2
4,5,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...,...
885,886,0,3,0,39.0,0,5,29.1250,1
886,887,0,2,1,27.0,0,0,13.0000,2
887,888,1,1,0,19.0,0,0,30.0000,2
889,890,1,1,1,26.0,0,0,30.0000,0


### Split input and output variable

In [25]:
# split input and output columns
y = df['Survived']
df.drop(['Survived'], axis=1, inplace=True)
x = df

### Individual Models

In [26]:
# multiple classifiers with different configurations
knn1 = KNeighborsClassifier(n_neighbors=51)
knn2 = KNeighborsClassifier(n_neighbors=61)
knn3 = KNeighborsClassifier(n_neighbors=71)
knn4 = KNeighborsClassifier(n_neighbors=81)
knn5 = KNeighborsClassifier(n_neighbors=121)

#estimators for classifiaction and voting
estimators = [('knn1', knn1), ('knn2', knn2), ('knn3', knn3), ('knn4', knn4), ('knn5', knn5)]

for est in estimators:
        acc = cross_val_score(est[1], x, y, cv=10, scoring='accuracy', n_jobs=-1)
        print(est[0], np.round(np.mean(acc), 2))

knn1 0.62
knn2 0.61
knn3 0.62
knn4 0.63
knn5 0.61


### Voting Classifier

#### Hard Voting

In [27]:
vc = VotingClassifier(estimators=estimators, voting='soft')
acc = cross_val_score(vc, x, y, cv=10, scoring='accuracy')
print("vc", np.round(np.mean(acc), 2))

vc 0.63


#### Soft Voting

In [28]:
print("vc", np.round(np.mean(acc), 2))

vc 0.63
