In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plot data, visualizations
import seaborn as sns #visualization with seaborn

from pandas import Series
from sklearn.preprocessing import LabelEncoder #Label Encoding
#ensemble classifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, BaggingClassifier
#Classification Algorithm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold # split data into train and test dataset
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score # Model Evaluation

import sys
import os
# Kaggle data source: show which files are available in the kernel's input directory.
input_dir = "../input"
print(os.listdir(input_dir))

## Data Preparation and Basic Statistics

In [2]:
# Read the letters data set (the first CSV column becomes the index) and preview five rows.
csv_path = '../input/lettersdata.csv'
df = pd.read_csv(csv_path, index_col=0)
preview = df.head(5)
print(preview)

In [3]:
#show basic statistics of the loaded frame as computed by DataFrame.describe();
#being the last expression of the cell, the result is rendered as the cell output
df.describe()

In [4]:
# Keep only the rows whose label (column 'lettr') is one of the four letters of 'MALI'.
target_letters = ['M','A','L','I']
is_target = df['lettr'].isin(target_letters)
df_c = df.loc[is_target]

print(df_c.head(10))

# Column data types of the full (unfiltered) frame.
print(df.dtypes)

In [5]:
#calculate correlations between the numeric dimensions only: the label column
#'lettr' holds letters, and recent pandas versions raise on non-numeric columns
#instead of silently dropping them
cor = df_c.select_dtypes(include='number').corr()

#make correlation heatmap using seaborn lib
colormap = plt.cm.RdBu
plt.figure(figsize=(14,12))
sns.heatmap(cor,linewidths=0.1,vmax=1.0, 
            square=True, cmap=colormap, linecolor='white', annot=True)
plt.title('Feature correlations')

plt.show()

In [6]:
# Seed shared by the split and by the estimators created in later cells.
seed = 20

# Features are every column except the label; the label is the letter itself.
X = df_c.drop(columns=['lettr'])
y = df_c['lettr']
for preview in (X, y):
    print(preview.head(2))

# Hold out 25% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=seed)

for features in (X_train, X_test):
    print(features.shape)
    print(features.head(2))
for target in (y_train, y_test):
    print(target.head(2))

## Ensembling: Voting Classifiers

In [7]:
#Ensembling Method: Voting Classifiers (aggregate the predictions of each classifier).
#voting='soft' is used here, i.e. the ensemble works on the predicted class
#probabilities of its members, which is why the SVC is created with probability=True.
lr_clf = LogisticRegression(random_state=seed)
rf_clf = RandomForestClassifier(random_state=seed)
lda_clf = LinearDiscriminantAnalysis()
svm_clf = SVC(random_state=seed, probability=True, gamma='auto')

voting_clf = VotingClassifier(
            estimators=[('lr', lr_clf),('rf', rf_clf),('lda', lda_clf),('svm',svm_clf)],
            voting='soft',
            n_jobs=-1)

#class order used for BOTH the confusion matrix and the tick labels; without the
#explicit `labels` argument confusion_matrix orders the classes alphabetically
#(A, I, L, M) and the M/A/L/I tick labels would point at the wrong rows/columns
classNames = ['M','A','L','I']
tick_marks = np.arange(len(classNames))

#every estimator (the ensemble included) is fitted exactly once, inside the loop
for clf in (lr_clf, rf_clf, lda_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred, labels=classNames)
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Pastel2)
    plt.title(clf.__class__.__name__)
    plt.xticks(tick_marks, classNames)
    plt.yticks(tick_marks, classNames)
    plt.ylabel('True')
    plt.xlabel('Predicted')
    #annotate every cell with its count (row i = true class, column j = predicted class)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, str(cm[i][j]))
    plt.show()

## Ensembling: Bagging

In [8]:
#Ensembling Method: Bagging and Pasting (train the same classifier with different random subsets
#of training data; Bagging: sampling with replacement (=bootstrap), Pasting: sampling without replacement)

#Train an ensemble of 500 Decision Tree classifiers, each on a subset of 100 instances randomly sampled 
#from the entire training set with replacement (bagging). 
#Parameter n_jobs tells Sklearn the number of CPU cores to use (-1 = use all available cores)  
#oob_score = Out of Bag (no need for separate validation set), model can be evaluated on oob instances 

#standard DecisionTree Classifier (single-tree baseline)
dt_clf = DecisionTreeClassifier(random_state=seed)
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)
print('DT accuracy: ', str(accuracy_score(y_test, y_pred)))

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=seed), n_estimators=500, max_samples=100,
    bootstrap=True, n_jobs=-1, random_state=seed, oob_score=True)
#train the model
bag_clf.fit(X_train, y_train)
y_pred_bag = bag_clf.predict(X_test)
print('Out-of-Bag Score: ', bag_clf.oob_score_)

print('DT with Bagging accuracy: ', str(accuracy_score(y_test, y_pred_bag)))

#confusion matrix of the bagged ensemble: use y_pred_bag here, y_pred holds the
#predictions of the single baseline tree.
#`labels` fixes the row/column order to match the tick labels (the default order
#would be alphabetical: A, I, L, M)
classNames = ['M','A','L','I']
cm = confusion_matrix(y_test, y_pred_bag, labels=classNames)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Pastel2)
plt.title('DT with Bagging')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
plt.ylabel('True')
plt.xlabel('Predicted')
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, str(cm[i][j]))
plt.show()

## Random Forest and Feature Importance

In [9]:
#feature importance provided by Random Forest Classifier measures the importance of different 
#features by looking at how much the tree nodes that use that feature reduce impurity
#on average

#random_state makes the forest (and therefore the importance ranking) reproducible
rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=seed)
rf_clf.fit(X_train, y_train)

#feature_importances_ follows the column order of the training frame
feature_list = list(X_train.columns)
score_list = list(rf_clf.feature_importances_)

plt.figure(figsize=(14,6))
plt.title("Feature Importance")
plt.ylabel('feature_importance')
plt.xlabel('feature')
plt.bar(feature_list, score_list)

plt.show()

## Ensembling: Boosting

In [38]:
#Ada Boost (Adaptive Boosting) 
#First base classifier is trained and used to make predictions on the training set.
#The relative weight of misclassified training instances is then increased. 
#The second classifier is trained using the updated weights and again it makes predictions
#on the training set (sequential training (learning)).
#SAMME.R (Stagewise Additive Modeling using Multiclass Exp. Loss function)
#R = Real, using class_probabilities (if provided by classifier)
#NOTE(review): algorithm='SAMME.R' is deprecated in recent scikit-learn releases --
#confirm against the installed version before upgrading.

#random_state added so the boosted ensemble is reproducible between runs
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2), n_estimators=200, 
    algorithm='SAMME.R', learning_rate=0.01, random_state=seed)
ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)
print('ADA accuracy: ', str(accuracy_score(y_test, y_pred)))

#`labels` fixes the row/column order to match the tick labels (the default order
#of confusion_matrix would be alphabetical: A, I, L, M)
classNames = ['M','A','L','I']
cm = confusion_matrix(y_test, y_pred, labels=classNames)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Pastel2)
plt.title('AdaBoost')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
plt.ylabel('True')
plt.xlabel('Predicted')
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, str(cm[i][j]))
plt.show()


In [39]:
#Gradient Boosting
#sequentially adding predictors to an ensemble, each one correcting its predecessor:
#it tries to fit the new predictor to the residual errors made by the previous predictor

#random_state added so the boosted ensemble is reproducible between runs
gb_clf = GradientBoostingClassifier(max_depth=2, n_estimators=100, learning_rate=1.0,
                                    random_state=seed)
gb_clf.fit(X_train, y_train)
y_pred = gb_clf.predict(X_test)
print('GB accuracy: ', str(accuracy_score(y_test, y_pred)))

#`labels` fixes the row/column order to match the tick labels (the default order
#of confusion_matrix would be alphabetical: A, I, L, M)
classNames = ['M','A','L','I']
cm = confusion_matrix(y_test, y_pred, labels=classNames)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Pastel2)
plt.title('Gradient Boosting')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
plt.ylabel('True')
plt.xlabel('Predicted')
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, str(cm[i][j]))
plt.show()

## Ensembling: Stacking

In [70]:
#Stacking (stacked generalization): train several models and make predictions on subsets (folds)
#of the training data. Take the predictions as a new training set for the next (or final) layer (model/classifier)
#Use hold-out sets (folds) to ensure the predictors never saw the instances during training.

#Build a method that makes predictions on stratified n-folds (preserving the percentage of samples for each class)

# Sanity-check the inputs that are handed to the stacking helper.
for frame in (X_train, X_test):
    print(frame.head(5))
    print((frame.shape[0],1))
print(y_train.head(5))

def Stacking(model,X_train,y_train,X_test,kfold):
    """Create out-of-fold training predictions and test predictions for one base model.

    The training data is split into `kfold` stratified folds. For every fold the
    model is fitted on the remaining folds only and predicts the held-out rows, so
    each training prediction comes from a model that never saw that row. Afterwards
    the model is refitted on the complete training set and predicts `X_test` once.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refitted in
        place and is left fitted on the full training set.
    X_train : pandas.DataFrame
        Training features (rows are selected positionally with ``.iloc``).
    y_train : pandas.Series
        Training labels, aligned with `X_train`.
    X_test : pandas.DataFrame
        Features to predict with the model fitted on all training rows.
    kfold : int
        Number of stratified folds.

    Returns
    -------
    test_pred : numpy.ndarray of shape (len(X_test), 1)
        One prediction per test row.
    train_pred : numpy.ndarray of shape (len(X_train),)
        Out-of-fold predictions in the row order of `X_train` (and `y_train`).
    """
    # No random_state: the folds are not shuffled, and current scikit-learn rejects
    # a random_state that is combined with shuffle=False.
    kf = StratifiedKFold(n_splits=kfold)

    # Pre-allocated so each prediction is written to the position of its own row;
    # appending fold by fold would return them in fold order instead of row order.
    train_pred = np.empty(len(X_train), dtype=object)

    for train_index, test_index in kf.split(X_train, y_train.values):
        x_tr, y_tr = X_train.iloc[train_index], y_train.iloc[train_index]
        x_te = X_train.iloc[test_index]

        # Fit on the training part of the fold only; fitting on all of X_train here
        # would let the model see the rows it is about to predict.
        model.fit(x_tr, y_tr)
        train_pred[test_index] = model.predict(x_te)

    # One set of test predictions (not one per fold), from a model trained on all rows.
    model.fit(X_train, y_train)
    test_pred = np.asarray(model.predict(X_test), dtype=object)

    return test_pred.reshape(-1,1), train_pred


In [71]:
#train the base model: Random Forest
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=seed)

#use Stacking Method to train the model
test_pred_rf, train_pred_rf = Stacking(model=rf, X_train=X_train, X_test=X_test, y_train=y_train, kfold=4)

#encode the predicted letters as integers for the second layer classification algo.
#The encoder is fitted once on the training labels so train and test predictions
#share one letter->code mapping; fitting it separately on each prediction array can
#give different mappings whenever one array lacks a class.
le = LabelEncoder().fit(y_train)
train_pred_rf = pd.DataFrame(le.transform(np.ravel(train_pred_rf)))
test_pred_rf = pd.DataFrame(le.transform(np.ravel(test_pred_rf)))


print(train_pred_rf.shape)
print(test_pred_rf.shape)
print(train_pred_rf.head(5))
print(test_pred_rf.head(5))



In [17]:
#train the base model: Ada Boost
ada = AdaBoostClassifier(n_estimators=200, learning_rate=0.5, random_state=seed)

#use Stacking Method to train the model
test_pred_ada, train_pred_ada = Stacking(model=ada, X_train=X_train, X_test=X_test, y_train=y_train, kfold=4)

#encode the predicted letters as integers for the second layer classification algo.
#The encoder is fitted once on the training labels so train and test predictions
#share one letter->code mapping; fitting it separately on each prediction array can
#give different mappings whenever one array lacks a class.
le = LabelEncoder().fit(y_train)
train_pred_ada = pd.DataFrame(le.transform(np.ravel(train_pred_ada)))
test_pred_ada = pd.DataFrame(le.transform(np.ravel(test_pred_ada)))

print(train_pred_ada.shape)
print(test_pred_ada.shape)


In [18]:
#train the base model: Logistic Regression
lr = LogisticRegression(random_state=seed)

#use Stacking Method to train the model
test_pred_lr, train_pred_lr = Stacking(model=lr, X_train=X_train, X_test=X_test, y_train=y_train, kfold=4)

#encode the predicted letters as integers for the second layer classification algo.
#The encoder is fitted once on the training labels so train and test predictions
#share one letter->code mapping; fitting it separately on each prediction array can
#give different mappings whenever one array lacks a class.
le = LabelEncoder().fit(y_train)
train_pred_lr = pd.DataFrame(le.transform(np.ravel(train_pred_lr)))
test_pred_lr = pd.DataFrame(le.transform(np.ravel(test_pred_lr)))

print(train_pred_lr.shape)
print(test_pred_lr.shape)


In [19]:
# Put the first-layer predictions side by side: one column per base model.
# The resulting frames are the feature sets for the second layer (sl).
layer_names = ['rf','ada','lr']

X_train_sl = pd.concat([train_pred_rf, train_pred_ada, train_pred_lr], axis=1)
X_train_sl.columns = layer_names

X_test_sl = pd.concat([test_pred_rf, test_pred_ada, test_pred_lr], axis=1)
X_test_sl.columns = list(layer_names)

for stacked in (X_train_sl, X_test_sl):
    print(stacked.head(5))

print(X_train_sl.shape)
print('Issue', X_test_sl.shape)

for target in (y_train, y_test):
    print(target.shape)



In [20]:
#Second Layer Model using predictions from the first layer.
#NOTE: the estimator created here is a LogisticRegression; the GradientBoosting
#alternative and the fit/score calls below are commented out, so this cell only
#instantiates an unfitted model.

lrm = LogisticRegression(random_state=seed)
#gbm = GradientBoostingClassifier(max_depth=2, n_estimators=100, learning_rate=1.0)
#lrm.fit(X_train_sl, y_train)
#lrm.score(X_test_sl, y_test)


## Ensembling: Blending (Stacking)

In [72]:
# Two-stage split: first hold out 25% as the test set, then take 25% of the
# remainder as the validation set on which the blender is trained.
X_train_val1, X_test1, y_train_val1, y_test1 = train_test_split(
    X, y, test_size=0.25, random_state=seed)
X_train1, X_val1, y_train1, y_val1 = train_test_split(
    X_train_val1, y_train_val1, test_size=0.25, random_state=seed)

split_report = [
    ('X_train1: ', X_train1),
    ('X_test1: ', X_test1),
    ('X_train_val1: ', X_train_val1),
    ('X_val1: ', X_val1),
    ('y_train1: ', y_train1),
    ('y_test1: ', y_test1),
    ('y_train_val1: ', y_train_val1),
    ('y_val1: ', y_val1),
]
for caption, part in split_report:
    print(caption, part.shape)



In [73]:
# Base (first-layer) models for blending; every estimator that accepts a seed gets the same one.
seeded = {'random_state': seed}

lr1_clf = LogisticRegression(**seeded)
dt1_clf = DecisionTreeClassifier(**seeded)
lda1_clf = LinearDiscriminantAnalysis()
ada1_clf = AdaBoostClassifier(**seeded)
gbm1_clf = GradientBoostingClassifier(**seeded)
svc1_clf = SVC(gamma='auto', **seeded)
rf1_clf = RandomForestClassifier(**seeded)
extr1_clf = ExtraTreesClassifier(**seeded)


In [74]:
# First layer: fit every base model on the training part of the blending split.
models = [
    lr1_clf, dt1_clf, lda1_clf, ada1_clf,
    gbm1_clf, svc1_clf, rf1_clf, extr1_clf,
]
for model in models:
    print("Training the", model)
    model.fit(X_train1, y_train1)

In [75]:
#show model scores: one score(X_val1, y_val1) value per first-layer model, in the
#order of the `models` list; the list is rendered as the cell output
[model.score(X_val1, y_val1) for model in models]



In [76]:
#save X_val predictions for the second layer model

#One encoder, fitted once on the training labels, is used for every model: all
#columns then share the same letter->code mapping. Refitting an encoder on each
#model's predictions shifts the codes whenever a model never predicts some class,
#and relying on an encoder object created in an earlier cell breaks on a fresh kernel.
val_encoder = LabelEncoder().fit(y_train1)

#column names, in the order of the `models` list
model_names = ['lr','dt','lda','ada','gbm','svc','rf','extr']

X_val_predictions = np.empty((len(X_val1), len(models)), dtype=np.float32)

for index, model in enumerate(models):
    X_val_predictions[:, index] = val_encoder.transform(model.predict(X_val1))
    
print('Label Encoded Predictions', X_val_predictions)

#transform to pd Dataframe to show correlations between model predictions on validation set
pd_corr = pd.DataFrame(data=X_val_predictions, columns=model_names)
print(pd_corr.head(5))

#calculate correlations between models
cor = pd_corr.corr()

#make correlation heatmap using seaborn lib
colormap = plt.cm.RdBu
plt.figure(figsize=(10,8))
sns.heatmap(cor,linewidths=0.1,vmax=1.0, 
            square=True, cmap=colormap, linecolor='white', annot=True)
plt.title('Correlation between first-layer predictions (validation set)')
plt.show()
    

In [77]:
# Second layer: a random forest trained on the encoded first-layer predictions
# for the validation set. oob_score=True makes the out-of-bag score available.
blender_params = dict(n_estimators=200, oob_score=True, random_state=seed)
rf_blender = RandomForestClassifier(**blender_params).fit(X_val_predictions, y_val1)

oob_accuracy = rf_blender.oob_score_
print(oob_accuracy)



In [78]:
#save X_test predictions

#Encoder fitted on the training labels, so the letter->code mapping does not depend
#on which letters a model happens to predict for the test set (refitting on each
#prediction array shifts the codes whenever a class is missing from it).
test_encoder = LabelEncoder().fit(y_train1)

X_test_predictions = np.empty((len(X_test1), len(models)), dtype=np.float32)

for index, model in enumerate(models):
    X_test_predictions[:, index] = test_encoder.transform(model.predict(X_test1))

#the blender turns the encoded first-layer predictions into the final prediction
y_pred1 = rf_blender.predict(X_test_predictions)

print(accuracy_score(y_test1, y_pred1))

#`labels` fixes the row/column order to match the tick labels (the default order
#of confusion_matrix would be alphabetical: A, I, L, M)
classNames = ['M','A','L','I']
cm = confusion_matrix(y_test1, y_pred1, labels=classNames)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Pastel2)
plt.title('Blending')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
plt.ylabel('True')
plt.xlabel('Predicted')
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, str(cm[i][j]))
plt.show()

