In [5]:
import pandas as pd 
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import warnings
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
from sklearn.ensemble import  GradientBoostingClassifier
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC 
from sklearn.neural_network import MLPClassifier

#Additional imports
from sklearn import tree
import xgboost as xgb
import graphviz

In [6]:
%%javascript
    IPython.OutputArea.auto_scroll_threshold = 9999

<IPython.core.display.Javascript object>

In [8]:
data = pd.read_csv("train.csv")
pd.set_option('display.max_columns',100)

In [9]:
X = np.array(data.as_matrix(columns=data.columns[1:55]))
Y = np.array(data["Cover_Type"].tolist())
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y = X[shuffle], Y[shuffle]
train_df = data.iloc[shuffle,:].iloc[:12000 , :]
print('data shape: ', X.shape)
print('label shape:', Y.shape)
test_data, test_labels = X[13000:], Y[13000:]
dev_data, dev_labels = X[12000:13000], Y[12000:13000]
train_data, train_labels = X[:12000], Y[:12000]

data shape:  (15120, 54)
label shape: (15120,)


  """Entry point for launching an IPython kernel.


In [10]:
warnings.filterwarnings(action='ignore')

In [23]:
class MLModel():
    """
    Parent class for all ML models
    """
    def __init__(self, modelName='LogisticRegression'):
        self.modelName = modelName
        self.scaler = None
        self.pca = None
        if modelName == 'LogisticRegression':
            self.model = LogisticRegression(penalty='l2', solver='newton-cg', tol=0.001,random_state=1,
                                            multi_class='auto', max_iter=1000, verbose=0)
        elif modelName == 'DecisionTree':
            self.model = DecisionTreeClassifier(random_state=1)
        elif modelName == 'RandomForest':
            self.model = RandomForestClassifier(random_state=1)
        elif modelName == 'GradientBoosting':
            self.model = GradientBoostingClassifier(random_state=1)
        elif modelName == 'KNearestNeighbor':
            self.model = KNeighborsClassifier()
        elif modelName == 'Xgboost':
            self.model = xgb.XGBClassifier(random_state=1)
        else:
            raise Exception('Model ' + modelName + ' not implemented...')
            
    def grid_search(self, train_data, train_labels, dev_data, dev_labels, params, pca=None, scaler_type=None , print_out=True):
        
        grd_model = GridSearchCV( self.model,  param_grid = params ,return_train_score = 1, cv=3, n_jobs=-1)
        if scaler_type is not None:
            [scaled_train_data, scaled_dev_data] = self.scale_data(scaler_type ,  train_data , dev_data)
            if pca is not None:
                [scaled_train_pca_data, scaled_dev_pca_data] = self.pca_transform(scaled_train_data, scaled_dev_data, pca)
                grd_model.fit(scaled_train_pca_data, train_labels)
                predicted= grd_model.predict(scaled_dev_pca_data)
            else:
                grd_model.fit(scaled_train_data,train_labels)
                predicted= grd_model.predict(scaled_dev_data)
        else:
            if pca is not None:
                [train_pca_data, dev_pca_data] = self.pca_transform(train_data, dev_data, pca)
                grd_model.fit(train_pca_data, train_labels)
                predicted= grd_model.predict(dev_pca_data)
            else:
                grd_model.fit(train_data,train_labels)
                predicted= grd_model.predict(dev_data)
        if print_out != False:  
            print ( "\033[1m" ,  self.modelName , "\033[0;0m" )
            print ("Best fit parameters :")
            print (grd_model.best_params_)
            print ("Best fit model F1 score :")
            print (metrics.f1_score(dev_labels, predicted , average='micro'))
        self.classification_report = classification_report(predicted,dev_labels )       
        self.best_model = grd_model
        self.best_metrics = metrics.f1_score(dev_labels, predicted , average='micro')
        
    def scale_data(self, scaler_type ,  X_train , X_dev):
        if scaler_type == 'MinMax' :
            scaler = MinMaxScaler(feature_range=(0, 1))
        elif scaler_type == 'Robust':
            scaler = RobustScaler()
        elif scaler_type == 'Standard':
            scaler = StandardScaler()
        else:
            print('Unrecognized scaler ' + scaler_type + ' ... reverting to MinMax')
            scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_X_train = scaler.fit_transform(X_train)
        scaled_X_dev = scaler.transform(X_dev)
        
        self.scaler = scaler

        return([scaled_X_train, scaled_X_dev])
    
    def pca_transform(self, X_train , X_dev, npca):
        pcaModel = PCA(n_components=npca)
        X_train_pca = pcaModel.fit_transform(X_train)
        X_dev_pca = pcaModel.transform(X_dev)
        return([X_train_pca, X_dev_pca])

### Approach 7

First classify the output between two classes . Class 0 with covertype 1,2 . Class 1 with covertype with rest. Have 2 more models 2 classify to exact cover type. 

In [24]:
#Prepare train and dev data for model 1 
test_model_1 = MLModel(modelName='RandomForest')
test_model_2 = MLModel(modelName='RandomForest')
test_model_3 = MLModel(modelName='RandomForest')
pca_components = None
scaler_type = None
params = {  'n_estimators' : [ 20, 30 , 40 ,50 , 200 ]  }
train_labels_ens_1 = np.where(train_labels> 2 , 1 , 0)
dev_labels_ens_1 = np.where(dev_labels> 2 , 1 , 0)
test_model_1.grid_search(train_data, train_labels_ens_1, dev_data, dev_labels_ens_1, params, pca=pca_components, scaler_type=scaler_type , print_out=False)
train_labels_ens_2 = train_labels[np.where(train_labels <= 2)]
train_data_ens_2 = train_data[np.where(train_labels <= 2)]
test_model_2.grid_search(train_data_ens_2, train_labels_ens_2, dev_data, dev_labels, params, pca=pca_components, scaler_type=scaler_type , print_out=False)
train_labels_ens_3 = train_labels[np.where(train_labels > 2)]
train_data_ens_3 = train_data[np.where(train_labels > 2)]
test_model_3.grid_search(train_data_ens_3, train_labels_ens_3, dev_data, dev_labels, params, pca=pca_components, scaler_type=scaler_type, print_out=False)

In [26]:
def predict_approach_ensemble_7(data , labels , test_model_1 , test_model_2, test_model_3):
    predicted_1 = test_model_1.best_model.predict(data)
    predicted_2 = test_model_2.best_model.predict(data)
    predicted_3 = test_model_3.best_model.predict(data)
    predicted_final = np.empty(labels.size , dtype=int)
    for i in range(labels.size):
        if predicted_1[i] == 0:
            predicted_final[i] = predicted_2[i]
        else:
            predicted_final[i] = predicted_3[i]
    return predicted_final 
predicted = predict_approach_ensemble_7(dev_data , dev_labels , test_model_1 , test_model_2, test_model_3)
print ("Best fit model F1 score :")
print(metrics.f1_score(dev_labels, predicted , average='micro'))

Best fit model F1 score :
0.8619999999999999


### Approach 8

First classify the output into 6 classes 0 ,  3 , 4 , 5 , 6 , 7. 0 is a combined class for 1 and 2. Have a separate model to classify 1 and 2 covertypes. In case the first model predicts 0 , then use the second model to further classify it as 1 or 2.

In [27]:
train_labels_ens_1 = np.where(train_labels> 2 , train_labels , 0 )
dev_labels_ens_1 = np.where(dev_labels> 2 , dev_labels , 0)

In [28]:
test_model_1 = MLModel(modelName='RandomForest')
test_model_2 = MLModel(modelName='RandomForest')
pca_components = None
scaler_type = None
params = {  'n_estimators' : [ 20, 30 , 40 ,50 , 200 ]  }
train_labels_ens_1 = np.where(train_labels> 2 , train_labels , 0 )
dev_labels_ens_1 = np.where(dev_labels> 2 , dev_labels , 0)
test_model_1.grid_search(train_data, train_labels_ens_1, dev_data, dev_labels_ens_1, params, pca=pca_components, scaler_type=scaler_type , print_out=False)
train_labels_ens_2 = train_labels[np.where(train_labels <= 2)]
train_data_ens_2 = train_data[np.where(train_labels <= 2)]
test_model_2.grid_search(train_data_ens_2, train_labels_ens_2, dev_data, dev_labels, params, pca=pca_components, scaler_type=scaler_type , print_out=False)

In [29]:
def predict_approach_ensemble_8(data , labels , test_model_1 , test_model_2):
    predicted_1 = test_model_1.best_model.predict(data)
    predicted_2 = test_model_2.best_model.predict(data)
    predicted_final = predicted_1
    for i in range(predicted_1.size):
        if predicted_1[i] == 0:
            predicted_final[i] = predicted_2[i]
    return predicted_final     
predicted = predict_approach_ensemble_8(dev_data , dev_labels , test_model_1 , test_model_2)
print ("Best fit model F1 score :")
print(metrics.f1_score(dev_labels, predicted , average='micro'))

Best fit model F1 score :
0.865
