# Submission Information:

### Team Member 1:
* UNI:  qh2174
* Name: Qiong Hu

### Team Member 2 [optional]:
* UNI:  qc2217
* Name: Qi Chen

# Step0 - Import Libraries, Load Data [0 points]

This is the basic step where you can load the data and create train and test sets for internal validation as per your convinience.

In [10]:
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict,GridSearchCV, StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures,OneHotEncoder,LabelEncoder,Imputer,FunctionTransformer,scale
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,VotingClassifier,AdaBoostClassifier,ExtraTreesClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.decomposition import PCA
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

## Load data

We get X_complete, Y_complete from data.csv, and get holdout_data from holdout.csv.  

In [3]:
def load_data(filename1, filename2):
    X_complete = pd.read_csv(filename1, usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19])
    Y_complete = pd.read_csv(filename1, usecols=[20])
    X_holdout = pd.read_csv(filename2)
    return X_complete, Y_complete,X_holdout
X_complete,Y_complete, X_holdout = load_data('data.csv','holdout.csv')

# Step1 - Exploration and Preparation [10 points]

In this step, we expect you to look into the data and try to understand it before modeling. This understanding may lead to some basic data preparation steps which are common across the two model sets required.

## Data preparation

In the 'subscriebed' column in Y_complete, we replace 'no' with 0, and replace 'yes' with 1. 

We remove the 'Duration' column in X_complete and holdout_data.

In [15]:
def transform_y(y):
    y[y=="yes"] = 1
    y[y=="no"] = 0
    y = np.asarray(y)
    y_list = []
    for item in y:
        y_list.append(item[0])
    y = np.asarray(y_list)
    return y

def feature_selection(filename, is_train):
    if is_train: #training data
        category_index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 14]
        continuous_index = [0, 11, 12, 13, 15, 16, 17, 18, 19]
    else: #testing data
        holdout_id = np.array(pd.read_csv(filename))[:,0]
        holdout_id = holdout_id.astype(int)
        category_index = [2, 3, 4, 5, 6, 7, 8, 9, 10, 15]
        continuous_index = [1, 12, 13, 14, 16, 17, 18, 19, 20]
        
    data_category = pd.read_csv(filename, dtype=np.str, usecols=category_index)
    data_continuous = pd.read_csv(filename, dtype=np.float32, usecols=continuous_index)
    x = np.append(data_category, data_continuous, axis=1)
    if is_train: #training data (with y)
        return x
    else: #testing data (without y)
        return x,holdout_id
    

X = feature_selection('data.csv',True)
holdout_data, holdout_id = feature_selection('holdout.csv',False)
Y = transform_y(Y_complete)
   
    

# Step2 - ModelSet1 [35 points]

In this step, we expect you to perform the following steps relevant to the models you choose for set1:

* feature engineering
* validation
* feature selection
* final model selection

You may select up to 5 models in this step for the purpose of final ensemble. Any classification algorithm covered in class apart from tree-based models can be tested here.

## Data process we do 

We scale the continuous variables.

We use LabelEncoder and OneHotEncoder to transform categorical data into numeric data in X and holdout_data. 

Then we split X into X_train, y_train, X_test, y_test.  

Since the data is imbalanced, we tried to oversample and undersample the data. And we find that oversampling has a better performance. So we comment the undersampling process.

And we choose the features with top 90% f score. 

The model we choose are: Logistic Regression, Knn, Nearest Centroids, Gaussian Naive Bayes, SVM models. We use GridSearchCV with different number of cv to in each model find the best parameters.

#### Model select
Logistic Regression roc score is about 0.8018908687. Knn roc score is about 0.7513665713. Nearest Centroid roc score is about 0.7432849. Gaussian Naive Bayes roc score is about 0.774527286. SVM roc score is about 0.77278532.

Logistic Regression has the best performance. So we will choose it in later ensemble. 

In [16]:
def prepare_data(x, y, x_holdout):
    
    # Scale 
    select_categorical = x[:, 0:10]
    select_continuous = x[:, 10:]
    select_categorical_holdout = x_holdout[:, 0:10]
    select_continuous_holdout = x_holdout[:, 10:]
    select_continuous = scale(select_continuous)
    select_continuous_holdout = scale(select_continuous_holdout)
    x = np.append(select_categorical, select_continuous, axis=1)
    x_holdout = np.append(select_categorical_holdout, select_continuous_holdout, axis=1)
              
    # LabelEncoder
    le = LabelEncoder()
    for i in range(10):
        le.fit(x[:, i])
        x[:, i] = le.transform(x[:, i])
        x_holdout[:, i] = le.transform(x_holdout[:, i])

    # OneHotEncoder
    encoder = OneHotEncoder(categorical_features=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                            sparse=False)  # input must be numeric (not string)
    encoder.fit(x)
    x = encoder.transform(x)
    x_holdout = encoder.transform(x_holdout)
    
    # Split data into train data and test data
    X_train, X_test, y_train, y_test = train_test_split(x, y, random_state = 56)

    #oversample dataset
    ros = RandomOverSampler()
    X_train, y_train = ros.fit_sample(X_train, y_train)
    
    '''#undersamle dataset
    rus = RandomUnderSampler()
    X_train, y_train = rus.fit_sample(X_train, y_train)
    '''
    
    # feature selection
    select=SelectPercentile(score_func=f_regression,percentile=90)
    select.fit(X_train,y_train)
    X_train=select.transform(X_train)
    X_test=select.transform(X_test)
    
    return X_train, y_train, X_test, y_test, x_holdout, x

X_train, y_train, X_test, y_test,X_holdout, X = prepare_data(X, Y, holdout_data)

In [10]:
#Logistic Regression model
def logistic_regression_classifier(X_train,y_train, X_test, y_test):
    params = {'C':[0.1, 0.5, 1, 5, 10, 100]}
    #oversample: clf = GridSearchCV(LogisticRegression(class_weight='balanced'), params, cv=10, n_jobs=-1).fit(X_train, y_train)
    clf = GridSearchCV(LogisticRegression(), params, cv=10, n_jobs=-1).fit(X_train, y_train)
    y_predict = clf.predict_proba(X_test)[:,1]
    lr_score= roc_auc_score(y_test, y_predict)
    return lr_score, clf

In [None]:
#knn model
def knn_classifier(X_train,y_train, X_test, y_test):
    k = np.arange(20)+1
    parameters = {'n_neighbors': k}
    knn = KNeighborsClassifier()
    clf = GridSearchCV(knn,parameters,cv=10)
    clf.fit(X_train,y_train)
    knn_score = roc_auc_score(y_test, knn.predict_proba(X_test)[:,1])
    return knn_score, clf

In [3]:
#Nearest Centroids
def nc_classifier(X_train,y_train, X_test, y_test):
    nc = NearestCentroid()
    nc.fit(X_train,y_train)
    nc_score = roc_auc_score(y_test, nc.predict(X_test))
    return nc_score, nc

In [4]:
# Gaussian Naive Bayes model
def naive_bayes_classifier(X_train,y_train, X_test, y_test):
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    nb_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    return nb_score, clf

In [2]:
# SVM
def svm_classifier(X_train,y_train, X_test, y_test):
    parameters = {'kernel': ['linear', 'rbf'], 'C': [0.01, 0.05, 0.1, 0.5, 1]}
    svc = SVC()
    clf = GridSearchCV(svc,parameters,cv=10)
    clf.fit(X_train, y_train)
    svm_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    return svm_score, clf

## Our try
Before we get our highest score, since the data has some 'unknown' value, we processed the data by imputing the data('most_frequenct'). But the roc score is not that high. So we give up these processing steps. 

#### The following is the code we tried but didn't choose.

def select_feature(filename, is_train):

    if is_train: #training data
        category_index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 14]
        continuous_index = [0, 11, 13, 15, 16, 17, 18, 19]
    else: #testing data
        holdout_id = np.array(pd.read_csv(filename))[:,0]
        holdout_id = holdout_id.astype(int)
        category_index = [2, 3, 4, 5, 6, 7, 8, 9, 10, 15]
        continuous_index = [1, 12, 13, 14, 16, 17, 18, 19, 20]

    #for imputing
    missing = {"job": ['unknown'], "marital_status": ['unknown'], "education": ['unknown'],
               'credit_default': ['unknown'], 'housing': ['unknown'], 'loan': ['unknown']}
    
    data_category = pd.read_csv(filename, dtype=np.str, usecols=category_index, na_values=missing)
    data_continuous = pd.read_csv(filename, dtype=np.float32, usecols=continuous_index, na_values=missing)
    x = np.append(data_category, data_continuous, axis=1)
    if is_train: #training data (with y)
        y = pd.read_csv('data.csv', usecols=['subscribed'])
        return x,y
    else: #testing data (without y)
        return x,holdout_id


def data_processing(x, x_holdout):
    # Scale 
    select_categorical = x[:, 0:10]
    select_continuous = x[:, 10:]
    select_categorical_holdout = x_holdout[:, 0:10]
    select_continuous_holdout = x_holdout[:, 10:]
    select_continuous = scale(select_continuous)
    select_continuous_holdout = scale(select_continuous_holdout)
    x = np.append(select_categorical, select_continuous, axis=1)
    x_holdout = np.append(select_categorical_holdout, select_continuous_holdout, axis=1)
    
    #for imputing
    x = pd.DataFrame(x)
    x = x.fillna('null')
    x = np.asarray(x)
    
    # LabelEncoder
    le = LabelEncoder()
    for i in range(10):
        le.fit(x[:, i])
        x[:, i] = le.transform(x[:, i])
        
        #imputing: fill Nan
        l = list(le.inverse_transform(x[:,i].tolist()))
        indices = [j for j, k in enumerate(l) if k == "null"]
        for j in indices:
            x[j][i] = np.nan
    
    # Impute x
    imp = Imputer(strategy='most_frequent').fit(x)
    x = imp.transform(x)

    # OneHotEncoder
    encoder = OneHotEncoder(categorical_features=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                            sparse=False)  # input must be numeric (not string)
    encoder.fit(x)
    x = encoder.transform(x)
    return x, x_holdout


# Step3 - ModelSet2 [35 points]

In this step, we expect you to perform the following steps relevant to the models you choose for set2:

* feature engineering
* validation
* feature selection
* final model selection

You may select up to 5 models in this step for the purpose of final ensemble. We encourage you to try decition tree, random forest and gradient boosted tree methods here and pick the one which you think works best.

## Data process
The data processing (feature engineering, feature selection) is the same with that of Step2 ModelSet1.

#### Model select

We choose Decision Tree, GradientBoosting, Random Forest, AdaBoosting and ExtraTreesClassifier models in ModelSet2.

Decision Tree roc score is about 0.7633922753. GradientBoosting roc score is about 0.8008535437.Random Forest roc score is about 0.8058265199. AdaBoosting roc score is about 0.791998019343. ExtraTreesClassifier roc score is about 0.7811898194. 

So we will choose GradientBoosting, Random Forest, AdaBoosting, ExtraTreesClassifier in the later ensemble.

In [None]:
#Decision Tree
def decision_tree(X_train,y_train, X_test, y_test):
    dt = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
    dt.fit(X_train, y_train)
    dt_score = roc_auc_score(y_test, dt.predict_proba(X_test)[:,1])
    return dt_score, dt

In [5]:
#GradientBoosting Model
def gradient_boosting(X_train,y_train, X_test, y_test):
    params = {'n_estimators':range(30,100,5)}
    GB = GradientBoostingClassifier(learning_rate=0.08,max_depth=8,min_samples_split=500, 
                                    min_samples_leaf=50,max_features='sqrt', random_state=20)
    clf = GridSearchCV(GB, params, cv=3, n_jobs=-1).fit(X_train, y_train)
    GB_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    return GB_score, clf

In [6]:
#Random Forest Model
def random_forest(X_train,y_train, X_test, y_test):
    params = {'n_estimators':range(50,250,50)} #'max_depth':[3,6,9]
    #RF = RandomForestClassifier(n_estimators=150, n_jobs=-1,class_weight='balanced')
    RF = RandomForestClassifier(max_depth = 6, n_jobs=-1, random_state=0)
    clf = GridSearchCV(RF, params, cv=5).fit(X_train, y_train)
    rf_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    return rf_score, clf

In [8]:
#AdaBoosting Model
def ada_boosting(X_train,y_train, X_test, y_test):
    params = {'learning_rate':[0.01, 0.05, 0.1, 0.5, 1], 'n_estimators': range(50,300,50) }
    Ada = AdaBoostClassifier(random_state=0)
    clf = GridSearchCV(Ada, params, cv=3).fit(X_train, y_train)
    ada_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    return ada_score, clf

In [2]:
# ExtraTreesClassifier
def extra_tree(X_train,y_train, X_test, y_test):
    params = { 'n_estimators': range(50,250,50) }
    extra = ExtraTreesClassifier(random_state=0, n_estimators=300)
    clf = GridSearchCV(extra, params, cv=3).fit(X_train, y_train)
    et_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    return ada_score, clf

# Step4 - Ensemble [20 points + 10 Bonus points]

In this step, we expect you to use the models created before and create new predictions. You should definitely try poor man's stacking but we encourage you to think of different ensemble techniques as well. We will judge your creativity and improvement in model performance using ensemble models and you can potentially earn 10 bonus points here.

#### We created three ensemble classifiers. Finally we choose the last ensemble: blending_classifier.




### voting classifier
we ensembled three classifiers (Logistic Regression, Random Forest, Gradient Boosting) with the weight of 10%, 30%, 60% respectively. The roc score is about 0.798033751247. 

In [7]:
#Voting classifier
def voting_classifier(X_train,y_train, X_test, y_test):
    voting = VotingClassifier([('LogisticRegression',LogisticRegression(C=1)),
                          ('RandomForest', RandomForestClassifier(max_depth=6, n_estimators=150, random_state=0)),
                          ('GradientBoosting', GradientBoostingClassifier(learning_rate=0.08, max_depth=8,n_estimators=85,random_state=20))
                          ],
                         voting='soft', weights=[1,3,6])
    voting.fit(X_train,y_train)
    lr,tree,gb = voting.estimators_
    score = roc_auc_score(y_test, voting.predict_proba(X_test)[:,1])
    return score, voting

### Poor man stacking
We use voting with three classifiers(Logistic Regression, Random Forest, Gradient Boosting). Then use Logistic Regression on the voting results.  The roc score is about 0.70 before we do resampling.

#### After we resample it, we get a better performance.
When we use undersampling, the roc score improves to 0.75. When we use oversampling, the roc score jumps to 0.962469072977(which is weird but we don't know why). So we finally choose oversampling.

In [1]:
def poor_man(X_train, y_train):
    voting = VotingClassifier([('LogisticRegression',LogisticRegression(C=1)),
                          ('RandomForest', RandomForestClassifier(max_depth=6, n_estimators=150, random_state=0)),
                          ('GradientBoosting', GradientBoostingClassifier(learning_rate=0.08,
                                                                          max_depth=8,n_estimators=85,random_state=0))
                          ], voting='soft', weights=[1,3,6])
    reshaper = FunctionTransformer(lambda X_: np.rollaxis(X_, 1).reshape(-1,6)[:, 1::2], validate=False)
    stacking = make_pipeline(voting, reshaper, LogisticRegression(C=100))
    stacking.fit(X_train, y_train)
    return np.mean(cross_val_score(stacking, X_train, y_train, cv=5, scoring='roc_auc')), stacking

### stacking_classifier

Firstly, we use KNeighborsClassifier, RandomForestClassifier. Then a meta-classifier "LogisticRegression" is used to do fitting based on the outputs. Before we do resampling, the roc score is about 0.71, which is not a good performance. 

#### After we resample it, we get a better performance.
When we use undersampling, the roc score improves to 0.76. When we use oversampling, the roc score jumps to 0.96(which is weird but we don't know why). So we finally choose oversampling.

In [9]:
#Stacking classifier
def stacking_classifier(X,Y, X_holdout,holdout_id):
    #oversample dataset
    ros = RandomOverSampler()
    X_oversample, y_oversample = ros.fit_sample(X, Y)
    
    clf1 = AdaBoostClassifier(random_state=0,n_estimators=150,learning_rate=0.1)
    clf2 = RandomForestClassifier(random_state=0,n_estimators=150)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 
                          meta_classifier=lr)

    print('3-fold cross validation:\n')

    for clf, label in zip([clf1, clf2, clf3, sclf], 
                          ['Ada Boost', 
                           'Random Forest', 
                           'Naive Bayes',
                           'StackingClassifier']):
        
        scores = cross_val_score(clf, X_oversample, y_oversample, cv=3, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    
        scores = cross_val_score(clf, X_oversample, y_oversample, cv=3, scoring='roc_auc')
        print("roc_auc: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    #y_holdout = sclf.predict_proba(X_holdout)[:, 1]
    #y_holdout = cross_val_predict(sclf, X_holdout, cv=3)
    #save_csv_numpy(holdout_id, y_holdout)
    return scores

#stacking_classifier(X,Y,X_holdout,holdout_id)

### Blending classifier ensemble
In this ensemble, we stack 5 base models(RandomForests, ExtraTrees, GradientBoosting) in StratifiedKfold with the meta-classifier GradientBoosting.

We use two For-loop. In the first For-loop, there are 5 different classifiers. In the second For-loop, we use StratifiedKfold to split the data into 5 train-test folds. In each split, we create predictions for X_test and X_holdout. When creating predictions for the X_holdout, we take an average of the out-of-fold predictors.

Finally, the meta-classifier GradientBoosting is used to do fitting based on the outputs.


In [10]:
def blending_classifier(X,Y, X_holdout,holdout_id):
    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy', random_state=0),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini', random_state=0),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy', random_state=0),
            GradientBoostingClassifier(learning_rate=0.08, subsample=0.8, max_depth=8, n_estimators=85, random_state=20)]

    skf = StratifiedKFold(n_splits=5) #5 folders

    dataset_blend_train = np.zeros((X.shape[0], len(clfs))) #to save X_test prediction
    dataset_blend_test = np.zeros((X_holdout.shape[0], len(clfs)))#to save X_holdout prediction

    for j, clf in enumerate(clfs):
        dataset_blend_test_j = np.zeros((X_holdout.shape[0], 5))
        for i, (train_index, test_index) in enumerate(skf.split(X, Y)):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]
            clf.fit(X_train, y_train)
            dataset_blend_train[test_index, j] = clf.predict_proba(X_test)[:, 1]
            dataset_blend_test_j[:, i] = clf.predict_proba(X_holdout)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1) # get average of dataset_blend_test_j


    clf1 = GradientBoostingClassifier()
    clf1.fit(dataset_blend_train, Y)
    y_submission = clf1.predict_proba(dataset_blend_test)[:, 1]
    y_scores = clf1.predict_proba(dataset_blend_train)[:, 1]
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    #save_csv_numpy(holdout_id, y_submission)
    return roc_auc_score(Y, y_scores)

In [11]:
def save_csv_numpy(holdout_id, y_submission):
    tmp = np.vstack([holdout_id, y_submission]).T
    np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f', header='ID,subscribed', comments='')

In [12]:
def save_csv(X_holdout, holdout_id, clf):
    holdout_predict = clf.predict_proba(X_holdout)
    holdout = np.zeros(shape=(len(holdout_id),2))
    holdout_id = pd.DataFrame(holdout_id)
    holdout_predict = pd.DataFrame(holdout_predict)

    holdout = pd.concat([holdout_id, holdout_predict[1]], axis = 1)
    holdout.columns = ["ID","subscribed"]
    holdout.to_csv('holdout_predict.csv', index=False)

In [18]:
def test(X_train,y_train, X_test, y_test, X_holdout, X, Y, holdout_id):  
    #get score from logistic regression classifier
    score, clf = logistic_regression_classifier(X_train,y_train, X_test, y_test)
    #check score
    print (score)
    assert score > 0.79
    
    #get score from gradient boosting classifier
    score,clf = gradient_boosting(X_train,y_train, X_test, y_test)
    #check score
    print (score)
    assert score > 0.79
    
    #get score from random forest classifier
    score,clf = random_forest(X_train,y_train, X_test, y_test)
    #check score
    print (score)
    assert score > 0.79
    
    #get score from voting classifier
    score,clf = voting_classifier(X_train,y_train, X_test, y_test)
    #check score
    print (score)
    assert score > 0.79
    
    #get score from blending classifier
    score = blending_classifier(X,Y, X_holdout, holdout_id)
    #check score
    print (score)
    assert score > 0.79
    
test(X_train,y_train, X_test, y_test, X_holdout, X, Y, holdout_id)

0.803163021884
0.806829058849
