In [2]:
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

from sklearn.svm import SVC, LinearSVC
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve


import warnings
warnings.filterwarnings('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [11]:
from mlens.ensemble import SuperLearner
from sklearn.metrics import accuracy_score
seed = 2017
np.random.seed(seed)

In [3]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
train_df = pd.read_csv('Datasets/train.csv')
test_df = pd.read_csv('Datasets/test.csv')
combine_df = pd.concat([train_df,test_df])

In [4]:
combine_df['Title'] = combine_df['Name'].apply(lambda x: x.split(', ')[1]).apply(lambda x: x.split('.')[0])
combine_df['Title'] = combine_df['Title'].replace(['Don','Dona', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col','Sir','Dr'],'Mr')
combine_df['Title'] = combine_df['Title'].replace(['Mlle','Ms'], 'Miss')
combine_df['Title'] = combine_df['Title'].replace(['the Countess','Mme','Lady','Dr'], 'Mrs')
df = pd.get_dummies(combine_df['Title'],prefix='Title')
combine_df = pd.concat([combine_df,df],axis=1)
combine_df['Name_Len'] = combine_df['Name'].apply(lambda x: len(x))
combine_df['Name_Len'] = pd.qcut(combine_df['Name_Len'],5)
#Dead_female_family & Survive_male_family
combine_df['Surname'] = combine_df['Name'].apply(lambda x:x.split(',')[0])
dead_female_surname = list(set(combine_df[(combine_df.Sex=='female') & (combine_df.Age>=12)
                              & (combine_df.Survived==0) & ((combine_df.Parch>0) | (combine_df.SibSp > 0))]['Surname'].values))
survive_male_surname = list(set(combine_df[(combine_df.Sex=='male') & (combine_df.Age>=12)
                              & (combine_df.Survived==1) & ((combine_df.Parch>0) | (combine_df.SibSp > 0))]['Surname'].values))
combine_df['Dead_female_family'] = np.where(combine_df['Surname'].isin(dead_female_surname),0,1)
combine_df['Survive_male_family'] = np.where(combine_df['Surname'].isin(survive_male_surname),0,1)
combine_df = combine_df.drop(['Name','Surname'],axis=1)

group = combine_df.groupby(['Title', 'Pclass'])['Age']
combine_df['Age'] = group.transform(lambda x: x.fillna(x.median()))
combine_df = combine_df.drop('Title',axis=1)
combine_df['IsChild'] = np.where(combine_df['Age']<=12,1,0)
# combine_df['Age'] = pd.cut(combine_df['Age'],5)
# combine_df = combine_df.drop('Age',axis=1)
combine_df['Ticket_Lett'] = combine_df['Ticket'].apply(lambda x: str(x)[0])
combine_df['Ticket_Lett'] = combine_df['Ticket_Lett'].apply(lambda x: str(x))

combine_df['High_Survival_Ticket'] = np.where(combine_df['Ticket_Lett'].isin(['1', '2', 'P']),1,0)
combine_df['Low_Survival_Ticket'] = np.where(combine_df['Ticket_Lett'].isin(['A','W','3','7']),1,0)
combine_df = combine_df.drop(['Ticket','Ticket_Lett'],axis=1)


In [5]:
combine_df.columns

Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Parch', 'PassengerId', 'Pclass',
       'Sex', 'SibSp', 'Survived', 'Title_Master', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'Name_Len', 'Dead_female_family', 'Survive_male_family',
       'IsChild', 'High_Survival_Ticket', 'Low_Survival_Ticket'],
      dtype='object')

In [37]:
features = combine_df.drop(["PassengerId","Survived"], axis=1).columns
le = LabelEncoder()
for feature in features:
    le = le.fit(combine_df[feature])
    combine_df[feature] = le.transform(combine_df[feature])

In [None]:
combine_df.to_csv('Datasets/combine.csv')

In [4]:
combine_df = pd.read_csv('Datasets/combine.csv')

In [5]:
X_all = combine_df.iloc[:891,:].drop(["PassengerId","Survived"], axis=1)
Y_all = combine_df.iloc[:891,:]["Survived"]
X_test = combine_df.iloc[891:,:].drop(["PassengerId","Survived"], axis=1)

In [6]:
X_train, X_cv, Y_train, Y_cv = train_test_split(X_all.values, Y_all.values, test_size=0.20)

In [46]:
# def get_models():
    
#   # """Generate a library of base learners."""
#     nb = GaussianNB()
# #     svc = SVC(C=100, probability=True)
#     knn = KNeighborsClassifier(n_neighbors=3)
#     lr = LogisticRegression()
#     rf = RandomForestClassifier(n_estimators=300,min_samples_leaf=4,class_weight={0:0.745,1:0.255})
#     gbdt = GradientBoostingClassifier(n_estimators=500,learning_rate=0.03,max_depth=3)
#     xgbGBDT = XGBClassifier(max_depth=2, n_estimators=280, learning_rate=0.05)
#     lgb = LGBMClassifier(max_depth=3, n_estimators=500, learning_rate=0.02)
#     models = {# 'svm': svc,
#             'knn': knn,
#             'naive bayes': nb,
#             'random forest': rf,
#             'logistic': lr,
#             'gbdt': gbdt,
# #             'xgbGBDT': xgbGBDT,
# #             'lgb': lgb,
             
#              }

#     return models


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


In [47]:
base_learners = get_models()

In [48]:
base_learners.values()

dict_values([KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform'), GaussianNB(priors=None, var_smoothing=1e-09), RandomForestClassifier(bootstrap=True, class_weight={0: 0.745, 1: 0.255},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=4,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False), GradientBoostingClassifier(criterion='friedman_mse', init=None,
  

In [7]:
meta_learner = GradientBoostingClassifier(n_estimators=500,learning_rate=0.03,max_depth=3)
# GradientBoostingClassifier(
#    n_estimators=1000,
#    loss="exponential",
#    max_features=4,
#    max_depth=3,
#    subsample=0.5,
#    learning_rate=0.005, 
# )



In [17]:
# !pip install mlens

Collecting mlens
[?25l  Downloading https://files.pythonhosted.org/packages/0b/f7/c04bda423ac93ddb54bc4c3a21c79c9a24bc83844efc30dc4c11c289e894/mlens-0.2.3-py2.py3-none-any.whl (227kB)
[K    100% |████████████████████████████████| 235kB 5.6MB/s ta 0:00:01
Installing collected packages: mlens
Successfully installed mlens-0.2.3


In [8]:
# from mlens.ensemble import SuperLearner
# from sklearn.metrics import accuracy_score
# seed = 2017
# np.random.seed(seed)

# # Instantiate the ensemble with 10 folds
# ensemble = SuperLearner(folds=10,scorer=accuracy_score,verbose=2,random_state=seed,backend="multiprocessing")

# # Build the first layer
# # Add the base learners and the meta learner
# ensemble.add(list(base_learners.values())) 
# # Build the second layer
# ensemble.add([XGBClassifier(max_depth=2, n_estimators=280, learning_rate=0.05)
#               ,LGBMClassifier(max_depth=3, n_estimators=500, learning_rate=0.02)])
# # Attach the final meta estimator
# ensemble.add_meta(meta_learner)

# # Train the ensemble
# X_train, X_cv, Y_train, Y_cv = train_test_split(X_all.values, Y_all.values, test_size=0.20)
# ensemble.fit(X_train, Y_train)

In [9]:
from mlens.ensemble import SequentialEnsemble

[MLENS] backend: threading


In [12]:
knn = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression()
rf = RandomForestClassifier(n_estimators=300,min_samples_leaf=4,class_weight={0:0.745,1:0.255},random_state=seed)
gbdt = GradientBoostingClassifier(n_estimators=500,learning_rate=0.03,max_depth=3)
xgbGBDT = XGBClassifier(max_depth=2, n_estimators=280, learning_rate=0.05)
lgb = LGBMClassifier(max_depth=3, n_estimators=500, learning_rate=0.02)

In [13]:
ensemble = SequentialEnsemble()

# The initial layer is a blended layer, same as a layer in the BlendEnsemble
ensemble.add('blend',
             [lr,rf])

# The second layer is a stacked layer, same as a layer of the SuperLearner
ensemble.add('stack', [lr, rf,gbdt,lgb])

# The third layer is a subsembled layer, same as a layer of the Subsemble
ensemble.add('subsemble', [xgbGBDT,knn])

# The meta estimator is added as in any other ensemble
ensemble.add_meta(lr)



SequentialEnsemble(array_check=None, backend=None,
          layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=None, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=BlendIndex(X=None, raise_on_exception=...rer=None)],
   n_jobs=-1, name='group-3', raise_on_exception=True, transformers=[])],
   verbose=0)],
          model_selection=False, n_jobs=None, raise_on_exception=True,
          random_state=None, sample_size=20, scorer=None, shuffle=False,
          verbose=False)

In [None]:
preds = ensemble.fit(X_all.values, Y_all.values).predict(X_test.values).astype(int) 

In [None]:
p_sl = ensemble.predict(X_test.values)

In [35]:
p_sl

array([[0.98386216, 0.01613782],
       [0.14633349, 0.85366654],
       [0.9370754 , 0.06292461],
       [0.9868736 , 0.01312636],
       [0.02136476, 0.97863525],
       [0.9781376 , 0.02186241],
       [0.28253037, 0.71746963],
       [0.8557064 , 0.14429362],
       [0.018398  , 0.981602  ],
       [0.9793516 , 0.0206484 ],
       [0.97612894, 0.02387108],
       [0.98009115, 0.01990883],
       [0.0106215 , 0.9893785 ],
       [0.98748934, 0.01251066],
       [0.05787614, 0.94212383],
       [0.01458306, 0.98541695],
       [0.80458164, 0.19541834],
       [0.9050554 , 0.0949446 ],
       [0.91446674, 0.08553328],
       [0.03858819, 0.96141183],
       [0.9906501 , 0.00934986],
       [0.06507163, 0.93492836],
       [0.01217929, 0.9878207 ],
       [0.69616795, 0.30383205],
       [0.01506704, 0.98493296],
       [0.9702842 , 0.02971578],
       [0.01178834, 0.98821163],
       [0.95147425, 0.04852575],
       [0.5489112 , 0.4510888 ],
       [0.98843163, 0.01156835],
       [0.

In [24]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": p_sl
    })
submission.to_csv('submission_superlearner.csv', index=False)

Exception: Data must be 1-dimensional