### https://www.kaggle.com/c/tabular-playground-series-apr-2021

In [289]:
import warnings
warnings.filterwarnings('ignore')

In [290]:
import time

import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [291]:
from sklearn.metrics import accuracy_score

In [292]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [422]:
ds = pd.read_csv('train.csv', delimiter=',')

In [423]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   PassengerId  100000 non-null  int64  
 1   Survived     100000 non-null  int64  
 2   Pclass       100000 non-null  int64  
 3   Name         100000 non-null  object 
 4   Sex          100000 non-null  object 
 5   Age          96708 non-null   float64
 6   SibSp        100000 non-null  int64  
 7   Parch        100000 non-null  int64  
 8   Ticket       95377 non-null   object 
 9   Fare         99866 non-null   float64
 10  Cabin        32134 non-null   object 
 11  Embarked     99750 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 9.2+ MB


### Filling missing values

In [424]:
ds.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S


In [425]:
ds.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [426]:
columns_cat = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
columns_numeric = ['Age', 'Fare']
target = ['Survived']

### -------------ds-train--------------

In [427]:
def filling_missing_values_mean(data, column):
    
    data[column][data.Survived == 0] = data[column].fillna(data[column][data.Survived == 0].mean())
    data[column][data.Survived == 1] = data[column].fillna(data[column][data.Survived == 1].mean())
    
    return data

In [428]:
ds = filling_missing_values_mean(ds, 'Age')

In [429]:
ds = filling_missing_values_mean(ds, 'Fare')

In [430]:
ds = ds[columns_cat + columns_numeric + target]
ds.dropna(axis=0, inplace=True)

In [431]:
ds.columns

Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Age', 'Fare',
       'Survived'],
      dtype='object')

In [432]:
ds.isnull().sum()

Pclass      0
Sex         0
SibSp       0
Parch       0
Embarked    0
Age         0
Fare        0
Survived    0
dtype: int64

In [433]:
ds.shape

(99750, 8)

In [434]:
X = ds.drop('Survived', axis=1)
y = ds.Survived

In [435]:
X.shape, y.shape

((99750, 7), (99750,))

In [436]:
X.head(3)

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Age,Fare
0,1,male,2,0,S,40.553799,27.14
1,3,male,0,0,S,36.708695,13.35
2,3,male,1,2,S,0.33,71.29


In [437]:
columns = X.columns

In [438]:
columns

Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Age', 'Fare'], dtype='object')

### ----------ds-test----------------

In [439]:
ds_test = pd.read_csv('test.csv', decimal=',')

In [440]:
ds_test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,100000,3,"Holliday, Daniel",male,19.0,0,0,24745,63.01,,S
1,100001,3,"Nguyen, Lorraine",female,53.0,0,0,13264,5.81,,S
2,100002,1,"Harris, Heather",female,19.0,0,0,25990,38.91,B15315,C


In [441]:
ds_test_id = ds_test.PassengerId
X_test = ds_test.drop('PassengerId', axis=1)

In [442]:
X_test = X_test[columns]

In [443]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Pclass    100000 non-null  int64 
 1   Sex       100000 non-null  object
 2   SibSp     100000 non-null  int64 
 3   Parch     100000 non-null  int64 
 4   Embarked  99723 non-null   object
 5   Age       96513 non-null   object
 6   Fare      99867 non-null   object
dtypes: int64(3), object(4)
memory usage: 5.3+ MB


In [444]:
X_test.isnull().sum()

Pclass         0
Sex            0
SibSp          0
Parch          0
Embarked     277
Age         3487
Fare         133
dtype: int64

In [445]:
X_test['Age'] = X_test['Age'].astype('float64')
X_test['Fare'] = X_test['Fare'].astype('float64')

In [446]:
def filling_missing_values_mean_age_test(data, column, change_col):
    
    for val in data[column].unique():
        
        fillna_value = data[change_col][data[column] == val].mean()
        data[change_col][data[column] == val] = data[change_col].fillna(fillna_value)
        
    return data    

In [447]:
X_test = filling_missing_values_mean_age_test(X_test, 'Sex', 'Age')

In [448]:
X_test = filling_missing_values_mean_age_test(X_test, 'Pclass', 'Fare')

In [449]:
ds_test.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [450]:
ds_test.Ticket[ds_test.Embarked == 'C'].count()

21104

In [451]:
ds_test.Ticket[ds_test.Embarked == 'S'].count()

65066

In [452]:
ds_test.Ticket[ds_test.Embarked == 'Q'].count()

8379

In [453]:
X_test.Embarked.fillna('S', inplace=True)

In [454]:
X_test.isnull().sum() == X.isnull().sum()

Pclass      True
Sex         True
SibSp       True
Parch       True
Embarked    True
Age         True
Fare        True
dtype: bool

### ----train-----------

In [455]:
X = pd.get_dummies(X, columns=columns_cat, drop_first=True, prefix_sep='_')

In [456]:
X.shape

(99750, 20)

In [457]:
columns_for_learn = X.columns

In [458]:
X.head(3)

Unnamed: 0,Age,Fare,Pclass_2,Pclass_3,Sex_male,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Embarked_Q,Embarked_S
0,40.553799,27.14,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
1,36.708695,13.35,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0.33,71.29,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1


### --------test------------

In [459]:
X_test = pd.get_dummies(X_test, columns=columns_cat, prefix_sep='_')

In [460]:
X_test = X_test[columns_for_learn]

In [461]:
X_test.shape

(100000, 20)

In [462]:
X_test.head(3)

Unnamed: 0,Age,Fare,Pclass_2,Pclass_3,Sex_male,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Embarked_Q,Embarked_S
0,19.0,63.01,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,53.0,5.81,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,19.0,38.91,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### train test

In [463]:
# from sklearn.model_selection import train_test_split

In [464]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [465]:
# plt.figure(figsize=(5, 1))
# plt.subplot(121)
# sns.countplot(y_train);
# plt.subplot(122)
# sns.countplot(y_test);

In [466]:
# X_train.head(3)

### models

In [467]:
from sklearn.linear_model import LogisticRegressionCV, PassiveAggressiveClassifier, Perceptron, LogisticRegression
from sklearn.linear_model import RidgeClassifierCV, SGDClassifier, RidgeClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, NuSVC, SVC

from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [468]:
models = {'log_reg': [LogisticRegression(penalty='l2', C=5.0, random_state=42, n_jobs=-1), 
                      0.46696696696696693], 
          'RidgeClass': [RidgeClassifier(alpha=7.25, fit_intercept=True, normalize=False, random_state=42)],
          'AdaBoostClassifier': [AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=9, random_state=42), 
                                                   n_estimators=43, learning_rate=0.2789473684210526, random_state = 42), 
                                0.5018018018018018],
          'BaggingClassifier': [BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=9, random_state=42), 
                                                 n_estimators=450, max_features=0.525, bootstrap=True, bootstrap_features=True, 
                                                 n_jobs=-1, random_state=42), 
                                0.48018018018018016],
          
          'GradientBoostingClassifier': [GradientBoostingClassifier(learning_rate=0.075, n_estimators=300, criterion='friedman_mse', 
                                                                   min_samples_leaf=1, max_depth=3, random_state=42, 
                                                                   max_leaf_nodes=None, validation_fraction=0.1), 
                                         0.47257257257257257],
          
          'RandomForestClassifier': [RandomForestClassifier(n_estimators=1500, criterion='entropy', max_leaf_nodes=9, 
                                                           max_features=6, bootstrap=True, random_state=42, n_jobs=-1), 
                                     0.5042042042042042],
          'LGBMClassifier': [LGBMClassifier(n_estimators=100, learning_rate=0.14434343434, metric='accuracy', max_depth=9, random_state = 42), 
                             0.5004004004004005],
          'XGBClassifier': [XGBClassifier(learning_rate=0.05, max_depth=6, n_estimators=300, seed=42), 
                            0.5058058058058058]
          }

In [469]:
# y_pred = np.array(model.predict_proba(X_test)[:, 1] > 0.5).astype(int)
# y_pred

In [470]:
pred_train = pd.DataFrame()
pred_test = pd.DataFrame()

In [471]:
%%time
pred_train = pd.DataFrame()
pred_test = pd.DataFrame()

for name_model, m in models.items():
    
    preds = np.zeros(y.shape[0])
    preds_test = np.zeros(X_test.shape[0])
    
    kf = StratifiedKFold(n_splits=10 , random_state=42 , shuffle=True)
    scores = []
    print(name_model, time.strftime('%X'), end='-')
    
    for trn_idx , val_idx in kf.split(X , y):
        
        train_x = X.iloc[trn_idx]
        train_y = y.iloc[trn_idx]
        val_x = X.iloc[val_idx]
        val_y = y.iloc[val_idx]
 #---TRAIN               
        model = m[0]
        
        if name_model in ['LGBMClassifier', 'XGBClassifier']:
            if name_model in ['LGBMClassifier']:
                model.fit(train_x , train_y, eval_metric='accuracy')            
            else:
                model.fit(train_x , train_y, eval_metric = 'logloss')
                
        else:
            model.fit(train_x, train_y)
 #---PREDICT           
        if name_model in ['RidgeClass']:
            preds += model.predict(X) / kf.n_splits
            preds_test += model.predict(X_test) / kf.n_splits
            scores.append(accuracy_score(val_y, model.predict(val_x)))
        else:
            preds += model.predict_proba(X)[:, 1] / kf.n_splits
            preds_test += model.predict_proba(X_test)[:, 1] / kf.n_splits
            scores.append(accuracy_score(val_y, np.array(model.predict_proba(val_x)[:, 1] > m[1])))      
          
    if name_model in ['RidgeClass']:
        pred_train[name_model] = preds
        pred_test[name_model] = preds_test
    else:
        pred_train[name_model] = np.array(preds > m[1]).astype(int)
        pred_test[name_model] = np.array(preds_test > m[1]).astype(int)    
    
    print(time.strftime('%X'), np.mean(scores))

log_reg 21:57:50-21:58:23 0.7691729323308272
RidgeClass 21:58:23-21:58:25 0.7655538847117794
AdaBoostClassifier 21:58:25-21:59:36 0.7781152882205514
BaggingClassifier 21:59:36-22:00:51 0.7689423558897243
GradientBoostingClassifier 22:00:51-22:05:05 0.7791679197994987
RandomForestClassifier 22:05:05-22:08:11 0.7689323308270677
LGBMClassifier 22:08:11-22:08:17 0.779438596491228
XGBClassifier 22:08:17-22:09:32 0.7794887218045113
CPU times: user 33min 4s, sys: 28 s, total: 33min 32s
Wall time: 11min 41s


In [479]:
pred_train.head(3)

Unnamed: 0,log_reg,RidgeClass,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier,RandomForestClassifier,LGBMClassifier,XGBClassifier
0,0,0,1,1,1,0,1,1
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0


In [473]:
pred_train.RidgeClass = np.array(pred_train.RidgeClass >= 0.5).astype(int)

In [474]:
pred_train[[col for col in pred_train.columns if col]].sum(axis = 1).value_counts()

0    51442
8    35520
7     3167
1     2009
2     1710
3     1674
4     1630
6     1499
5     1099
dtype: int64

In [480]:
pred_test.head(3)

Unnamed: 0,log_reg,RidgeClass,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier,RandomForestClassifier,LGBMClassifier,XGBClassifier
0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,0,0,0
2,1,1,1,1,1,1,1,1


In [476]:
pred_test.RidgeClass = np.array(pred_test.RidgeClass >= 0.5).astype(int)

In [478]:
pred_test[[col for col in pred_test.columns if col]].sum(axis = 1).value_counts()

0    64511
8    22334
7     3009
1     2869
2     1834
6     1691
3     1422
5     1186
4     1144
dtype: int64

In [487]:
sample_sub = pd.read_csv('sample_submission.csv')

In [488]:
sample_sub.columns

Index(['PassengerId', 'Survived'], dtype='object')

### votting

In [481]:
y_pred = (pred_train[[col for col in pred_train.columns]].sum(axis=1) >= 4).astype(int)

In [482]:
accuracy_score(y, y_pred)

0.7803408521303258

In [483]:
y_pred_test = (pred_test[[col for col in pred_test.columns]].sum(axis=1) >= 4).astype(int)

In [489]:
answer_1 = pd.DataFrame({'PassengerId': ds_test_id, 'Survived': y_pred_test})

In [491]:
answer_1.to_csv('answer_1.csv', index=False)

### Params 2

In [26]:
# pred_train = pd.read_csv('step1.csv')

In [492]:
pred_train.columns

Index(['log_reg', 'RidgeClass', 'AdaBoostClassifier', 'BaggingClassifier',
       'GradientBoostingClassifier', 'RandomForestClassifier',
       'LGBMClassifier', 'XGBClassifier'],
      dtype='object')

In [493]:
# pred_train.drop('Unnamed: 0', axis=1, inplace=True)

In [494]:
# X_train, X_test, y_train, y_test = train_test_split(pred_train, y, test_size=0.25, random_state=42)

In [495]:
models_2 = {'log_reg': [LogisticRegression(penalty='l2', C=0.16335714285714284, random_state = 42, n_jobs=-1), 
                      0.54505], 
          'RidgeClass': [RidgeClassifier(alpha=8.165102040816327, fit_intercept=True, normalize=False, random_state=42)],
          'AdaBoostClassifier': [AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=7, max_depth=7, random_state=42), 
                               n_estimators=60, learning_rate=0.20408163265306123, random_state = 42), 
                                0.5],
          'BaggingClassifier': [BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=2, random_state=42),
                                                  n_estimators=141, max_features=1.0, bootstrap=True, bootstrap_features=True,
                                                  n_jobs=-1, random_state=42), 
                                0.5],
          
          'GradientBoostingClassifier': [GradientBoostingClassifier(learning_rate=0.1, n_estimators=45, criterion='friedman_mse', max_depth=4,
                                                                    random_state=42, max_leaf_nodes=None, validation_fraction=0.1), 
                                         0.4941942],
          
          'RandomForestClassifier': [RandomForestClassifier(n_estimators=1500, criterion='entropy', max_leaf_nodes=None, max_features=5, 
                                                            bootstrap=True, random_state=33, n_jobs=-1), 
                                     0.521821821822],
          'LGBMClassifier': [LGBMClassifier(n_estimators=39, learning_rate=0.023469387755102038, metric='accuracy', max_depth=6, random_state = 42), 
                             0.49],
          'XGBClassifier': [XGBClassifier(learning_rate=1.0, max_depth=4, n_estimators=30, seed=42), 
                            0.5]
          }

In [496]:
pred_train_2 = pd.DataFrame()
pred_test_2 = pd.DataFrame()

In [497]:
for name_model, m in models_2.items():
    
    preds = np.zeros(y.shape[0])
    preds_test = np.zeros(X_test.shape[0])
    
    kf = StratifiedKFold(n_splits=5 , random_state=42 , shuffle=True)
    scores = []
    print(name_model, time.strftime('%X'), end='-')
    
    for trn_idx , val_idx in kf.split(pred_train, y):
        
        train_x = pred_train.iloc[trn_idx]
        train_y = y.iloc[trn_idx]
        val_x = pred_train.iloc[val_idx]
        val_y = y.iloc[val_idx]
                
        model = m[0]
        
        if name_model in ['LGBMClassifier', 'XGBClassifier']:
            if name_model in ['LGBMClassifier']:
                model.fit(train_x , train_y, eval_metric='accuracy')            
            else:
                model.fit(train_x , train_y, eval_metric = 'logloss')
                
        else:
            model.fit(train_x, train_y)
            
        if name_model in ['RidgeClass']:
            preds += model.predict(pred_train) / kf.n_splits
            preds_test += model.predict(pred_test) / kf.n_splits
            scores.append(accuracy_score(val_y, model.predict(val_x)))
        else:
            preds += model.predict_proba(pred_train)[:, 1] / kf.n_splits
            preds_test += model.predict_proba(pred_test)[:, 1] / kf.n_splits
            scores.append(accuracy_score(val_y, np.array(model.predict_proba(val_x)[:, 1] > m[1])))      
          
    if name_model in ['RidgeClass']:
        pred_train_2[name_model] = preds
        pred_test_2[name_model] = preds_test
    else:
        pred_train_2[name_model] = np.array(preds > m[1]).astype(int)
        pred_test_2[name_model] = np.array(preds_test > m[1]).astype(int)    
    
    print(time.strftime('%X'), np.mean(scores))

log_reg 22:18:19-22:18:24 0.7882406015037595
RidgeClass 22:18:24-22:18:24 0.7886917293233082
AdaBoostClassifier 22:18:24-22:18:43 0.7901353383458647
BaggingClassifier 22:18:43-22:18:52 0.7893834586466165
GradientBoostingClassifier 22:18:52-22:18:58 0.7902355889724311
RandomForestClassifier 22:18:58-22:20:37 0.7899749373433584
LGBMClassifier 22:20:37-22:20:38 0.7899649122807018
XGBClassifier 22:20:38-22:20:40 0.7901754385964912


In [500]:
pred_train_2.head(3)

Unnamed: 0,log_reg,RidgeClass,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier,RandomForestClassifier,LGBMClassifier,XGBClassifier
0,1,0,1,1,1,1,1,1
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0


In [499]:
pred_train_2.RidgeClass = np.array(pred_train.RidgeClass >= 0.5).astype(int)

In [501]:
pred_train_2[[col for col in pred_train_2.columns if col]].sum(axis = 1).value_counts()

0    53766
8    37846
1     3806
7     3240
4      444
5      274
6      260
3      101
2       13
dtype: int64

In [502]:
y_pred = (pred_train_2[[col for col in pred_train_2.columns]].sum(axis=1) >= 4).astype(int)

In [503]:
accuracy_score(y, y_pred)

0.7904761904761904

In [507]:
pred_test_2.head(3)

Unnamed: 0,log_reg,RidgeClass,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier,RandomForestClassifier,LGBMClassifier,XGBClassifier
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1


In [506]:
pred_test_2.RidgeClass = np.array(pred_test_2.RidgeClass >= 0.5).astype(int)

In [508]:
pred_test_2[[col for col in pred_test_2.columns if col]].sum(axis = 1).value_counts()

0    70840
8    27824
3      457
5      370
6      273
7      124
2       47
1       38
4       27
dtype: int64

In [515]:
y_pred_test_2 = (pred_test_2[[col for col in pred_test_2.columns]].sum(axis=1) >= 5).astype(int)

In [516]:
answer_2 = pd.DataFrame({'PassengerId': ds_test_id, 'Survived': y_pred_test_2})

In [517]:
answer_2.to_csv('answer_5.csv', index=False)