<div style="background-color: blue; padding: 10px; border-radius: 10px">
    <h1 align="center" style="color: lightblue;">Importing Libraries</h1>
</div>

In [10]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd
import pickle as pkl
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold  

<div style="background-color: blue; padding: 10px; border-radius: 10px">
    <h1 align="center"><font color='lightblue'>Reading Train and Test Datasets</font></h1>
</div>

In [2]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the savings.

    Column handling:
      * signed / unsigned NumPy integers -> the narrowest integer type of the
        same signedness whose range contains the column's min and max
        (bounds are inclusive, so e.g. a max of exactly 127 still fits int8);
      * NumPy floats -> float16 (only if ``use_float16``), else float32,
        else float64;
      * object and pandas string-dtype columns -> ``category``;
      * everything else (bool, datetime, categorical, nullable extension
        dtypes, ...) is left untouched, because a min/max range check is
        either meaningless or would raise for those dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place and also returned.
    use_float16 : bool, default True
        Allow downcasting floats to float16. float16 keeps only about three
        significant decimal digits, so pass ``False`` when that rounding is
        not acceptable.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with converted columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a range we can test.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN (NaN fails every
                # comparison): keep full precision.
                df[col] = df[col].astype(np.float64)
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches when min/max are NaN (empty column);
            # the column then keeps its current dtype.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def import_data(file, **kwargs):
    """Read a CSV into a DataFrame and shrink its memory footprint.

    Parameters
    ----------
    file : str or path-like or file-like
        Passed straight to ``pandas.read_csv``.
    **kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv``.
        ``parse_dates`` defaults to ``True`` but may be overridden here.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has been applied.
    """
    # setdefault lets callers pass their own parse_dates without a
    # duplicate-keyword TypeError. keep_date_col is intentionally not passed:
    # it only matters when parse_dates combines several columns, and pandas
    # has deprecated it.
    kwargs.setdefault('parse_dates', True)
    df = pd.read_csv(file, **kwargs)
    return reduce_mem_usage(df)

In [22]:
# Load both competition files the same way, using the `id` column as the index.
train, test = (
    import_data(csv_path, index_col='id')
    for csv_path in ('./data/train.csv', './data/test.csv')
)

Memory usage of dataframe is 523.17 MB
Memory usage after optimization is: 95.15 MB
Decreased by 81.8%
Memory usage of dataframe is 332.93 MB
Memory usage after optimization is: 61.46 MB
Decreased by 81.5%


<div style="background-color: blue; padding: 10px; border-radius: 10px">
    <h1 align="center"><font color='lightblue'>Preprocessing</font></h1>
</div>

In [4]:
# Every test column is a model feature; the three below are numeric and the
# rest are treated as categorical.
float_features = ['cap-diameter', 'stem-height', 'stem-width']
initial_features = test.columns.tolist()
cat_features = [name for name in initial_features if name not in float_features]

for feature in initial_features:
    if feature not in cat_features:
        dtype = np.float32
    else:
        # Build one shared category list from the values seen in train and
        # test so both frames use the same categorical dtype.
        dtype = pd.CategoricalDtype(
            categories=sorted(set(train[feature].dropna()).union(test[feature].dropna())),
            ordered=False,
        )
        print(f"{feature:30} {len(dtype.categories)}")
    train[feature] = train[feature].astype(dtype)
    test[feature] = test[feature].astype(dtype)

cap-shape                      108
cap-surface                    114
cap-color                      109
does-bruise-or-bleed           29
gill-attachment                117
gill-spacing                   66
gill-color                     86
stem-root                      45
stem-surface                   87
stem-color                     88
veil-type                      24
veil-color                     27
has-ring                       26
ring-type                      47
spore-print-color              43
habitat                        65
season                         4


<div style="background-color: blue; padding: 10px; border-radius: 10px">
    <h1 align="center"><font color='lightblue'>Cross-Validation</font></h1>
</div>

In [5]:
# Feature matrix: every training column except the target.
X = train.drop(columns='class')
# Target vector: class labels mapped to integers ('p' -> 0, 'e' -> 1).
y = train['class'].map({'p': 0, 'e': 1})

In [6]:
%%time

# Hyperparameters shared by the cross-validation loop and the final fit.
params_xgb = {

    'enable_categorical': True,
    'tree_method': 'hist',
    'device': 'cuda',
    'n_estimators': 360,
    'learning_rate': 0.1,
    'max_depth': 17,
    'colsample_bytree': 0.4,
    'min_child_weight': 2,
    'reg_lambda': 67,
    'subsample': 0.98,
    'num_parallel_tree': 4,
}

NUM_FOLDS = 5
val_scores = []

# Stratified, shuffled folds with a fixed seed so the split is repeatable.
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=1)

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):

    # skf.split yields positional indices, so select rows with .iloc on both
    # X and y. Plain y[...] would look the numbers up as index labels, which
    # only gives the right rows when the `id` index happens to be 0..n-1.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # A fresh model per fold so nothing carries over between folds.
    xgb = XGBClassifier(**params_xgb)

    xgb.fit(X_train, y_train)

    val_pred = xgb.predict(X_val)
    mcc = matthews_corrcoef(y_val, val_pred)
    print(f'Fold {fold}: MCC = {mcc:.5f}')
    val_scores.append(mcc)

print(f'Mean Validation MCC= {np.mean(val_scores):.5f}')
print(f'Standard Deviation Validation MCC= {np.std(val_scores):.5f}')

Fold 0: MCC = 0.98503
Fold 1: MCC = 0.98477
Fold 2: MCC = 0.98480
Fold 3: MCC = 0.98480
Fold 4: MCC = 0.98498
Mean Validation MCC= 0.98488
Standard Deviation Validation MCC= 0.00010
CPU times: user 11h 28min 26s, sys: 38.5 s, total: 11h 29min 4s
Wall time: 3h 26min 5s


In [23]:
# Refit on the full training set with the same hyperparameters, then predict
# the encoded class (0/1) for every test row.
xgb = XGBClassifier(**params_xgb).fit(X, y)
test_preds_xgb = xgb.predict(test)

In [15]:
# Persist the fitted model. The context manager closes the file handle even
# if pickling raises, instead of leaving it to the garbage collector.
with open('./model/model.pkl', 'wb') as model_file:
    pkl.dump(xgb, model_file)

In [24]:
# Drop the names of the two raw frames so their memory can be reclaimed.
del train
del test

<div style="background-color: blue; padding: 10px; border-radius: 10px">
    <h1 align="center"><font color='lightblue'>Submission</font></h1>
</div>

In [42]:
# Load the submission template; its `id` index labels the predictions.
sample_submission = import_data('./data/sample_submission.csv', index_col='id')
test_preds_series = pd.Series(data=test_preds_xgb, index=sample_submission.index)
# Decode the integer predictions back to class letters and replace the
# template's `class` column with them.
sample_submission = sample_submission.assign(
    **{'class': test_preds_series.map({0: 'p', 1: 'e'})}
)
sample_submission.head()

Memory usage of dataframe is 31.71 MB
Memory usage after optimization is: 17.84 MB
Decreased by 43.7%


Unnamed: 0_level_0,class
id,Unnamed: 1_level_1
3116945,p
3116946,p
3116947,p
3116948,p
3116949,p


In [44]:
# Write the submission file; the `id` index is written as the first column
# (to_csv includes the index by default).
sample_submission.to_csv('submission.csv')