In [1]:
import json

import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE, RandomOverSampler

from catboost import CatBoostClassifier, CatBoostRegressor
from soreva_metrics import calculate_metrics, macro_averaged_mean_absolute_error

In [2]:
symps = [
         'artifact',
         'calcified cyst',
         'calcified vessels',
         'calcinates_benign',
         'calcinates_malignant',
         'fibrocystic_breast_changes',
         'lymphonodus',
         'mass_benign',
         'mass_malignant',
         'nipple',
         'papilloma',
         'pectoral muscle',
         'skin_thickening',
         'other'
        ]

In [3]:
def extract_basic_features(breast):
    
    predictors = {}

    
# ---------------------------------
# Every model and view


    for key in ['patient_id', 'laterality', 'tissue_density_predicted', 'cancer_probability_predicted']:
        predictors[key] = breast[key]
    
    for view in ["CC", "MLO"]:
        for modl in [1, 2, 3]:
            for symp in symps:
                for agg in ['min', 'max', 'mean', 'median', 'sum', 'count']:
                    predictors[view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0
                    
    for view in ["CC", "MLO"]:
        for modl in [1, 2, 3]:
            for symp in symps:
                for agg in ['min', 'max', 'mean', 'median', 'sum']:
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0
                    
    for view in ["CC", "MLO"]:
        for modl in [1, 2, 3]:
            for symp in symps:
                for agg in ['min', 'max', 'mean', 'median', 'sum']:
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0

                
    for view in ["CC", "MLO"]:
        
        for modl in [1, 2, 3]:
        
            for symp in symps:
            
                objs_probs = [obj['probability'] for obj in breast[view] if symp in obj['object_type'] 
                              and obj['model_number'] == modl]
                
                objs_coords = [obj['coordinates'] for obj in breast[view] if symp in obj["object_type"]
                               and obj['model_number'] == modl]
            
                if objs_probs:
                    
                    predictors[view+'_'+str(modl)+'_'+symp+'_min'] = np.min(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_max'] = np.max(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_mean'] = np.mean(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_median'] = np.median(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_sum'] = np.sum(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_count'] = len(objs_probs)
                    
                    objs_squares = [(cord[2] - cord[0])*(cord[3] - cord[1]) for cord in objs_coords]
                    
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_min'] = np.min(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_max'] = np.max(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_mean'] = np.mean(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_median'] = np.median(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_sum'] = np.sum(objs_squares)
                    
                    objs_probs_squares = [sq*prob for sq, prob in zip(objs_squares, objs_probs)]
                    
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_min'] = np.min(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_max'] = np.max(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_mean'] = np.mean(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_median'] = np.median(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_sum'] = np.sum(objs_probs_squares)
                    
        
    return predictors

In [47]:
def extract_basic_features(breast):
    
    predictors = {}

    
# ---------------------------------
# Every model and view


    for key in ['patient_id', 'laterality', 'tissue_density_predicted', 'cancer_probability_predicted']:
        predictors[key] = breast[key]
    
    for view in ["CC", "MLO"]:
        for modl in [1, 2, 3]:
            for symp in symps:
                for agg in ['min', 'max', 'mean', 'median', 'sum', 'count']:
                    predictors[view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0
                    
    for view in ["CC", "MLO"]:
        for modl in [1, 2, 3]:
            for symp in symps:
                for agg in ['min', 'max', 'mean', 'median', 'sum']:
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0
                    
    for view in ["CC", "MLO"]:
        for modl in [1, 2, 3]:
            for symp in symps:
                for agg in ['min', 'max', 'mean', 'median', 'sum']:
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0
                    
#     for view in ["CC", "MLO"]:
#         for modl in [1, 2, 3]:
#             for symp in symps:
#                 for agg in ['min', 'max', 'mean', 'median']:
#                     predictors['xx_'+view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0
#                     predictors['yy_'+view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0

                
    for view in ["CC", "MLO"]:
        
        for modl in [1, 2, 3]:
        
            for symp in symps:
            
                objs_probs = [obj['probability'] for obj in breast[view] if symp in obj['object_type'] 
                              and obj['model_number'] == modl]
                
                objs_coords = [obj['coordinates'] for obj in breast[view] if symp in obj["object_type"]
                               and obj['model_number'] == modl]
            
                if objs_probs:
                    
                    predictors[view+'_'+str(modl)+'_'+symp+'_min'] = np.min(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_max'] = np.max(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_mean'] = np.mean(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_median'] = np.median(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_sum'] = np.sum(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_count'] = len(objs_probs)
                    
                    objs_squares = [(cord[2] - cord[0])*(cord[3] - cord[1]) for cord in objs_coords]
                    
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_min'] = np.min(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_max'] = np.max(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_mean'] = np.mean(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_median'] = np.median(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_sum'] = np.sum(objs_squares)
                    
                    objs_probs_squares = [sq*prob for sq, prob in zip(objs_squares, objs_probs)]
                    
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_min'] = np.min(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_max'] = np.max(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_mean'] = np.mean(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_median'] = np.median(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_sum'] = np.sum(objs_probs_squares)
    
# ----------------------------------
# All models


    for view in ["CC", "MLO"]:
        for symp in symps:
            for agg in ['min', 'max', 'mean', 'median', 'sum', 'count']:
                predictors[view+'_'+symp+'_'+agg] = 0.0
                
    for view in ["CC", "MLO"]:
        for symp in symps:
            for agg in ['min', 'max', 'mean', 'median']:
                predictors['square_'+view+'_'+symp+'_'+agg] = 0.0
                    
    for view in ["CC", "MLO"]:
        for symp in symps:
            for agg in ['min', 'max', 'mean', 'median', 'sum']:
                predictors['prob_square_'+view+'_'+symp+'_'+agg] = 0.0
                
    for view in ["CC", "MLO"]:
        
        for symp in symps:
        
            objs_probs = [obj['probability'] for obj in breast[view] if symp in obj['object_type']]
            
            objs_coords = [obj['coordinates'] for obj in breast[view] if symp in obj["object_type"]]
        
            if objs_probs:
                
                predictors[view+'_'+symp+'_min'] = np.min(objs_probs)
                predictors[view+'_'+symp+'_max'] = np.max(objs_probs)
                predictors[view+'_'+symp+'_mean'] = np.mean(objs_probs)
                predictors[view+'_'+symp+'_median'] = np.median(objs_probs)
                predictors[view+'_'+symp+'_sum'] = np.sum(objs_probs)
                predictors[view+'_'+symp+'_count'] = len(objs_probs)
                
                objs_squares = [(cord[2] - cord[0])*(cord[3] - cord[1]) for cord in objs_coords]
                
                predictors['square_'+view+'_'+symp+'_min'] = np.min(objs_squares)
                predictors['square_'+view+'_'+symp+'_max'] = np.max(objs_squares)
                predictors['square_'+view+'_'+symp+'_mean'] = np.mean(objs_squares)
                predictors['square_'+view+'_'+symp+'_median'] = np.median(objs_squares)
                
                objs_probs_squares = [cord*prob for cord, prob in zip(objs_squares, objs_probs)]
                
                predictors['prob_square_'+view+'_'+symp+'_min'] = np.min(objs_probs_squares)
                predictors['prob_square_'+view+'_'+symp+'_max'] = np.max(objs_probs_squares)
                predictors['prob_square_'+view+'_'+symp+'_mean'] = np.mean(objs_probs_squares)
                predictors['prob_square_'+view+'_'+symp+'_median'] = np.median(objs_probs_squares)
                predictors['prob_square_'+view+'_'+symp+'_sum'] = np.sum(objs_probs_squares)
                
# ----------------------------------
# All models and views


    for symp in symps:
        for agg in ['min', 'max', 'mean', 'median', 'sum', 'count']:
            predictors[symp+'_'+agg] = 0.0
                
    for symp in symps:
        for agg in ['min', 'max', 'mean', 'median']:
            predictors['square_'+symp+'_'+agg] = 0.0
                    
    for symp in symps:
        for agg in ['min', 'max', 'mean', 'median', 'sum']:
            predictors['prob_square_'+symp+'_'+agg] = 0.0
        
    for symp in symps:
        
        objs_probs = [obj['probability'] for obj in breast['CC'] if symp in obj['object_type']]
        [objs_probs.append(obj['probability']) for obj in breast['MLO'] if symp in obj['object_type']]
        
        objs_coords = [obj['coordinates'] for obj in breast['CC'] if symp in obj['object_type']]
        [objs_coords.append(obj['coordinates']) for obj in breast['MLO'] if symp in obj['object_type']]
    
        if objs_probs:
            
            predictors[symp+'_min'] = np.min(objs_probs)
            predictors[symp+'_max'] = np.max(objs_probs)
            predictors[symp+'_mean'] = np.mean(objs_probs)
            predictors[symp+'_median'] = np.median(objs_probs)
            predictors[symp+'_sum'] = np.sum(objs_probs)
            predictors[symp+'_count'] = len(objs_probs)
            
            objs_squares = [(cord[2] - cord[0])*(cord[3] - cord[1]) for cord in objs_coords]
            
            predictors['square_'+symp+'_min'] = np.min(objs_squares)
            predictors['square_'+symp+'_max'] = np.max(objs_squares)
            predictors['square_'+symp+'_mean'] = np.mean(objs_squares)
            predictors['square_'+symp+'_median'] = np.median(objs_squares)
            
            objs_probs_squares = [cord*prob for cord, prob in zip(objs_squares, objs_probs)]
            
            predictors['prob_square_'+symp+'_min'] = np.min(objs_probs_squares)
            predictors['prob_square_'+symp+'_max'] = np.max(objs_probs_squares)
            predictors['prob_square_'+symp+'_mean'] = np.mean(objs_probs_squares)
            predictors['prob_square_'+symp+'_median'] = np.median(objs_probs_squares)
            predictors['prob_square_'+symp+'_sum'] = np.sum(objs_probs_squares)
                
# ----------------------------------
# All models and views and symps


#     for agg in ['min', 'max', 'mean', 'median', 'sum', 'count']:
#         predictors[agg] = 0.0
                
#     for agg in ['min', 'max', 'mean', 'median']:
#         predictors['square_'+agg] = 0.0
                    
#     for agg in ['min', 'max', 'mean', 'median', 'sum']:
#         predictors['prob_square_'+agg] = 0.0
        
#     objs_probs = [obj['probability'] for obj in breast['CC']]
#     [objs_probs.append(obj['probability']) for obj in breast['MLO']]
    
#     objs_coords = [obj['coordinates'] for obj in breast['CC']]
#     [objs_coords.append(obj['coordinates']) for obj in breast['MLO']]
    
#     if objs_probs:
        
#         predictors['min'] = np.min(objs_probs)
#         predictors['max'] = np.max(objs_probs)
#         predictors['mean'] = np.mean(objs_probs)
#         predictors['median'] = np.median(objs_probs)
#         predictors['sum'] = np.sum(objs_probs)
#         predictors['count'] = len(objs_probs)
        
#         objs_squares = [(cord[2] - cord[0])*(cord[3] - cord[1]) for cord in objs_coords]
        
#         predictors['square_'+'min'] = np.min(objs_squares)
#         predictors['square_'+'max'] = np.max(objs_squares)
#         predictors['square_'+'mean'] = np.mean(objs_squares)
#         predictors['square_'+'median'] = np.median(objs_squares)
        
#         objs_probs_squares = [cord*prob for cord, prob in zip(objs_squares, objs_probs)]
        
#         predictors['prob_square_'+'min'] = np.min(objs_probs_squares)
#         predictors['prob_square_'+'max'] = np.max(objs_probs_squares)
#         predictors['prob_square_'+'mean'] = np.mean(objs_probs_squares)
#         predictors['prob_square_'+'median'] = np.median(objs_probs_squares)
#         predictors['prob_square_'+'sum'] = np.sum(objs_probs_squares)
        
    return predictors

In [62]:
with open("data_train/data_train.json", "r") as fin:
    data_train = json.load(fin)

targets_train = pd.read_csv("data_train/targets_train.csv", index_col=0)

predictors = {}
for key, value in tqdm(data_train.items()):
    predictors[key] = extract_basic_features(value)

df_train = pd.DataFrame.from_dict(predictors, orient="index")
df_train = pd.merge(df_train, targets_train, left_index=True, right_index=True)

df_train.loc[df_train.laterality == 'L', 'laterality'] = 0
df_train.loc[df_train.laterality == 'R', 'laterality'] = 1
df_train['laterality'] = df_train['laterality'].astype(int)

100%|██████████████████████████████████████| 4063/4063 [00:33<00:00, 122.87it/s]


In [63]:
# train_columns = df_train[df_train.columns[1:-1]].columns

In [64]:
# df_train[['another_'+x for x in train_columns]] = 0

# for i in tqdm(range(df_train.shape[0])):
    
#     pid = df_train.patient_id.iloc[i]
#     lat = df_train.laterality.iloc[i]
    
#     try:
    
#         df_train.loc[(df_train.patient_id == pid)
#                      &(df_train.laterality == lat), ['another_'+x for x in train_columns]] \
#         = df_train.loc[(df_train.patient_id == pid)
#                        &(df_train.laterality != lat), train_columns].iloc[0].values
        
#     except:
        
#         pass

In [65]:
df_train = df_train.drop(['patient_id'], axis=1)

In [66]:
with open("data_test/data_test.json", "r") as fin:
    data_test = json.load(fin)

predictors_test = {}
for key, value in tqdm(data_test.items()):
    predictors_test[key] = extract_basic_features(value)

df_test = pd.DataFrame.from_dict(predictors_test, orient="index")

df_test.loc[df_test.laterality == 'L', 'laterality'] = 0
df_test.loc[df_test.laterality == 'R', 'laterality'] = 1
df_test['laterality'] = df_test['laterality'].astype(int)

100%|██████████████████████████████████████| 4663/4663 [00:38<00:00, 121.95it/s]


In [67]:
df_test = df_test.drop(['patient_id'], axis=1)

In [8]:
# df_test[['another_'+x for x in train_columns]] = 0

# for i in tqdm(range(df_test.shape[0])):
    
#     pid = df_test.patient_id.iloc[i]
#     lat = df_test.laterality.iloc[i]
    
#     try:
    
#         df_test.loc[(df_test.patient_id == pid)
#                      &(df_test.laterality == lat), ['another_'+x for x in train_columns]] \
#         = df_test.loc[(df_test.patient_id == pid)
#                        &(df_test.laterality != lat), train_columns].iloc[0].values
        
#     except:
        
#         pass

In [68]:
sub_best = pd.read_csv('sub_18.csv')
df_test['BiRads'] = sub_best['BiRads'].values

In [134]:
X_test_train = df_test.drop(['BiRads'], axis=1).copy()
y_test_train = df_test['BiRads'].copy()
print(X_test_train.shape)
print(y_test_train.shape)

(4663, 1977)
(4663,)


In [135]:
X = df_train.drop(['BiRads'], axis=1)
y = df_train['BiRads']
print(X.shape)
print(y.shape)

(4063, 1977)
(4063,)


In [136]:
X = pd.concat([X, X_test_train], ignore_index=True)
y = pd.concat([y, y_test_train], ignore_index=True)
print(X.shape)
print(y.shape)

(8726, 1977)
(8726,)


In [76]:
# X = df_train.drop(['BiRads'], axis=1)
# y = df_train['BiRads']
# print(X.shape)
# print(y.shape)

In [77]:
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.25)

In [78]:
y.reset_index().groupby('BiRads').index.count()

BiRads
1    2427
2    3594
3    1138
4    1347
5     220
Name: index, dtype: int64

In [79]:
# model = GradientBoostingClassifier()
# model.fit(X_train, y_train)

# # ExtraTreesClassifier, GradientBoostingClassifier, SVC

In [80]:
model = CatBoostClassifier(loss_function='MultiClass',
#                            auto_class_weights='Balanced',
#                            l2_leaf_reg=5, iterations=2000,
#                            class_weights=[1, 1, 1.5, 1.5, 1.5],
#                            random_strength=10, bagging_temperature=10,
                           early_stopping_rounds=250, verbose=100)

model.fit(X_train, y_train, eval_set=(X_val, y_val),
          cat_features=['laterality', 
#                         'another_laterality'
                       ]
         )

Learning rate set to 0.11211
0:	learn: 1.5272254	test: 1.5321866	best: 1.5321866 (0)	total: 875ms	remaining: 14m 34s
100:	learn: 0.8108903	test: 1.0099800	best: 1.0099800 (100)	total: 22.2s	remaining: 3m 17s
200:	learn: 0.6439491	test: 0.9859121	best: 0.9859121 (200)	total: 44.1s	remaining: 2m 55s
300:	learn: 0.5306945	test: 0.9784904	best: 0.9778257 (283)	total: 1m 6s	remaining: 2m 33s
400:	learn: 0.4454548	test: 0.9762596	best: 0.9761281 (369)	total: 1m 26s	remaining: 2m 9s
500:	learn: 0.3765231	test: 0.9782063	best: 0.9761281 (369)	total: 1m 47s	remaining: 1m 47s
600:	learn: 0.3220637	test: 0.9804570	best: 0.9761281 (369)	total: 2m 8s	remaining: 1m 25s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 0.9761281019
bestIteration = 369

Shrink model to first 370 iterations.


<catboost.core.CatBoostClassifier at 0x1cf0b3e20>

In [137]:
models = []

for train_index, test_index in StratifiedKFold(n_splits=5).split(X, y):
    
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    
    X_val, y_val = X.iloc[test_index], y.iloc[test_index]
    
    model = CatBoostClassifier(loss_function='MultiClass',
                               early_stopping_rounds=250,
                               verbose=100)

    model.fit(X_train, y_train, eval_set=(X_val, y_val),
              cat_features=['laterality'])
    
    print(calculate_metrics(y_val.values, model.predict(X_val)))
    
    models.append(model)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.112255
0:	learn: 1.5194657	test: 1.5414109	best: 1.5414109 (0)	total: 457ms	remaining: 7m 36s
100:	learn: 0.7718494	test: 1.1620439	best: 1.1617769 (97)	total: 21.6s	remaining: 3m 12s
200:	learn: 0.6145409	test: 1.1604287	best: 1.1593271 (163)	total: 43.1s	remaining: 2m 51s
300:	learn: 0.5049591	test: 1.1691391	best: 1.1593271 (163)	total: 1m 5s	remaining: 2m 32s
400:	learn: 0.4264132	test: 1.1750044	best: 1.1593271 (163)	total: 1m 29s	remaining: 2m 13s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 1.159327142
bestIteration = 163

Shrink model to first 164 iterations.
0.11077184010625116
Learning rate set to 0.112255
0:	learn: 1.5124835	test: 1.5404678	best: 1.5404678 (0)	total: 401ms	remaining: 6m 40s
100:	learn: 0.7816704	test: 1.1292742	best: 1.1288509 (99)	total: 23.5s	remaining: 3m 28s
200:	learn: 0.6196086	test: 1.1145031	best: 1.1142724 (186)	total: 45.6s	remaining: 3m 1s
300:	learn: 0.5088359	test: 1.1105940	best: 1.1068042 (269)	tota

In [93]:
# models = []

# for i in range(5):
    
#     test_temp = df_test.sample(2000)
    
#     X = df_train.drop(['patient_id', 'BiRads'], axis=1)
#     y = df_train['BiRads']
    
#     X = pd.concat([X, test_temp.drop(['patient_id', 'BiRads'], axis=1)], ignore_index=True)
#     y = pd.concat([y, test_temp['BiRads']], ignore_index=True)
#     print(X.shape)
#     print(y.shape)
    
#     model = CatBoostClassifier(loss_function='MultiClass',
#                                early_stopping_rounds=250,
#                                verbose=100)

#     model.fit(X_train, y_train, eval_set=(X_val, y_val),
#               cat_features=['laterality'])
    
#     models.append(model)

In [138]:
df_test.shape

(4663, 1978)

In [139]:
preds = np.zeros((X_val.shape[0], 5))

for model in models:
    
    preds += model.predict_proba(X_val) / 5

In [140]:
# preds = model.predict_proba(X_val)

In [141]:
preds[:, 4].max()

0.9729551067541522

In [142]:
preds_ = []

for pred in preds:
    
    max_pred = np.where(pred == pred.max())[0][0]
    
    if pred[4] > 0.005:
        preds_.append(5)
    
    elif pred[3] > 0.01:
        preds_.append(4)
    
    elif pred[2] > 0.015:
        preds_.append(3)
    
    else:
        preds_.append(max_pred+1)

In [143]:
calculate_metrics(y_val.values, np.array(preds_)) + 0.25*macro_averaged_mean_absolute_error(y_val.values, np.array(preds_))

0.4329156247610729

In [144]:
0.25*macro_averaged_mean_absolute_error(y_val.values, np.array(preds_))

0.22253386534882363

In [145]:
calculate_metrics(y_val.values, np.array(preds_))

0.2103817594122493

In [146]:
calculate_metrics(y_val.values, model.predict(X_val))

0.29280597517400203

In [147]:
preds = np.zeros((df_test.shape[0], 5))

for model in models:
    preds += model.predict_proba(df_test.drop(['BiRads'], axis=1)) / len(models)

In [148]:
# preds = model.predict_proba(df_test.drop(['BiRads'], axis=1))

In [149]:
# for i in range(5):
#     preds[:, i] = (preds[:, i] - preds[:, i].min()) / (preds[:, i].max() - preds[:, i].min())

In [150]:
preds[:, 4].max()

0.9729551067541522

In [152]:
preds_ = []

for pred in preds:
    
    max_pred = np.where(pred == pred.max())[0][0]
    
    if pred[4] > 0.5:
        preds_.append(5)
    
    elif pred[3] > 0.5:
        preds_.append(4)
    
    elif pred[2] > 0.5:
        preds_.append(3)
        
    elif pred[1] > 0.5:
        preds_.append(2)
    
    elif pred[0] > 0.5:
        preds_.append(1)
    
    else:
#         preds_.append(3)
        preds_.append(max_pred+1)

In [153]:
# preds_ = model.predict(df_test.drop(['BiRads'], axis=1))

In [154]:
sub = pd.read_csv('sample_submit.csv')

In [155]:
sub['BiRads'] = preds_
sub['best'] = pd.read_csv('sub_18.csv')['BiRads']

In [156]:
sub.groupby('BiRads').id.count()

BiRads
1     989
2    1389
3     825
4    1249
5     211
Name: id, dtype: int64

In [157]:
sub.groupby('best').id.count()

best
1     937
2    1175
3    1034
4    1303
5     214
Name: id, dtype: int64

In [158]:
sub[sub.BiRads != sub.best].shape

(314, 3)

In [159]:
sub[['id', 'BiRads']].to_csv('sub_50.csv', index=False)

In [161]:
# fimp = np.zeros((df_train.shape[1]-1))

# for model in models:
    
#     fimp += model.get_feature_importance() / 5

In [806]:
feature_importance = pd.DataFrame()
feature_importance['feature'] = X.columns
feature_importance['importance'] = model.get_feature_importance()
# feature_importance['importance'] = fimp

In [810]:
feature_importance[feature_importance.importance > 0.0001].sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
1154,prob_square_MLO_1_calcinates_benign_max,0.719360
737,square_MLO_1_calcinates_benign_sum,0.716224
3,cancer_probability_predicted,0.698121
1083,prob_square_CC_3_calcinates_benign_min,0.584782
388,MLO_2_mass_malignant_min,0.569182
...,...,...
51,CC_1_mass_benign_count,0.004604
761,square_MLO_1_mass_malignant_median,0.004178
74,CC_1_pectoral muscle_sum,0.003234
752,square_MLO_1_lymphonodus_sum,0.001858


In [None]:
def feature_type(feature):
    
    if 'square' in feature:
        return 'square'
    elif 'prob_square'

In [None]:
feature_importance['agg'] = feature_importance.feature.apply(lambda x: x.split('_')[-1])
feature_importance['type'] = feature_importance.feature.apply(lambda x: feature_type(x))

In [502]:
# feature_importance.sort_values('importance', ascending=False).iloc[:50]

In [811]:
X_feat = X[feature_importance[feature_importance.importance > 0.0001].feature.values]
y_feat = y

In [812]:
X_train, X_val, y_train, y_val = train_test_split(X_feat, y_feat, stratify=y_feat, test_size=0.15)

In [813]:
model = CatBoostClassifier(loss_function='MultiClass', early_stopping_rounds=250, verbose=100)

model.fit(X_train, y_train, eval_set=(X_val, y_val))

Learning rate set to 0.110686
0:	learn: 1.4641425	test: 1.4705180	best: 1.4705180 (0)	total: 261ms	remaining: 4m 20s
100:	learn: 0.5180438	test: 0.7306986	best: 0.7297976 (98)	total: 12.7s	remaining: 1m 53s
200:	learn: 0.3542670	test: 0.7336471	best: 0.7277790 (115)	total: 25.6s	remaining: 1m 41s
300:	learn: 0.2518778	test: 0.7358142	best: 0.7277790 (115)	total: 37.3s	remaining: 1m 26s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 0.7277789616
bestIteration = 115

Shrink model to first 116 iterations.


<catboost.core.CatBoostClassifier at 0x1b408fe50>

In [814]:
# models = []

# for train_index, test_index in StratifiedKFold(n_splits=5).split(X, y):
    
#     X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    
#     X_val, y_val = X.iloc[test_index], y.iloc[test_index]
    
#     model = CatBoostClassifier(loss_function='MultiClass',
#                                early_stopping_rounds=250,
#                                verbose=False)

#     model.fit(X_train, y_train, eval_set=(X_val, y_val),
# #               cat_features=['laterality']
#              )
    
#     print(calculate_metrics(y_val, model.predict(X_val)))
    
#     models.append(model)

In [178]:
# preds = np.zeros((df_test.shape[0], 5))

# for model in models:
#     preds += model.predict_proba(df_test[feature_importance[feature_importance.importance > 0.05].feature.values]) / len(models)

In [839]:
preds = model.predict_proba(df_test[feature_importance[feature_importance.importance > 0.0001].feature.values])

preds_ = []

for pred in preds:
    
    max_pred = np.where(pred == pred.max())[0][0]
    
    if pred[4] > 0.005:
        preds_.append(5)
    
    elif pred[3] > 0.01:
        preds_.append(4)
    
    elif pred[2] > 0.015:
        preds_.append(3)
    
    else:
        preds_.append(max_pred+1)

In [840]:
sub = pd.read_csv('sample_submit.csv')

In [841]:
# sub['BiRads'] = preds_
sub['BiRads'] = preds_
sub['best'] = pd.read_csv('sub_18.csv')['BiRads']

In [842]:
sub.groupby('BiRads').id.count()

BiRads
1     795
2    1001
3    1362
4    1278
5     227
Name: id, dtype: int64

In [843]:
sub.groupby('best').id.count()

best
1     937
2    1175
3    1034
4    1303
5     214
Name: id, dtype: int64

In [844]:
sub[['id', 'BiRads']].to_csv('sub_30.csv', index=False)

In [845]:
sub[sub.BiRads != sub.best]

Unnamed: 0,id,BiRads,best
0,8097d218-2c51-4ec8-9ee8-ea6b7701ef3c,3,4
1,4c9a6bed-a454-467c-b51e-451f5ee2db35,4,3
7,f572edaa-93ce-4489-80e2-0fad155e5851,2,1
9,6aba217f-bbaf-4368-9b30-64b481177c80,4,3
10,9a13ff22-f414-457b-8e01-e11adea327a9,3,4
...,...,...,...
4653,a71181a4-a11f-49cb-b997-bacbc65123f3,3,4
4656,a83040a4-6949-4915-a863-1a185de4551a,3,2
4658,67db950f-8a39-4594-82a4-6b22db8afb76,4,5
4660,eb0a4b02-1d68-4b62-bcaa-1825ce5804d2,3,4


In [12]:
ress = [0.2661, 0.1815, 0.2104, 0.2241, 
        0.2009, 0.2309, 0.2031, 0.2043]

In [13]:
sub = pd.read_csv('sample_submit.csv')

In [15]:
preds = np.zeros((sub.shape[0], 5))

for i in range(18, 25):

    pred_temp = pd.read_csv('sub_{}.csv'.format(i))['BiRads']
    sub[i] = pred_temp
    
#     for j, pr in enumerate(pred_temp):
        
#         preds[j, pr-1] += ress[i-11]

In [22]:
sub[(sub[18] == sub[19])&(sub[18] == sub[20])
    &(sub[18] == sub[21])&(sub[18] == sub[22])
    &(sub[18] == sub[23])&(sub[18] == sub[24])][['id', 'BiRads']].to_csv('train_from_test.csv', index=False)

In [376]:
preds_ = []

for pred in preds:
    
    preds_.append(np.where(pred == pred.max())[0][0] + 1)

In [377]:
sub['BiRads'] = preds_
sub['best'] = pd.read_csv('sub_18.csv')['BiRads']

In [378]:
sub[sub.BiRads != sub.best]

Unnamed: 0,id,BiRads,best
0,8097d218-2c51-4ec8-9ee8-ea6b7701ef3c,2,4
1,4c9a6bed-a454-467c-b51e-451f5ee2db35,4,3
2,2737373f-e0ce-4d58-8d0a-0b85925d1783,4,3
9,6aba217f-bbaf-4368-9b30-64b481177c80,4,3
14,d3b1b825-ea24-4366-afe2-d69b8455f981,3,2
...,...,...,...
4651,be19bb09-a4f3-4d81-abf9-d037e5749380,3,2
4652,7a78f147-851a-472e-b447-7a5add369557,1,3
4656,a83040a4-6949-4915-a863-1a185de4551a,1,2
4661,061be2ba-9c5d-4c19-815c-deb227a9cc68,1,3


In [379]:
sub[['id', 'BiRads']].to_csv('sub_27.csv', index=False)