In [1]:
import json

import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import *

from imblearn.over_sampling import SMOTE, RandomOverSampler

from catboost import CatBoostClassifier, CatBoostRegressor
from soreva_metrics import calculate_metrics, macro_averaged_mean_absolute_error

In [2]:
symps = [
         'artifact',
         'calcified cyst',
         'calcified vessels',
         'calcinates_benign',
         'calcinates_malignant',
         'fibrocystic_breast_changes',
         'lymphonodus',
         'mass_benign',
         'mass_malignant',
         'nipple',
         'papilloma',
         'pectoral muscle',
         'skin_thickening',
         'other'
        ]

In [3]:
def extract_basic_features(breast):
    
    predictors = {}

    
# ---------------------------------
# Every model and view


    for key in ['patient_id', 'laterality', 'tissue_density_predicted', 'cancer_probability_predicted']:
        predictors[key] = breast[key]
    
    for view in ["CC", "MLO"]:
        for modl in [1, 2, 3]:
            for symp in symps:
                for agg in ['min', 'max', 'mean', 'median', 'sum', 'count']:
                    predictors[view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0
                    
    for view in ["CC", "MLO"]:
        for modl in [1, 2, 3]:
            for symp in symps:
                for agg in ['min', 'max', 'mean', 'median', 'sum']:
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0
                    
    for view in ["CC", "MLO"]:
        for modl in [1, 2, 3]:
            for symp in symps:
                for agg in ['min', 'max', 'mean', 'median', 'sum']:
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0
                    
#     for view in ["CC", "MLO"]:
#         for modl in [1, 2, 3]:
#             for symp in symps:
#                 for agg in ['min', 'max', 'mean', 'median']:
#                     predictors['xx_'+view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0
#                     predictors['yy_'+view+'_'+str(modl)+'_'+symp+'_'+agg] = 0.0

                
    for view in ["CC", "MLO"]:
        
        for modl in [1, 2, 3]:
        
            for symp in symps:
            
                objs_probs = [obj['probability'] for obj in breast[view] if symp in obj['object_type'] 
                              and obj['model_number'] == modl]
                
                objs_coords = [obj['coordinates'] for obj in breast[view] if symp in obj["object_type"]
                               and obj['model_number'] == modl]
            
                if objs_probs:
                    
                    predictors[view+'_'+str(modl)+'_'+symp+'_min'] = np.min(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_max'] = np.max(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_mean'] = np.mean(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_median'] = np.median(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_sum'] = np.sum(objs_probs)
                    predictors[view+'_'+str(modl)+'_'+symp+'_count'] = len(objs_probs)
                    
                    objs_squares = [(cord[2] - cord[0])*(cord[3] - cord[1]) for cord in objs_coords]
                    
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_min'] = np.min(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_max'] = np.max(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_mean'] = np.mean(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_median'] = np.median(objs_squares)
                    predictors['square_'+view+'_'+str(modl)+'_'+symp+'_sum'] = np.sum(objs_squares)
                    
                    objs_probs_squares = [sq*prob for sq, prob in zip(objs_squares, objs_probs)]
                    
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_min'] = np.min(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_max'] = np.max(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_mean'] = np.mean(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_median'] = np.median(objs_probs_squares)
                    predictors['prob_square_'+view+'_'+str(modl)+'_'+symp+'_sum'] = np.sum(objs_probs_squares)
                    
#                     objs_xx = [(cord[2] + cord[0]) / 2 for cord in objs_coords]
                    
#                     predictors['xx_'+view+'_'+str(modl)+'_'+symp+'_min'] = np.min(objs_xx)
#                     predictors['xx_'+view+'_'+str(modl)+'_'+symp+'_max'] = np.max(objs_xx)
#                     predictors['xx_'+view+'_'+str(modl)+'_'+symp+'_mean'] = np.mean(objs_xx)
#                     predictors['xx_'+view+'_'+str(modl)+'_'+symp+'_median'] = np.median(objs_xx)
                    
#                     objs_yy = [(cord[3] + cord[1]) / 2 for cord in objs_coords]
                    
#                     predictors['yy_'+view+'_'+str(modl)+'_'+symp+'_min'] = np.min(objs_yy)
#                     predictors['yy_'+view+'_'+str(modl)+'_'+symp+'_max'] = np.max(objs_yy)
#                     predictors['yy_'+view+'_'+str(modl)+'_'+symp+'_mean'] = np.mean(objs_yy)
#                     predictors['yy_'+view+'_'+str(modl)+'_'+symp+'_median'] = np.median(objs_yy)
  
    return predictors

In [4]:
with open("data_train/data_train.json", "r") as fin:
    data_train = json.load(fin)

targets_train = pd.read_csv("data_train/targets_train.csv", index_col=0)

predictors = {}
for key, value in tqdm(data_train.items()):
    predictors[key] = extract_basic_features(value)

df_train = pd.DataFrame.from_dict(predictors, orient="index")
df_train = pd.merge(df_train, targets_train, left_index=True, right_index=True)

df_train.loc[df_train.laterality == 'L', 'laterality'] = 0
df_train.loc[df_train.laterality == 'R', 'laterality'] = 1
df_train['laterality'] = df_train['laterality'].astype(int)

100%|██████████████████████████████████████| 4063/4063 [00:17<00:00, 228.18it/s]


In [5]:
# 1 - маммограммы без патологии
# 2 - маммограммы с доброкачественным изменениями

# 3 - маммограммы с доброкачественными изменениями, с подозрением на рак не более 2%

# 4 - маммограммы с подозрением на злокачественные изменения (от 2% до 94%)
# 5 - маммограммы с подозрением на злокачественные изменения (более 95%)

In [203]:
y_transformed = df_train.BiRads.copy()
# 1 - withouth pathology 
# 2 - with pathology 
y_transformed[y_transformed == 2] = 1
y_transformed[y_transformed == 3] = 3
y_transformed[y_transformed == 4] = 3
y_transformed[y_transformed == 5] = 3

In [204]:
df_train.groupby('BiRads').laterality.count()

BiRads
1    1490
2    2419
3     104
4      44
5       6
Name: laterality, dtype: int64

In [205]:
y_transformed.reset_index().groupby('BiRads').index.count()

BiRads
1    3909
3     154
Name: index, dtype: int64

In [206]:
X = df_train.drop(['BiRads',
                   'patient_id',
#                   'laterality'
                  ], axis=1)
# y = df_train['BiRads']
y = y_transformed

In [207]:
X.shape

(4063, 1347)

In [208]:
X_train, X_val, y_train, y_val = train_test_split(X, y.values, 
                                                  stratify=y, 
                                                  test_size=0.1)

In [209]:
model = CatBoostClassifier(early_stopping_rounds=250, verbose=100)

model.fit(X_train, y_train, eval_set=(X_val, y_val),
          cat_features=['laterality']
         )

Learning rate set to 0.043649
0:	learn: 0.6261728	test: 0.6254611	best: 0.6254611 (0)	total: 99.3ms	remaining: 1m 39s
100:	learn: 0.0906458	test: 0.1389621	best: 0.1386544 (91)	total: 3.15s	remaining: 28.1s
200:	learn: 0.0652911	test: 0.1363719	best: 0.1362558 (138)	total: 6.05s	remaining: 24.1s
300:	learn: 0.0483133	test: 0.1369281	best: 0.1358874 (253)	total: 8.97s	remaining: 20.8s
400:	learn: 0.0365013	test: 0.1375287	best: 0.1358874 (253)	total: 12.1s	remaining: 18s
500:	learn: 0.0286055	test: 0.1383740	best: 0.1358874 (253)	total: 15s	remaining: 15s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 0.135887428
bestIteration = 253

Shrink model to first 254 iterations.


<catboost.core.CatBoostClassifier at 0x19af99220>

In [210]:
with open("data_test/data_test.json", "r") as fin:
    data_test = json.load(fin)

predictors_test = {}
for key, value in tqdm(data_test.items()):
    predictors_test[key] = extract_basic_features(value)

df_test = pd.DataFrame.from_dict(predictors_test, orient="index")

df_test.loc[df_test.laterality == 'L', 'laterality'] = 0
df_test.loc[df_test.laterality == 'R', 'laterality'] = 1
df_test['laterality'] = df_test['laterality'].astype(int)

100%|██████████████████████████████████████| 4663/4663 [00:21<00:00, 220.04it/s]


In [211]:
df_test = df_test.drop(['patient_id'], axis=1)

In [212]:
preds = model.predict_proba(df_test)

In [213]:
preds[:, 1].max()

0.7184460143079641

In [214]:
preds_ = []

for pred in preds:
    
    max_pred = np.where(pred == pred.max())[0][0]
    
    if pred[1] > 0.021:
        preds_.append(3)
        
    else:
        preds_.append(1)

In [215]:
# preds_ = model.predict(df_test)

In [216]:
sub = pd.read_csv('sample_submit.csv')

In [217]:
sub['BiRads'] = preds_
sub['best'] = pd.read_csv('sub_18.csv')['BiRads']

In [218]:
sub.groupby('BiRads').id.count()

BiRads
1    2727
3    1936
Name: id, dtype: int64

In [219]:
sub.groupby('best').id.count()

best
1     937
2    1175
3    1034
4    1303
5     214
Name: id, dtype: int64

In [220]:
y_transformed = df_train.BiRads.copy()
y_transformed = y_transformed
# 2 - benign tumor
# 4 - malignant tumor
y_transformed[y_transformed == 1] = 1
y_transformed[y_transformed == 2] = 2
y_transformed[y_transformed == 3] = 3
y_transformed[y_transformed == 4] = 4
y_transformed[y_transformed == 5] = 5

In [221]:
df_train.groupby('BiRads').laterality.count()

BiRads
1    1490
2    2419
3     104
4      44
5       6
Name: laterality, dtype: int64

In [222]:
y_transformed.reset_index().groupby('BiRads').index.count()

BiRads
1    1490
2    2419
3     104
4      44
5       6
Name: index, dtype: int64

In [223]:
X = df_train[df_train.BiRads < 3].drop(['BiRads', 'patient_id'], axis=1)

y = y_transformed[y_transformed < 3]

In [224]:
X.shape

(3909, 1347)

In [225]:
X_train, X_val, y_train, y_val = train_test_split(X, y.values, 
                                                  stratify=y, 
                                                  test_size=0.1)

In [226]:
model = CatBoostClassifier(early_stopping_rounds=250, verbose=100)

model.fit(X_train, y_train, eval_set=(X_val, y_val),
          cat_features=['laterality']
         )

Learning rate set to 0.043236
0:	learn: 0.6860115	test: 0.6862585	best: 0.6862585 (0)	total: 55.9ms	remaining: 55.8s
100:	learn: 0.4979934	test: 0.5723963	best: 0.5723865 (90)	total: 3.15s	remaining: 28s
200:	learn: 0.4227118	test: 0.5677072	best: 0.5658508 (189)	total: 5.94s	remaining: 23.6s
300:	learn: 0.3488134	test: 0.5700980	best: 0.5658508 (189)	total: 8.71s	remaining: 20.2s
400:	learn: 0.2848293	test: 0.5696200	best: 0.5658508 (189)	total: 11.7s	remaining: 17.4s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 0.5658508207
bestIteration = 189

Shrink model to first 190 iterations.


<catboost.core.CatBoostClassifier at 0x1cd874970>

In [227]:
preds = model.predict_proba(df_test.iloc[sub[sub.BiRads < 3].index])

In [228]:
preds[:, 1].max()

0.960959801066722

In [229]:
preds_ = []

for pred in preds:
    
    max_pred = np.where(pred == pred.max())[0][0]
    
    if pred[1] > 0.5:
        preds_.append(2)
        
    else:
        preds_.append(1)

In [230]:
# preds_ = model.predict(df_test)

In [231]:
# sub = pd.read_csv('sample_submit.csv')

In [232]:
sub.loc[sub.BiRads < 3, 'BiRads'] = preds_
# sub2['best'] = pd.read_csv('sub_18.csv')['BiRads']

In [233]:
sub.groupby('BiRads').id.count()

BiRads
1     944
2    1783
3    1936
Name: id, dtype: int64

In [234]:
sub.groupby('best').id.count()

best
1     937
2    1175
3    1034
4    1303
5     214
Name: id, dtype: int64

In [235]:
y_transformed = df_train.BiRads.copy()
y_transformed = y_transformed
# 2 - benign tumor
# 4 - malignant tumor
y_transformed[y_transformed == 1] = 1
y_transformed[y_transformed == 2] = 2
y_transformed[y_transformed == 3] = 3
y_transformed[y_transformed == 4] = 4
y_transformed[y_transformed == 5] = 4

In [236]:
df_train.groupby('BiRads').laterality.count()

BiRads
1    1490
2    2419
3     104
4      44
5       6
Name: laterality, dtype: int64

In [237]:
y_transformed.reset_index().groupby('BiRads').index.count()

BiRads
1    1490
2    2419
3     104
4      50
Name: index, dtype: int64

In [238]:
X = df_train[df_train.BiRads > 2].drop(['BiRads', 'patient_id'], axis=1)

y = y_transformed[y_transformed > 2]

In [239]:
X.shape

(154, 1347)

In [240]:
X_train, X_val, y_train, y_val = train_test_split(X, y.values, 
                                                  stratify=y, 
                                                  test_size=0.1)

In [241]:
model = CatBoostClassifier(early_stopping_rounds=250, verbose=100)

model.fit(X_train, y_train, eval_set=(X_val, y_val),
          cat_features=['laterality']
         )

Learning rate set to 0.019429
0:	learn: 0.6851329	test: 0.6928283	best: 0.6928283 (0)	total: 17.4ms	remaining: 17.4s
100:	learn: 0.2678772	test: 0.5503681	best: 0.5503681 (100)	total: 1.28s	remaining: 11.4s
200:	learn: 0.1339954	test: 0.5364962	best: 0.5324546 (142)	total: 2.57s	remaining: 10.2s
300:	learn: 0.0685725	test: 0.5346941	best: 0.5324183 (276)	total: 3.87s	remaining: 8.98s
400:	learn: 0.0354240	test: 0.5170020	best: 0.5138733 (394)	total: 5.21s	remaining: 7.78s
500:	learn: 0.0210467	test: 0.5224427	best: 0.5138733 (394)	total: 6.48s	remaining: 6.46s
600:	learn: 0.0144250	test: 0.5379256	best: 0.5138733 (394)	total: 7.82s	remaining: 5.19s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 0.5138732623
bestIteration = 394

Shrink model to first 395 iterations.


<catboost.core.CatBoostClassifier at 0x1cd65a820>

In [242]:
preds = model.predict_proba(df_test.iloc[sub[sub.BiRads > 2].index])

In [243]:
preds[:, 1].max()

0.7630884105623508

In [244]:
preds_ = []

for pred in preds:
    
    max_pred = np.where(pred == pred.max())[0][0]
    
    if pred[1] > 0.3:
        preds_.append(4)
        
    else:
        preds_.append(3)

In [245]:
# preds_ = model.predict(df_test)

In [246]:
# sub = pd.read_csv('sample_submit.csv')

In [247]:
sub.loc[sub.BiRads > 2, 'BiRads'] = preds_
# sub2['best'] = pd.read_csv('sub_18.csv')['BiRads']

In [248]:
sub.groupby('BiRads').id.count()

BiRads
1     944
2    1783
3    1484
4     452
Name: id, dtype: int64

In [249]:
sub.groupby('best').id.count()

best
1     937
2    1175
3    1034
4    1303
5     214
Name: id, dtype: int64

In [250]:
y_transformed = df_train.BiRads.copy()
y_transformed = y_transformed
# 2 - benign tumor
# 4 - malignant tumor
y_transformed[y_transformed == 1] = 1
y_transformed[y_transformed == 2] = 2
y_transformed[y_transformed == 3] = 3
y_transformed[y_transformed == 4] = 4
y_transformed[y_transformed == 5] = 5

In [251]:
df_train.groupby('BiRads').laterality.count()

BiRads
1    1490
2    2419
3     104
4      44
5       6
Name: laterality, dtype: int64

In [252]:
y_transformed.reset_index().groupby('BiRads').index.count()

BiRads
1    1490
2    2419
3     104
4      44
5       6
Name: index, dtype: int64

In [253]:
X = df_train[df_train.BiRads > 3].drop(['BiRads', 'patient_id'], axis=1)

y = y_transformed[y_transformed > 3]

In [254]:
X.shape

(50, 1347)

In [255]:
X_train, X_val, y_train, y_val = train_test_split(X, y.values, 
                                                  stratify=y, 
                                                  test_size=0.1)

In [256]:
model = CatBoostClassifier(early_stopping_rounds=250, verbose=100)

model.fit(X_train, y_train, eval_set=(X_val, y_val),
          cat_features=['laterality']
         )

Learning rate set to 0.014732
0:	learn: 0.6810908	test: 0.6910016	best: 0.6910016 (0)	total: 6.98ms	remaining: 6.97s
100:	learn: 0.1691429	test: 0.5547019	best: 0.5471471 (91)	total: 834ms	remaining: 7.42s
200:	learn: 0.0688363	test: 0.6214378	best: 0.5471471 (91)	total: 1.61s	remaining: 6.4s
300:	learn: 0.0351085	test: 0.7104491	best: 0.5471471 (91)	total: 2.27s	remaining: 5.28s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 0.5471470568
bestIteration = 91

Shrink model to first 92 iterations.


<catboost.core.CatBoostClassifier at 0x1bfa03490>

In [257]:
preds = model.predict_proba(df_test.iloc[sub[sub.BiRads > 3].index])

In [258]:
preds[:, 1].max()

0.37288519076337967

In [259]:
preds_ = []

for pred in preds:
    
    max_pred = np.where(pred == pred.max())[0][0]
    
    if pred[1] > 0.2:
        preds_.append(5)
        
    else:
        preds_.append(4)

In [260]:
# preds_ = model.predict(df_test)

In [261]:
# sub = pd.read_csv('sample_submit.csv')

In [262]:
sub.loc[sub.BiRads > 3, 'BiRads'] = preds_
# sub2['best'] = pd.read_csv('sub_18.csv')['BiRads']

In [263]:
sub.groupby('BiRads').id.count()

BiRads
1     944
2    1783
3    1484
4     170
5     282
Name: id, dtype: int64

In [264]:
sub.groupby('best').id.count()

best
1     937
2    1175
3    1034
4    1303
5     214
Name: id, dtype: int64

In [267]:
sub[['id', 'BiRads']].to_csv('sub_48.csv', index=False)

In [624]:
df_train[df_train.BiRads > 3].groupby('BiRads').laterality.count()

BiRads
2    2419
3     104
4      44
5       6
Name: laterality, dtype: int64

In [625]:
X = df_train[(df_train.BiRads == 2)|(df_train.BiRads == 3)].drop(['BiRads'], axis=1)

y = df_train[(df_train.BiRads == 2)|(df_train.BiRads == 3)].BiRads.copy()

In [626]:
X.shape

(2523, 1348)

In [627]:
X_train, X_val, y_train, y_val = train_test_split(X, y.values, 
                                                  stratify=y, 
                                                  test_size=0.15)

In [628]:
model = CatBoostClassifier(early_stopping_rounds=250, verbose=100)

model.fit(X_train, y_train, eval_set=(X_val, y_val),
          cat_features=['laterality']
         )

Learning rate set to 0.038258
0:	learn: 0.6358309	test: 0.6352425	best: 0.6352425 (0)	total: 50.8ms	remaining: 50.7s
100:	learn: 0.0946021	test: 0.1569576	best: 0.1555540 (86)	total: 2.97s	remaining: 26.5s
200:	learn: 0.0591299	test: 0.1555772	best: 0.1554653 (197)	total: 5.78s	remaining: 23s
300:	learn: 0.0391343	test: 0.1614979	best: 0.1554653 (197)	total: 8.55s	remaining: 19.9s
400:	learn: 0.0286095	test: 0.1661353	best: 0.1554653 (197)	total: 11.2s	remaining: 16.7s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 0.1554653497
bestIteration = 197

Shrink model to first 198 iterations.


<catboost.core.CatBoostClassifier at 0x1cf51a880>

In [629]:
preds = model.predict_proba(df_test.iloc[sub[(sub.BiRads != 1)&(sub.BiRads != 4)].index])

In [631]:
preds[:, 1].max()

0.22985679712805543

In [653]:
preds_ = []

for pred in preds:
    
    max_pred = np.where(pred == pred.max())[0][0]
    
    if pred[1] > 0.015:
        preds_.append(3)
        
    else:
        preds_.append(2)

In [654]:
# preds_ = model.predict(df_test)

In [655]:
# sub = pd.read_csv('sample_submit.csv')

In [656]:
sub.loc[(sub.BiRads != 1)&(sub.BiRads != 4), 'BiRads'] = preds_
# sub['best'] = pd.read_csv('sub_18.csv')['BiRads']

In [265]:
sub.groupby('BiRads').id.count()

BiRads
1     944
2    1783
3    1484
4     170
5     282
Name: id, dtype: int64

In [266]:
sub.groupby('best').id.count()

best
1     937
2    1175
3    1034
4    1303
5     214
Name: id, dtype: int64

In [659]:
df_train[df_train.BiRads != 1].groupby('BiRads').laterality.count()

BiRads
2    2419
3     104
4      44
5       6
Name: laterality, dtype: int64

In [660]:
X = df_train[(df_train.BiRads == 4)|(df_train.BiRads == 5)].drop(['BiRads'], axis=1)

y = df_train[(df_train.BiRads == 4)|(df_train.BiRads == 5)].BiRads.copy()

In [661]:
X.shape

(50, 1348)

In [662]:
X_train, X_val, y_train, y_val = train_test_split(X, y.values, 
                                                  stratify=y, 
                                                  test_size=0.15)

In [663]:
model = CatBoostClassifier(early_stopping_rounds=250, verbose=100)

model.fit(X_train, y_train, eval_set=(X_val, y_val),
          cat_features=['laterality']
         )

Learning rate set to 0.014483
0:	learn: 0.6791379	test: 0.6786708	best: 0.6786708 (0)	total: 9.9ms	remaining: 9.89s
100:	learn: 0.1754472	test: 0.4352948	best: 0.4352948 (100)	total: 599ms	remaining: 5.33s
200:	learn: 0.0783620	test: 0.4105160	best: 0.4105160 (200)	total: 1.21s	remaining: 4.8s
300:	learn: 0.0405787	test: 0.4129919	best: 0.4021914 (242)	total: 1.97s	remaining: 4.58s
400:	learn: 0.0240307	test: 0.4356385	best: 0.4021914 (242)	total: 2.61s	remaining: 3.89s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 0.4021913936
bestIteration = 242

Shrink model to first 243 iterations.


<catboost.core.CatBoostClassifier at 0x19d37cc70>

In [664]:
preds = model.predict_proba(df_test.iloc[sub[sub.BiRads == 4].index])

In [665]:
preds[:, 1].max()

0.36855576839534765

In [672]:
preds_ = []

for pred in preds:
    
    max_pred = np.where(pred == pred.max())[0][0]
    
    if pred[1] > 0.2:
        preds_.append(5)
        
    else:
        preds_.append(4)

In [673]:
# preds_ = model.predict(df_test)

In [674]:
# sub = pd.read_csv('sample_submit.csv')

In [675]:
sub.loc[(sub.BiRads != 1)&(sub.BiRads != 2)&(sub.BiRads != 3), 'BiRads'] = preds_
# sub['best'] = pd.read_csv('sub_18.csv')['BiRads']

In [676]:
sub.groupby('BiRads').id.count()

BiRads
1     841
2    1017
3    1726
4     997
5      82
Name: id, dtype: int64

In [677]:
sub.groupby('best').id.count()

best
1     937
2    1175
3    1034
4    1303
5     214
Name: id, dtype: int64

In [678]:
sub[['id', 'BiRads']].to_csv('sub_32.csv', index=False)