**Data Analysis**

In [1]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import os
import json
import random
import collections

import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import cv2

import time

import torch
from torch import nn
from torch.utils import data as torch_data
from torch.nn import functional as torch_functional
from torch.utils.data import DataLoader
from torchvision import datasets, models, transforms
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
from skimage import exposure

from albumentations import Resize, Normalize, Compose
from albumentations.pytorch import ToTensorV2
import albumentations as album

import warnings
warnings.filterwarnings("ignore")

# plt.style.use("dark_background")

In [2]:
# # Path to data
# data_directory_path = "../input/rsna-miccai-brain-tumor-radiogenomic-classification"
# train_labels = data_directory_path + "/train_labels.csv"

In [3]:
# # Showing the view of training labels
# train_labels = pd.read_csv(train_labels)
# train_labels.head()

In [4]:
# # Shape of train labels
# x, y = train_labels.shape
# print("Shape of train labels data")
# print("Amount of rows: ",  x)
# print("Amount of columns: ", y)

In [5]:
# # Investigation of missing values
# missing_values = train_labels.isnull().sum()
# print("Missinng values:\n", missing_values)

In [6]:
# # Checking whether data is balanced or not
# plt.figure(figsize=(8, 8))
# sns.countplot(data=train_labels, x="MGMT_value")
# plt.grid()
# plt.title('Distribution of Labels for Training')
# plt.show()

In [7]:
# # Getting amount of scans in training and testing  part -- need ???
# file_names_train = glob.glob('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/*/*/*')
# print("Amount of brain pictures in train: ", len(file_names_train))

In [8]:
# # Calculating amount of different scan types photos: FLAIR, T1w, T1wCE, T2w
# flair_scan_amount = sum(1 for scan_path in file_names_train if 'FLAIR' in scan_path)
# t1w_scan_amount = sum(1 for scan_path in file_names_train if 'T1w' in scan_path)
# t1wce_scan_amount = sum(1 for scan_path in file_names_train if 'T1wCE' in scan_path)
# t2w_scan_amount = sum(1 for scan_path in file_names_train if 'T2w' in scan_path)
# print("Amount FLAIR scan: ", flair_scan_amount)
# print("Amount T1w scan: ", t1w_scan_amount)
# print("Amount T1wCE scan: ", t1wce_scan_amount)
# print("Amount T2w scan: ", t2w_scan_amount)

In [9]:
# # Creating a graph with scans amount
# scan_names = ['FLAIR', 'T1w', 'T1wCE', 'T2w']
# scan_amount = [flair_scan_amount, t1w_scan_amount, t1wce_scan_amount, t2w_scan_amount]
# imgs_per_orient_scan = dict(zip(scan_names, scan_amount))


# plt.figure(figsize=(8, 8))
# keys = list(imgs_per_orient_scan.keys())
# values = [imgs_per_orient_scan[k] for k in keys]
# sns.barplot(x=keys, y=values, palette='hls')
# plt.title("Total Images Count per Scan Type", fontsize=17)
# plt.grid()
# plt.show()

# # TODO: count scan folders only (we don't care how many images are in a specific scan)

In [10]:
# # Different type of scans lists 
# flair_scan = [scan_path for scan_path in file_names_train if 'FLAIR' in scan_path]
# t1w_scan   = [scan_path for scan_path in file_names_train if 'T1w' in scan_path]
# t1wce_scan = [scan_path for scan_path in file_names_train if 'T1wCE' in scan_path]
# t2w_scan   = [scan_path for scan_path in file_names_train if 'T2w' in scan_path]

**View of the data**

In [11]:
# Load the training labels table; later cells read its `BraTS21ID` and
# `MGMT_value` columns to locate each case folder and its target label.
data = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv")
data

In [12]:
def get_image(path, fix_monochrome=True, size=(256, 256)):
    """Load one DICOM slice as an 8-bit, 3-channel image.

    Parameters
    ----------
    path : str
        Path to a single ``.dcm`` file.
    fix_monochrome : bool, default True
        Invert the intensities when the file declares the ``MONOCHROME1``
        photometric interpretation.
    size : tuple of int, default (256, 256)
        ``(width, height)`` handed to ``cv2.resize``.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with three channels and values in 0..255.
    """
    # `read_file` is the deprecated alias of `dcmread`.
    dicom = pydicom.dcmread(path)
    # Work in float64 so the subtractions below cannot overflow a narrow
    # integer pixel type.
    pixels = dicom.pixel_array.astype(np.float64)

    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        pixels = np.amax(pixels) - pixels

    # Min-max scale to 0..255. A constant slice has a zero range; it is left
    # at all zeros instead of dividing by zero.
    pixels = pixels - np.min(pixels)
    peak = np.max(pixels)
    if peak > 0:
        pixels = pixels / peak
    pixels = (pixels * 255).astype(np.uint8)
    pixels = cv2.resize(pixels, size)

    # COLOR_BGR2RGB expects a multi-channel input; a 2-D (single-channel)
    # slice is expanded to three identical channels with COLOR_GRAY2RGB.
    if pixels.ndim == 2:
        pixels = cv2.cvtColor(pixels, cv2.COLOR_GRAY2RGB)
    else:
        pixels = cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB)

    return pixels

In [13]:
def _slice_number(file_name):
    """Return N for a file named like ``Image-N.dcm``, or None if it does not parse."""
    try:
        return int(file_name.split("-")[1].split(".")[0])
    except (IndexError, ValueError):
        return None


def get_middle_idx(path, offset=0):
    """Return the slice number of the middle image in a scan folder.

    The folder's files are ordered by the number embedded in their names and
    the entry ``offset`` positions away from the middle is selected. The
    position is clamped to the available range, so the returned number always
    belongs to a file that exists. For contiguously numbered folders and
    in-range offsets this equals ``first_number + len // 2 + offset``.

    Parameters
    ----------
    path : str
        Folder containing ``Image-<N>.dcm`` files.
    offset : int, default 0
        Number of slices to move away from the middle (may be negative).

    Returns
    -------
    int
        Slice number ``N`` of the selected file.

    Raises
    ------
    ValueError
        If the folder contains no file whose name carries a slice number.
    """
    # Names that do not follow the pattern are skipped instead of crashing.
    numbers = sorted(
        n for n in map(_slice_number, os.listdir(path)) if n is not None
    )
    if not numbers:
        raise ValueError(f"No 'Image-<N>.dcm' slices found in {path!r}")

    position = len(numbers) // 2 + offset
    position = min(max(position, 0), len(numbers) - 1)

    return numbers[position]


def get_middle_image(path, offset=0):
    """Load the middle slice of a scan folder together with its orientation.

    Parameters
    ----------
    path : str
        Folder containing ``Image-<N>.dcm`` files.
    offset : int, default 0
        Forwarded to ``get_middle_idx``.

    Returns
    -------
    tuple
        ``(img, orientation)``: the ``uint8`` image from ``get_image`` and the
        value returned by ``get_image_orientation`` for the same file.

    Raises
    ------
    FileNotFoundError
        If the folder holds no ``Image-<N>.dcm`` file at all.
    """
    idx = get_middle_idx(path, offset)
    full_path = os.path.join(path, f"Image-{idx}.dcm")

    if not os.path.exists(full_path):
        # Slice numbering can have gaps. Pick the next existing slice at or
        # after `idx`; if there is none, use the last slice rather than
        # probing ever-increasing numbers forever.
        available = []
        for name in os.listdir(path):
            stem, ext = os.path.splitext(name)
            prefix, _, number = stem.partition("-")
            if ext == ".dcm" and prefix == "Image" and number.isdigit() and str(int(number)) == number:
                available.append(int(number))
        if not available:
            raise FileNotFoundError(f"No 'Image-<N>.dcm' slices found in {path!r}")

        at_or_after = [n for n in available if n >= idx]
        idx = min(at_or_after) if at_or_after else max(available)
        full_path = os.path.join(path, f"Image-{idx}.dcm")

    img = get_image(full_path).astype(np.uint8)
    orientation = get_image_orientation(full_path)

    return img, orientation

In [14]:
def get_image_orientation(path, fix_monochrome=True):
    """Return the rounded, absolute ``ImageOrientationPatient`` values of a slice.

    Parameters
    ----------
    path : str
        Path to a single ``.dcm`` file.
    fix_monochrome : bool, default True
        Unused; kept so existing callers keep working.

    Returns
    -------
    list of int
        The ``ImageOrientationPatient`` values, each rounded to the nearest
        integer with the sign dropped. The result can be compared directly
        with the lists stored in ``ORIENTATIONS``.
    """
    # Only a header tag is needed, so skip decoding the pixel data.
    # (`dcmread` replaces the deprecated `read_file` alias.)
    dicom = pydicom.dcmread(path, stop_before_pixels=True)
    cosines = np.asarray(dicom.ImageOrientationPatient, dtype=float)

    return np.abs(np.round(cosines)).astype(np.uint8).tolist()

In [15]:
# Reference patterns, keyed by view name, that the output of
# get_image_orientation is matched against.
# Insertion order matters: SUBMISSIONDataset turns a match into an index via
# list(ORIENTATIONS.values()).index(...).
ORIENTATIONS = dict(
    side=[0, 1, 0, 0, 0, 1],   # side view
    front=[1, 0, 0, 0, 0, 1],  # front view
    top=[1, 0, 0, 0, 1, 0],    # top view
)

In [16]:
# ### Show the idx's picture image with some id
# id = 2

# path = f"../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/{str(id).zfill(5)}/FLAIR"

# assert os.path.isdir(path), "No such directory"

# ids = np.arange(25)
    
# plt.figure(figsize=(20, 20))
# for idx, x in enumerate(ids):
#     plt.subplot(5, 5, idx + 1)

#     img, orientaion = get_middle_image(path, offset=idx * 2)
#     direction = [k for k, v in ORIENTATIONS.items() if v == orientaion][0]
#     print(orientaion, direction)

#     plt.imshow(img)
        

In [17]:
# ### Show the idx's picture image with some id
# id = 2

# path = f"../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/{str(id).zfill(5)}/T1w"

# assert os.path.isdir(path), "No such directory"

# ids = np.arange(25)
    
# plt.figure(figsize=(20, 20))
# for idx, x in enumerate(ids):
#     plt.subplot(5, 5, idx + 1)

#     img, orientaion = get_middle_image(path, offset=idx * 2)
#     direction = [k for k, v in ORIENTATIONS.items() if v == orientaion][0]
#     print(orientaion, direction)

#     plt.imshow(img)

In [18]:
# Names of the per-case sub-folders that the sampling loops iterate over
# when building the X / y lists.
scan_types = ['FLAIR', 'T1w', 'T1wCE', 'T2w']

In [19]:
# voxels_per_orient_scan = dict()
# voxels_per_scan_types = dict()

# imgs_per_orient_scan = dict()
# imgs_per_scan_type = dict()



# scan_types = ['FLAIR', 'T1w', 'T1wCE', 'T2w']

# for id in tqdm(data['BraTS21ID'].values):
#     for scan_type in scan_types:
#         path = f"../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/{str(id).zfill(5)}/{scan_type}"
#         assert os.path.isdir(path), "No such directory"
        
        
#         # --------
#         img, orientation = get_middle_image(path, offset=0)
#         direction = [k for k, v in ORIENTATIONS.items() if v == orientation][0]
        
#         if direction in voxels_per_orient_scan:
#             voxels_per_orient_scan[direction] += 1
#         else:
#             voxels_per_orient_scan[direction] = 1
            
#         if scan_type in voxels_per_scan_types:
#             voxels_per_scan_types[scan_type] += 1
#         else:
#             voxels_per_scan_types[scan_type] = 1
            
        
#         # --------
#         num_scans = len(os.listdir(path))
        
#         if direction in imgs_per_orient_scan:
#             imgs_per_orient_scan[direction].append(num_scans)
#         else:
#             imgs_per_orient_scan[direction] = [num_scans]
            
#         if scan_type in imgs_per_scan_type:
#             imgs_per_scan_type[scan_type].append(num_scans)
#         else:
#             imgs_per_scan_type[scan_type] = [num_scans]
        
    
    

#     # print(sorted(os.listdir(path), key=lambda x: int(x.split('-')[1].split('.')[0])))

In [20]:
# sns.set_theme(style='white')

In [21]:
# plt.figure(figsize=[8, 8])
# keys = list(voxels_per_orient_scan.keys())
# values = [voxels_per_orient_scan[k] for k in keys]
# sns.barplot(x=keys, y=values, palette='hls')
# plt.grid()
# plt.xticks(fontsize=13)
# plt.title("Total Voxels Count per Orientation", fontsize=17)
# plt.show()

In [22]:
# plt.figure(figsize=[8, 8])
# keys = list(voxels_per_scan_types.keys())
# values = [voxels_per_scan_types[k] for k in keys]
# sns.barplot(x=keys, y=values, palette='hls')
# plt.grid()
# plt.xticks(fontsize=13)
# plt.title("Total Voxels Count per Scan Type", fontsize=17)
# plt.show()

In [23]:
# d = {key: sum(value) for key, value in imgs_per_orient_scan.items()}

# plt.figure(figsize=[8, 8])
# keys = list(d.keys())
# values = [d[k] for k in keys]
# sns.barplot(x=keys, y=values, palette='hls')
# plt.xticks(fontsize=13)
# plt.title("Total Images Count per Orientation", fontsize=17)
# plt.grid()
# plt.show()

In [24]:
# a = pd.DataFrame({'orientation': np.repeat('front', len(imgs_per_orient_scan['front'])), 
#                   'num of imgs': imgs_per_orient_scan['front']})
# b = pd.DataFrame({'orientation': np.repeat('side', len(imgs_per_orient_scan['side'])), 
#                   'num of imgs': imgs_per_orient_scan['side']})
# c = pd.DataFrame({'orientation': np.repeat('top', len(imgs_per_orient_scan['top'])), 
#                   'num of imgs': imgs_per_orient_scan['top']})
# df = a.append(b).append(c)

# plt.figure(figsize=[8, 8])
# sns.boxplot(x='orientation', y='num of imgs', data=df, palette='hls', linewidth=2.)  
# plt.grid()
# plt.xticks(fontsize=13)
# plt.title("Distribution of Images Count in a Voxel per Orientation", fontsize=17)
# plt.show()

In [25]:
# scan_types = ['FLAIR', 'T1w', 'T1wCE', 'T2w']

# a = pd.DataFrame({'scan type': np.repeat('FLAIR', len(imgs_per_scan_type['FLAIR'])), 
#                   'num of imgs': imgs_per_scan_type['FLAIR']})
# b = pd.DataFrame({'scan type': np.repeat('T1w', len(imgs_per_scan_type['T1w'])), 
#                   'num of imgs': imgs_per_scan_type['T1w']})
# c = pd.DataFrame({'scan type': np.repeat('T1wCE', len(imgs_per_scan_type['T1wCE'])), 
#                   'num of imgs': imgs_per_scan_type['T1wCE']})
# d = pd.DataFrame({'scan type': np.repeat('T2w', len(imgs_per_scan_type['T2w'])), 
#                   'num of imgs': imgs_per_scan_type['T2w']})
# df = a.append(b).append(c).append(d)

# plt.figure(figsize=[8, 8])
# sns.boxplot(x='scan type', y='num of imgs', data=df, palette='hls', linewidth=2.)
# plt.grid()
# plt.xticks(fontsize=13)
# plt.title("Distribution of Images Count in a Voxel per Scan Type", fontsize=17)
# plt.show()

In [26]:
# plt.figure(figsize=[8, 8])
# sns.histplot(data=imgs_per_orient_scan, bins=50, kde=True)
# plt.xlabel('num of imgs')
# plt.ylabel('Occurrences')
# plt.title('Distribution of Images Count in Voxel per Orientation', fontsize=17)
# plt.show()

In [27]:
# id = 0
# path = f"../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/{str(id).zfill(5)}/FLAIR"
# assert os.path.isdir(path)


# print(sorted(os.listdir(path), key=lambda x: int(x.split('-')[1].split('.')[0])))

**Data preparation**

In [28]:
# X - images
# Y - labels

# 1. Take all middle images from each scan
# 2. Train Test split
# 3. ...

In [50]:
# Accumulators filled by the sampling cell below: X collects images,
# y collects the matching labels.
X, y = [], []

In [30]:
# # Random sampling
# for id in tqdm(data.index):
#     for scan_type in scan_types:
#         voxel_id = data['BraTS21ID'].loc[id]
#         path = f"../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/{str(voxel_id).zfill(5)}/{scan_type}"
#         assert os.path.isdir(path), "No such directory"
        
#         # --------
#         num_of_imgs = len(os.listdir(path))
#         img, _ = get_middle_image(path, offset=np.random.randint(-num_of_imgs//6, num_of_imgs//6))
#         label = data['MGMT_value'].loc[id]

#         X.append(img)
#         y.append(label)
#         assert len(X) == len(y)

In [51]:
# Middle image: one slice per (case, scan type).
# The lists are rebuilt from scratch here so that re-running this cell does
# not append a second copy of every sample.
X, y = [], []

for row_id in tqdm(data.index):
    # Case-level values do not depend on the scan type; look them up once.
    voxel_id = data['BraTS21ID'].loc[row_id]
    label = data['MGMT_value'].loc[row_id]

    for scan_type in scan_types:
        path = f"../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/{str(voxel_id).zfill(5)}/{scan_type}"
        assert os.path.isdir(path), "No such directory"

        img, _ = get_middle_image(path, offset=0)

        X.append(img)
        y.append(label)

assert len(X) == len(y)

In [32]:
# # Middle image
# for id in tqdm(data.index):
    
#     for scan_type in scan_types:
#         voxel_id = data['BraTS21ID'].loc[id]
#         path = f"../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/{str(voxel_id).zfill(5)}/{scan_type}"
#         assert os.path.isdir(path), "No such directory"
        
#         # --------
#         # create batch
#         batch = []
        
#         for i in np.arange(-4, 4):
#             img, _ = get_middle_image(path, offset=0)
#             batch.append(img)
            
#         batch = np.stack(batch)
#         label = data['MGMT_value'].loc[id]

#         X.append(batch)
#         y.append(label)
#         assert len(X) == len(y)

In [52]:
# Sanity check: X and y are expected to hold the same number of samples.
len(X), len(y)

In [53]:
from sklearn.model_selection import train_test_split

X = np.array(X)
y = np.array(y)

# Every case contributes one sample per scan type, and all of them share the
# same label. A purely random split would put scans of the same case on both
# sides and inflate the test score, so the split is done by case id instead.
# The sampling loop appends samples case by case, scan type by scan type,
# which is the order np.repeat reproduces here.
groups = np.repeat(data['BraTS21ID'].values, len(scan_types))
assert len(groups) == len(X), "X must hold exactly one sample per (case, scan type)"

splitter = model_selection.GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=groups))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
len(X_train), len(X_test)

In [54]:
# Display the dimensions of the training array.
X_train.shape

In [55]:
# Flatten every image into a single feature row for the scikit-learn
# estimators. reshape(n, -1) leaves already-flat data unchanged, so this cell
# can be re-run without failing on a second execution.
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

In [56]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Baseline model: logistic regression with default settings, trained on the
# flattened pixel rows. Earlier variants (a grid over C / penalty, and an
# l2 + 'sag' configuration) were tried here and are left disabled.
clf = LogisticRegression().fit(X_train, y_train)
clf

# clf = RandomForestClassifier()
# clf.fit(X_train, y_train)

# clf = SGDClassifier()
# clf.fit(X_train, y_train)
# calibrator = CalibratedClassifierCV(clf, cv='prefit')
# clf = calibrator.fit(X_tr, y_train)


In [57]:
# from sklearn.svm import SVC
# svc = SVC(
#     kernel='linear', 
#     gamma='auto', 
#     probability=True
# )
# svc.fit(X_train, y_train)

In [58]:
# Pick one held-out sample and display it together with its label.
sample_idx = 0
sample_vector = X_test[sample_idx]
sample_label = y_test[sample_idx]

print(sample_vector.shape)

# The flattened row is turned back into a 256x256 3-channel image for display.
plt.figure(figsize=[8, 8])
plt.imshow(sample_vector.reshape(256, 256, 3))
plt.title('Label: %s\n' % sample_label, fontsize=24)
plt.axis('off')

In [59]:
# Symmetric colour limits so that a zero weight sits in the middle of the
# diverging colormap.
scale = np.max(np.abs(clf.coef_))
print(clf.classes_, clf.coef_)

# A binary scikit-learn linear model stores a single coefficient row, and
# that row describes the positive class clf.classes_[1], so that is the
# class named in the title.
positive_class = clf.classes_[1]
# Only the first of the three channels is displayed.
coef_image = clf.coef_[0].reshape(256, 256, 3)[..., 0]

plt.figure(figsize=[8, 8])
plt.imshow(coef_image, cmap=plt.cm.RdBu, vmin=-scale, vmax=scale)
plt.axis('off')
plt.title('Class %s' % positive_class);

In [60]:
# #prediction probabilities
# y_pred = clf.predict(X_test)
# # predict_proba

# # y_pred=model.predict(X_test)

In [61]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Both the AUC and the curve are computed from the positive-class
# probability. Scoring the AUC on hard 0/1 predictions would reduce the curve
# to a single operating point and disagree with the plotted line.
y_score = clf.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure(figsize=[8, 8])
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance level
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic [ROC AUC]')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

In [62]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# The legend and file name are derived from the estimator actually held in
# `clf`, so the plot cannot be labelled with a model that was not evaluated.
model_name = type(clf).__name__

# AUC and curve both use the positive-class probability (not hard labels).
y_score = clf.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure(figsize=[8, 8])
plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (model_name, logit_roc_auc), color='red')
plt.plot([0, 1], [0, 1],'r--')  # chance level
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic [ROC AUC]')
plt.legend(loc="lower right")
# Saved under the model's own name so it does not overwrite 'Log_ROC'.
plt.savefig('%s_ROC' % model_name)
plt.show()

In [44]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# The legend and file name are derived from the estimator actually held in
# `clf`, so the plot cannot be labelled with a model that was not evaluated.
model_name = type(clf).__name__

# AUC and curve both use the positive-class probability (not hard labels).
y_score = clf.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure(figsize=[8, 8])
plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (model_name, logit_roc_auc), color='red')
plt.plot([0, 1], [0, 1],'r--')  # chance level
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic [ROC AUC]')
plt.legend(loc="lower right")
# Saved under the model's own name so it does not overwrite 'Log_ROC'.
plt.savefig('%s_ROC' % model_name)
plt.show()

In [45]:
# plt.figure (not plt.plot) is what creates a figure of the requested size.
# NOTE(review): the two values look like scores copied by hand from earlier
# runs — confirm their source and label the bars accordingly.
plt.figure(figsize=[8, 8])
sns.barplot(data=[0.54, 0.56, ])

In [46]:
class SUBMISSIONDataset(torch_data.Dataset):
    """Dataset yielding the middle slice of one scan type for every test case.

    Parameters
    ----------
    augmentation : callable, optional
        Called as ``augmentation(image=image)``; the ``'image'`` entry of its
        result replaces the loaded image.
    preprocessing : optional
        Stored on the instance only.
        NOTE(review): never applied in ``__getitem__`` — confirm whether it
        should be.
    root : str
        Folder whose entries are the case ids. Defaults to the competition
        test folder.
    scan_type : str, default "T2w"
        Sub-folder of each case to read.

    Each item is ``(image, id, orientation_index)``. ``image`` is ``None``
    when the scan could not be loaded or augmented. ``orientation_index`` is
    the position of the slice orientation in ``ORIENTATIONS`` (0 when the
    orientation is not listed or loading failed).
    """

    def __init__(self,
                 augmentation=None,
                 preprocessing=None,
                 root="../input/rsna-miccai-brain-tumor-radiogenomic-classification/test",
                 scan_type="T2w",
                ):
        self.root = root
        self.scan_type = scan_type
        self.indexes = sorted(os.listdir(root))
        self.augmentation = augmentation
        self.preprocessing = preprocessing

    def __len__(self):
        return len(self.indexes)

    def __getitem__(self, index):
        case_id = self.indexes[index]
        image_path = f"{self.root}/{str(case_id).zfill(5)}/{self.scan_type}"

        # Best-effort loading: a missing or unreadable scan yields image=None
        # so the caller can fall back to a default prediction. Only ordinary
        # exceptions are absorbed; KeyboardInterrupt and friends propagate.
        try:
            image, orientation = get_middle_image(image_path)

            if self.augmentation:
                sample = self.augmentation(image=image)
                image = sample['image']
        except Exception:
            return None, case_id, 0

        # An orientation that is not in ORIENTATIONS must not discard an
        # otherwise valid image, so the lookup falls back to index 0.
        known = list(ORIENTATIONS.values())
        orientation = list(orientation)
        orientation_index = known.index(orientation) if orientation in known else 0

        return image, case_id, orientation_index

In [63]:
# Sample submission table from the competition data. The later cells visible
# in this notebook re-read the file instead of using this variable.
submission = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv")

# Test-set dataset with no augmentation or preprocessing configured.
test_dataset = SUBMISSIONDataset()

In [77]:
ids = []
preds = []

n_cases = len(test_dataset)

for i in range(n_cases):
    if i % 10 == 0:
        print(f"{np.round(i * 100 / n_cases, 2)}%\t/ 100% done")

    image, case_id, orientation_index = test_dataset[i]

    # 0.5 is the neutral fallback used whenever a case cannot be scored.
    prediction = 0.5
    if image is None:
        # The dataset returns image=None for scans it could not load.
        print(f"Case {case_id}: no image available; using 0.5")
    else:
        try:
            # One flattened feature row, matching how the training images
            # were flattened before fitting.
            x_test = np.asarray(image).reshape(1, -1)
            prediction = clf.predict_proba(x_test)[:, 1][0]
        except Exception as err:
            # Keep the run going, but say which case failed and why.
            print(f"Case {case_id}: prediction failed ({err!r}); using 0.5")

    ids.append(str(case_id))
    preds.append(f'{prediction:.6f}')

print("100%\t/ 100% done")

In [78]:
df = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv", dtype="string")

# Match predictions to rows by case id instead of by position, so a different
# row order in the sample submission cannot silently misalign them. Ids are
# zero-padded to five characters on both sides, the same padding used to
# build the case folder paths.
pred_by_id = {str(case_id).zfill(5): pred for case_id, pred in zip(ids, preds)}
submission_ids = df['BraTS21ID'].str.zfill(5)

missing = [case_id for case_id in submission_ids if case_id not in pred_by_id]
if missing:
    print(f"{len(missing)} submission ids have no prediction; using 0.5 for them")

df['MGMT_value'] = [pred_by_id.get(case_id, '0.500000') for case_id in submission_ids]
df[['BraTS21ID', 'MGMT_value']].to_csv("submission.csv", index=False)

df

In [101]:
# plt.figure()
# Distribution of the submitted probabilities (stored as strings in df).
submitted_probs = df["MGMT_value"].values.astype(np.float32)
sns.histplot(data=submitted_probs, bins=10)
# plt.show()