# CT画像の読み込み

各ファイルについて
- candidates.csv  
  CTデータの系列が何であるか、結節候補の座標情報、候補が実際に結束であるか否かのフラグを示す。

- annotations.csv  
  結節フラグが建てられた候補に対して、追加の情報を記載しているデータ。追加情報は結節の大きさ  
  このデータを利用することで、訓練データと検証データの結節の大きさの分散を似たようなものにできる＝確率分布が似る。  
  ⇨大小様々な結束を判別する必要があるため各データで分散は一致していた方が望ましい。  

# import

In [112]:
import torch
import copy
import numpy as np
import matplotlib.pyplot as plt
from dsets import getCandidateInfoList, getCt, LunaDataset
from util.util import xyz2irc
import training
import pandas as pd
import os
import shutil

# CTデータの確認

In [5]:
# 頭に！をつけることでlinuxコマンドを実行できる、行数を数える
!wc -l ./data/candidates.csv

  551066 ./data/candidates.csv


In [58]:
! head -n 2 data/candidates.csv

1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860,-56.08,-67.85,-311.92,0


In [34]:
! grep ',1$' ./data/candidates.csv | wc -l

       0


In [2]:
data = pd.read_csv('data/candidates.csv')
data.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-56.08,-67.85,-311.92,0
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,53.21,-244.41,-245.17,0
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,103.66,-121.8,-286.62,0
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-33.66,-72.75,-308.41,0
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-32.25,-85.36,-362.51,0


In [6]:
data.shape

(551065, 5)

In [8]:
data.columns

Index(['seriesuid', 'coordX', 'coordY', 'coordZ', 'class'], dtype='object')

In [17]:
# ラベルが１のデータの数と比率を確認
(data['class'] == 1).sum(), (data['class'] == 1).sum()/data['class'].count() 

(1351, 0.0024516164154863764)

In [20]:
! wc -l ./data/annotations.csv

    1187 ./data/annotations.csv


In [22]:
!head data/annotations.csv

seriesuid,coordX,coordY,coordZ,diameter_mm
1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860,-128.6994211,-175.3192718,-298.3875064,5.651470635
1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860,103.7836509,-211.9251487,-227.12125,4.224708481
1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793540579077826395208,69.63901724,-140.9445859,876.3744957,5.786347814
1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016233746780170740405,-24.0138242,192.1024053,-391.0812764,8.143261683
1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016233746780170740405,2.441546798,172.4648812,-405.4937318,18.54514997
1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016233746780170740405,90.93171321,149.0272657,-426.5447146,18.20857028
1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016233746780170740405,89.54076865,196.4051593,-515.0733216,16.38127631
1.3.6.1.4.1.14519.5.2.1.6279.6001.100953483028192176989979435275,81.50964574,54.9572186,-150.3464233,10.36232088
1.3.6.1.4.1.14519.5.2.1.6279.6001.10268196240

In [44]:
# 対応しているもの同士を確認すると、座標に若干のずれがある。原因は結束の大きさが存在するため
# 修正する労力と、それに対する効果が見込めないため、ずれがあるまま分析する
! grep 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860 ./data/annotations.csv

# ! grep '1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860.*' ./data/candidates.csv
data.loc[(data['seriesuid'] == '1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860') & (data['class'] == 1), :]

1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860,-128.6994211,-175.3192718,-298.3875064,5.651470635
1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860,103.7836509,-211.9251487,-227.12125,4.224708481


Unnamed: 0,seriesuid,coordX,coordY,coordZ,class
13,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,104.164804,-211.685591,-227.011364,1
78,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-128.94,-175.04,-297.87,1


## candidates.csvとannotations.csvの２つを繋ぎ合わせる関数を作成する

In [46]:
from collections import namedtuple
CandidateInfoTuple = namedtuple('CandidateInfoTuple',
                               'isNodule_bool, diameter_mm, series_uid, center_xyz',
                               )

In [47]:
CandidateInfoTuple

__main__.CandidateInfoTuple

サニタイズ（不要部分の除去）、クレンジング（整形）

## CTスキャンデータの読み込み

In [76]:
from dsets import getCandidateInfoList, getCt, LunaDataset

In [77]:
candidateInfo_list = getCandidateInfoList(requireOnDisk_bool=False)
positiveInfo_list = [x for x in candidateInfo_list if x[0]]
diameter_list = [x[1] for x in positiveInfo_list]

In [78]:
print(len(positiveInfo_list))
print(positiveInfo_list[0])

1351
CandidateInfoTuple(isNodule_bool=True, diameter_mm=32.27003025, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.287966244644280690737019247886', center_xyz=(67.61451718, 85.02525992, -109.8084416))


In [79]:
# サニティーチェック
for i in range(0, len(diameter_list), 100):
    print('{:4}  {:4.1f} mm'.format(i, diameter_list[i]))

   0  32.3 mm
 100  17.7 mm
 200  13.0 mm
 300  10.0 mm
 400   8.2 mm
 500   7.0 mm
 600   6.3 mm
 700   5.7 mm
 800   5.1 mm
 900   4.7 mm
1000   4.0 mm
1100   0.0 mm
1200   0.0 mm
1300   0.0 mm


In [80]:
for candidateInfo_tup in positiveInfo_list[:10]:
    print(candidateInfo_tup)
for candidateInfo_tup in positiveInfo_list[-10:]:
    print(candidateInfo_tup)
    
for candidateInfo_tup in positiveInfo_list:
    if candidateInfo_tup.series_uid.endswith('565'):
        print(candidateInfo_tup)

CandidateInfoTuple(isNodule_bool=True, diameter_mm=32.27003025, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.287966244644280690737019247886', center_xyz=(67.61451718, 85.02525992, -109.8084416))
CandidateInfoTuple(isNodule_bool=True, diameter_mm=30.61040636, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.112740418331256326754121315800', center_xyz=(47.90350511, 37.60442008, -99.93417567))
CandidateInfoTuple(isNodule_bool=True, diameter_mm=30.61040636, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.112740418331256326754121315800', center_xyz=(44.19, 37.79, -107.01))
CandidateInfoTuple(isNodule_bool=True, diameter_mm=30.61040636, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.112740418331256326754121315800', center_xyz=(40.69, 32.19, -97.15))
CandidateInfoTuple(isNodule_bool=True, diameter_mm=27.44242293, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.943403138251347598519939390311', center_xyz=(-45.29440163, 74.86925386, -97.52812481))
CandidateInfoTuple(isNodule_bool=True, diameter_mm=27.

## CTスキャンデータの可視化

In [82]:
from vis import findPositiveSamples, showCandidate
positiveSample_list = findPositiveSamples()

2023-01-24 18:22:43,884 INFO     pid:76696 dsets:171:__init__ <dsets.LunaDataset object at 0x7fe8abe9b190>: 332303 training samples


0 CandidateInfoTuple(isNodule_bool=True, diameter_mm=8.604619037, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.202643836890896697853521610450', center_xyz=(-83.0252220779, 9.03100547806, -85.3494928471))
1 CandidateInfoTuple(isNodule_bool=True, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.111258527162678142285870245028', center_xyz=(-117.47, 40.27, -136.63))
2 CandidateInfoTuple(isNodule_bool=True, diameter_mm=7.121051509, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.413896555982844732694353377538', center_xyz=(-42.49, 40.6, -169.89))
3 CandidateInfoTuple(isNodule_bool=True, diameter_mm=5.100020792, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.308655308958459380153492314021', center_xyz=(-85.71, -180.34, -108.32))
4 CandidateInfoTuple(isNodule_bool=True, diameter_mm=8.315884851, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.177685820605315926524514718990', center_xyz=(-74.97714965, 81.5241072, -124.82781035))
5 CandidateInfoTuple(isNodule_bool=True, diameter_mm=4.7

In [83]:
series_uid = positiveSample_list[11][2]
showCandidate(series_uid)

2023-01-24 18:26:55,858 INFO     pid:76696 dsets:171:__init__ <dsets.LunaDataset object at 0x7fe890b2f0d0>: 717 training samples


1.3.6.1.4.1.14519.5.2.1.6279.6001.324290109423920971676288828329 129 False [129, 187, 383, 673]


In [84]:
series_uid = '1.3.6.1.4.1.14519.5.2.1.6279.6001.124154461048929153767743874565'
showCandidate(series_uid)

2023-01-24 18:27:53,014 INFO     pid:76696 dsets:171:__init__ <dsets.LunaDataset object at 0x7fe893bc5c70>: 1061 training samples


1.3.6.1.4.1.14519.5.2.1.6279.6001.124154461048929153767743874565 59 False [59]


In [86]:
series_uid = '1.3.6.1.4.1.14519.5.2.1.6279.6001.126264578931778258890371755354'
showCandidate(series_uid)

2023-01-24 18:28:24,269 INFO     pid:76696 dsets:171:__init__ <dsets.LunaDataset object at 0x7fe88c125190>: 605 training samples


1.3.6.1.4.1.14519.5.2.1.6279.6001.126264578931778258890371755354 140 False [140]


# モデルの作成

In [103]:
from util.logconf import logging
from util.util import importstr
log = logging.getLogger('nb')

In [113]:
def run(app, *argv):
    argv = list(argv)
    argv.insert(0, '--num_workers=4')
    log.info('Running: {}({!r}).main()'.format(app, argv))
    
    app_cls = importstr(*app.rsplit('.', 1))
    app_cls(argv).main()
    
    log.info("Finished; {}.{!r}.main()".format(app, argv))

In [114]:
run('training.LunaTrainingApp','--epochs=1')

2023-01-24 22:48:37,404 INFO     pid:76696 nb:004:run Running: training.LunaTrainingApp(['--num_workers=4', '--epochs=1']).main()
usage: ipykernel_launcher.py [-h] [--batch-size BATCH_SIZE]
                             [--num-workers NUM_WORKERS] [--epochs EPOCHS]
                             [--balanced] [--augmented] [--augment-flip]
                             [--augment-offset] [--augment-scale]
                             [--augment-rotate] [--augment-noise]
                             [--tb-prefix TB_PREFIX]
                             [comment]
ipykernel_launcher.py: error: unrecognized arguments: --num_workers=4


SystemExit: 2

In [110]:
def run(app, *argv):
    argv = list(argv)
    argv.insert(0, '--num-workers=4')  # <1>
    log.info("Running: {}({!r}).main()".format(app, argv))
    
    app_cls = importstr(*app.rsplit('.', 1))  # <2>
    app_cls(argv).main()
    
    log.info("Finished: {}.{!r}).main()".format(app, argv))