In [1]:
import os
import glob
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [2]:
# Enable on-demand GPU memory allocation ("memory growth") so TensorFlow does
# not reserve the whole device up front.
gpu = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu))
try:
    # Memory growth has to be configured identically on every visible GPU,
    # so apply it to all of them rather than only the first one.
    for device in gpu:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as err:
    # Raised when the GPUs were already initialised (e.g. this cell is re-run
    # after a model was built); the setting can no longer be changed then.
    print(err)

In [3]:
from pathlib import Path

# Root of the competition data; everything below is resolved relative to it.
data_dir = Path('../input/seti-breakthrough-listen/')

# Folders holding the per-sample .npy files.
train_data_dir, test_data_dir = data_dir / 'train', data_dir / 'test'

# CSV files: training labels and the submission template.
train_label_file = data_dir.joinpath('train_labels.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

In [4]:
# Load the training labels from the path defined with the other data paths.
# `id` must be the index because later cells read the sample ids from
# `train_label_df.index`.
train_label_df = pd.read_csv(train_label_file, index_col="id")

In [5]:
# Column names used in the label / submission CSV files.
id_col, target_col = 'id', 'target'

# Run settings.
n_epoch, n_stop = 20, 2
batch_size = 32
seed = 42

# Same (height, width, channels) tuple that build_model passes as input_shape.
input_dim = (273, 256, 3)

In [6]:
def id_to_path(s, train=True):
    """Return the ``.npy`` file path for sample id ``s``.

    The file is looked up in a sub-folder named after the first character of
    the id, below the train directory when ``train`` is truthy and below the
    test directory otherwise.
    """
    if train:
        root = train_data_dir
    else:
        root = test_data_dir
    return root.joinpath(s[0], f'{s}.npy')

In [7]:
from tensorflow.keras.utils import Sequence
from random import shuffle

class SETISequence(Sequence):
    """Batch generator that feeds a two-input model.

    Every sample file is split into two 3-slice stacks: slices 0, 2, 4 go to
    the first input and slices 1, 3, 5 to the second. Each batch is returned
    as ``[signals_1, signals_2]``, both of shape ``(batch, 3, 256, 273)``.

    NOTE(review): a later cell defines another class that is also called
    ``SETISequence``; when the notebook is run top to bottom that definition
    replaces this one.

    Args:
        ids: sequence of sample ids; files are located with ``id_to_path``.
        y: optional labels aligned with ``ids``. When given, samples are read
            from the train directory and batches are ``(inputs, labels)``;
            when ``None``, samples are read from the test directory and only
            the inputs are returned.
        batch_size: number of samples per batch.
        shuffle: reshuffle the labelled samples at the end of every epoch.
    """

    def __init__(self, ids, y=None, batch_size=1024, shuffle=True):
        super().__init__()
        # Arrays (rather than lists) so ids and labels can be reordered with
        # one shared permutation.
        self.ids = np.asarray(ids)
        self.y = None if y is None else np.asarray(y)
        self.is_train = y is not None
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Private, fixed-seed generator: shuffling is reproducible without
        # touching the global NumPy random state.
        self._rng = np.random.RandomState(42)

    def __len__(self):
        """Number of batches per epoch (the last one may be smaller)."""
        return math.ceil(len(self.ids) / self.batch_size)

    def __getitem__(self, index):
        """Load and return batch number ``index``."""
        window = slice(index * self.batch_size, (index + 1) * self.batch_size)
        batch_ids = self.ids[window]

        signals_1 = np.empty((len(batch_ids), 3, 256, 273), dtype=np.float32)
        signals_2 = np.empty((len(batch_ids), 3, 256, 273), dtype=np.float32)

        for i, sample_id in enumerate(batch_ids):
            data = np.load(id_to_path(sample_id, self.is_train))

            # Swap the last two axes so each slice matches the (256, 273)
            # layout of the preallocated buffers.
            data = np.transpose(data, (0, 2, 1)).astype("float32")

            signals_1[i] = data[[0, 2, 4]]
            signals_2[i] = data[[1, 3, 5]]

        # The two stacks are passed as separate model inputs.
        inp = [signals_1, signals_2]

        if self.is_train:
            return inp, self.y[window]
        return inp

    def on_epoch_end(self):
        """Reshuffle ids and labels together; unlabelled data keeps its order."""
        if self.shuffle and self.is_train:
            order = self._rng.permutation(len(self.ids))
            self.ids = self.ids[order]
            self.y = self.y[order]

In [8]:
class SETISequence(tf.keras.utils.Sequence):
    """Batch generator yielding 3-channel, channels-last arrays per sample.

    Args:
        ids: sequence of sample ids; files are located with ``id_to_path``.
        y: optional labels aligned with ``ids``. When given, samples are read
            from the train directory and batches are ``(X, y)``; when ``None``,
            samples are read from the test directory and only ``X`` is
            returned.
        batch_size: number of samples per batch.
        shuffle: reshuffle the labelled samples on construction and at the end
            of every epoch. Unlabelled data is never shuffled, so predictions
            stay in the order of ``ids``.
    """

    def __init__(self, ids, y=None, batch_size=32, shuffle=True):
        super().__init__()
        # Arrays (rather than lists/tuples) so ids and labels can be reordered
        # with one shared permutation and sliced into array batches.
        self.ids = np.asarray(ids)
        self.y = None if y is None else np.asarray(y)
        self.is_train = self.y is not None
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Start from a shuffled order as well, not only after the first epoch.
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last one may be smaller)."""
        return math.ceil(len(self.ids) / self.batch_size)

    def __getitem__(self, idx):
        """Load and return batch number ``idx``."""
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)

        # Keep every other slice of each file (indices 0, 2, 4).
        list_x = [np.load(id_to_path(x, self.is_train))[::2]
                  for x in self.ids[window]]

        # (batch, 3, H, W) -> (batch, H, W, 3); cast so the batch dtype does
        # not depend on how the files were stored.
        batch_X = np.transpose(np.stack(list_x), (0, 2, 3, 1)).astype(np.float32)

        if self.is_train:
            return batch_X, self.y[window]
        return batch_X

    def on_epoch_end(self):
        """Reshuffle ids and labels together when shuffling is enabled."""
        if self.shuffle and self.is_train:
            order = np.random.permutation(len(self.ids))
            self.ids = self.ids[order]
            self.y = self.y[order]

In [9]:
# Load one training sample through the same path helper the generator uses and
# show its shape; the original note recorded it as (6, 273, 256).
image = np.load(id_to_path("0000799a2b2c42d"))
image.shape

In [10]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import models
from tensorflow.keras import layers

#https://qiita.com/kuroneko-hornet/items/7737b71c3854c06fcb49
def build_model(input_shape=(273, 256, 3)):
    """Build a binary classifier on top of an ImageNet-pretrained ResNet50.

    The ResNet50 convolutional base (without its classification head) is
    followed by Flatten -> Dense(128, relu) -> Dense(1, sigmoid).

    Args:
        input_shape: (height, width, channels) of one input sample. ResNet50
            requires exactly 3 channels.

    Returns:
        An uncompiled ``Sequential`` model with a single sigmoid output.
    """
    conv_base = ResNet50(
        weights="imagenet",
        include_top=False,
        input_shape=input_shape,
    )

    model = models.Sequential()
    model.add(conv_base)
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation="relu"))
    model.add(layers.Dense(1, activation="sigmoid"))

    return model

In [11]:
# Instantiate the network returned by build_model().
model=build_model()

In [12]:
# Print the layer-by-layer summary of the model.
model.summary()

In [13]:
# Load the sample submission indexed by `id`; a later cell takes the test ids
# from this index.
sub_df = pd.read_csv(sample_file, index_col="id")
print(sub_df.shape)
sub_df.head()

In [None]:
# Labels and ids for training, ids for the test set.
y = train_label_df[target_col].values
ids = train_label_df.index.values
ids_tst = sub_df.index.values

# Hold out 20% for validation. Stratify on the target so both parts keep the
# same class ratio, and reuse the configured seed for reproducibility.
ids_trn, ids_val, y_trn, y_val = train_test_split(
    ids, y, test_size=.2, random_state=seed, stratify=y
)

# Only the training generator needs shuffling; the test generator must keep
# the order of `ids_tst` so predictions line up with the submission rows.
trn = SETISequence(ids_trn, y_trn, batch_size=batch_size)
val = SETISequence(ids_val, y_val, batch_size=batch_size, shuffle=False)
tst = SETISequence(ids_tst, batch_size=batch_size, shuffle=False)

In [None]:
# Binary-classification setup: Adam optimizer, binary cross-entropy loss, and
# AUC reported as the metric.
model.compile(optimizer="adam",loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

In [None]:
# Train on `trn` and evaluate on `val`.
# NOTE(review): epochs is hard-coded to 10 while the config cell defines
# n_epoch = 20 (and n_stop = 2, which is not used here) -- confirm which value
# is intended.
model.fit(trn, validation_data=val, epochs=10) 

In [None]:
# Save the trained model in the working directory, next to the submission CSV.
# The previous "../kaggle/working/" prefix only resolves when a
# "kaggle/working" directory exists beside "../input".
model.save("my_h5_model2.h5")

In [None]:
# Predict on the test generator and flatten the result to a 1-D array.
p_tst=model.predict(tst).flatten()

In [None]:
# Attach the predictions to the submission frame and write it out. `id` is the
# index of sub_df, so it is written as the first CSV column.
sub_df["target"] = p_tst
sub_df.to_csv('my_submission#1.csv')
sub_df.head()

In [None]:
# Build the submission table explicitly. sub_df was loaded with
# index_col="id", so the ids live in its index, not in an `id` column.
output = pd.DataFrame(
    {
        'id': sub_df.index,
        "target": p_tst,
    }
)

# `id` is a regular column here, so the default integer index is not written.
output.to_csv('my_submission#1.csv', index=False)
print(output.head())

print("my submission was successfully saved!")