In [1]:
import os
import gc
import re
import sys
import math
import time
import toad
import json
import pickle
import random
import argparse
import warnings
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from loguru import logger
from datetime import datetime
from functools import lru_cache
from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, auc, roc_curve

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchinfo import summary

from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn.aggr.attention import AttentionalAggregation
from torch_geometric.nn import Linear, HeteroConv, GraphConv, GAT, RGCNConv, BatchNorm, GCN
from torch_geometric.nn.models import JumpingKnowledge

from ema import EMA

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from torch / pandas;
# consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")
# Run on the GPU with index 2 when CUDA is available, otherwise on the CPU.
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
# logger.add(sys.stdout, format="[{time:YYYY-MM-DD :mm:ss}] {level} {message}", colorize=True)
# logger.add("./train.log", format="[{time:YYYY-MM-DD :mm:ss}] {level} {message}", rotation="5MB", encoding="utf-8", enqueue=True, retention="5 days", colorize=True)

In [3]:
def seed_everything(seed: int):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: The integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it has to stay disabled.
    torch.backends.cudnn.benchmark = False


def load_pickle(file):
    """Read the pickle stored at ``file`` and return the deserialized object.

    Unpickling executes arbitrary code, so only point this at trusted files.
    """
    with open(file, mode="rb") as stream:
        payload = pickle.load(stream)
    return payload


def save_pickle(obj, file):
    """Serialize ``obj`` with pickle and write it to ``file``, overwriting it."""
    with open(file, mode="wb") as stream:
        pickle.Pickler(stream).dump(obj)


def load_graphs(dataset, edge_lag=1, weight_base=1.05, cache=True):
    """Build one PyG ``Data`` graph per pickle file found in ``dataset``.

    NOTE(review): a second ``load_graphs`` is defined in a later cell of this
    notebook; once that cell runs, it replaces this definition.

    Args:
        dataset: Directory that is globbed for ``*.pkl`` files.
        edge_lag: Nodes ``k`` and ``k + i`` are connected for every
            ``i`` in ``1..edge_lag``.
        weight_base: Base of the exponential edge weight.
        cache: When true and ``cache/<basename(dataset)>.pkl`` exists, that
            file is loaded and returned instead of rebuilding the graphs.

    Returns:
        ``(graphs, idx)``: the list of ``Data`` objects and the list of source
        file base names, in the same order.
    """
    if cache and os.path.exists(f"cache/{os.path.basename(dataset)}.pkl"):
        return load_pickle(f"cache/{os.path.basename(dataset)}.pkl")

    pickle_files = glob(dataset + "/*.pkl")

    idx = []
    graphs = []
    label_dict = {"00": 0, "10": 1}

    # Edge endpoints are generated for node ids 0..255.
    # assumes every sample has exactly 256 rows — TODO confirm
    source = []
    target = []
    for i in range(1, edge_lag + 1):
        source.extend([i for i in range(256 - i)])
        target.extend([i for i in range(i, 256)])

    for file in tqdm(pickle_files, desc=f"load {dataset} data :::"):
        idx.append(os.path.basename(file))

        features, metadata = load_pickle(file)
        
        mileage = metadata.get("mileage")
        # A missing "label" entry is treated as "00", i.e. class 0.
        label = label_dict[metadata.get("label", "00")]

        features = pd.DataFrame(features, columns=['volt','current','soc','max_single_volt','min_single_volt','max_temp','min_temp','timestamp'])
        features["mileage"] = mileage
        # Timestamps become integer offsets from the first sample of the file.
        features["timestamp"] = (features["timestamp"] - features["timestamp"].min()).astype(int)
        # features[f"timestamp_diff_{i}"] = features["timestamp"].diff().fillna(-1) # .apply(lambda x: math.log(x+1, 60))

        # NOTE(review): a zero current yields inf/NaN here and nothing in this
        # function replaces those values before the tensor is built.
        features["resistance"] = features["volt"] / (-features["current"])
        features["pwoer"] = features["volt"] * (-features["current"])
        # The "consistency_*" columns hold one scalar (a std over the whole
        # file) broadcast to every row.
        features["consistency_volt"] = features["volt"].std()
        features["consistency_soc"] = features["soc"].std()
        features["consistency_temp"] = (features.max_temp - features.min_temp).std()
        features["consistency_resistance"] = (features["volt"] / (-features["current"])).std()
        features["single_volt_range"] = features.max_single_volt - features.min_single_volt
        features["temp_range"] = features.max_temp - features.min_temp
        
        # For each lag: elapsed time, rate of change per time unit, relative
        # change, and the lagged value itself.
        for i in [5, 15, 30, 60]:
            # The first i rows have no predecessor; their elapsed time is -1.
            features[f"timestamp_diff_{i}"] = features["timestamp"].diff(periods=i).fillna(-1)
            features[f"volt_div_{i}"] = features["volt"].diff(i).fillna(0) / features[f"timestamp_diff_{i}"]
            features[f"current_div_{i}"] = features["current"].diff(i).fillna(0) / features[f"timestamp_diff_{i}"]
            features[f"soc_div_{i}"] = features["soc"].diff(i).fillna(0) / features[f"timestamp_diff_{i}"]
            features[f"max_single_volt_div_{i}"] = features["max_single_volt"].diff(i).fillna(0) / features[f"timestamp_diff_{i}"]
            features[f"min_single_volt_div_{i}"] = features["min_single_volt"].diff(i).fillna(0) / features[f"timestamp_diff_{i}"]
            features[f"max_temp_div_{i}"] = features["max_temp"].diff(i).fillna(0) / features[f"timestamp_diff_{i}"]
            features[f"min_temp_div_{i}"] = features["min_temp"].diff(i).fillna(0) / features[f"timestamp_diff_{i}"]

            features[f"volt_change_{i}"] = features["volt"].pct_change(periods=i).fillna(0)
            features[f"current_change_{i}"] = features["current"].pct_change(periods=i).fillna(0)
            features[f"soc_change_{i}"] = features["soc"].pct_change(periods=i).fillna(0)
            features[f"max_single_volt_change_{i}"] = features["max_single_volt"].pct_change(periods=i).fillna(0)
            features[f"min_single_volt_change_{i}"] = features["min_single_volt"].pct_change(periods=i).fillna(0)
            features[f"max_temp_change_{i}"] = features["max_temp"].pct_change(periods=i).fillna(0)
            features[f"min_temp_change_{i}"] = features["min_temp"].pct_change(periods=i).fillna(0)

            # Lagged values; the first i rows are padded with the row-0 value.
            features[f"volt_lag_{i}"] = features["volt"].shift(i).fillna(features["volt"][0])
            features[f"current_lag_{i}"] = features["current"].shift(i).fillna(features["current"][0])
            features[f"soc_lag_{i}"] = features["soc"].shift(i).fillna(features["soc"][0])
            features[f"max_single_volt_lag_{i}"] = features["max_single_volt"].shift(i).fillna(features["max_single_volt"][0])
            features[f"min_single_volt_lag_{i}"] = features["min_single_volt"].shift(i).fillna(features["min_single_volt"][0])
            features[f"max_temp_lag_{i}"] = features["max_temp"].shift(i).fillna(features["max_temp"][0])
            features[f"min_temp_lag_{i}"] = features["min_temp"].shift(i).fillna(features["min_temp"][0])
            features[f"single_volt_range_lag_{i}"] = features["single_volt_range"].shift(i).fillna(features["single_volt_range"][0])
            features[f"temp_range_lag_{i}"] = features["temp_range"].shift(i).fillna(features["temp_range"][0])

            # Disabled experiment: centred rolling statistics per column.
            # if i > 1:
            #     for col in ['volt','current','soc','max_single_volt','min_single_volt','max_temp','min_temp', 'single_volt_range', 'temp_range']:
            #         features[f'{col}_rolling_mean_{i}'] = features[col].rolling(window=i, center=True).mean().fillna(features[col].mean())
            #         features[f'{col}_rolling_max_{i}'] = features[col].rolling(window=i, center=True).max().fillna(features[col].max())
            #         features[f'{col}_rolling_min_{i}'] = features[col].rolling(window=i, center=True).min().fillna(features[col].min())
            #         features[f'{col}_rolling_std_{i}'] = features[col].rolling(window=i, center=True).std().fillna(features[col].std())
            #         features[f'{col}_rolling_median_{i}'] = features[col].rolling(window=i, center=True).median().fillna(features[col].median())
            #         if i > 3:
            #             features[f'{col}_rolling_skew_{i}'] = features[col].rolling(window=i, center=True).skew().fillna(features[col].skew())
            #             features[f'{col}_rolling_kurt_{i}'] = features[col].rolling(window=i, center=True).kurt().fillna(features[col].kurt())

        # Every remaining column becomes a node feature.
        x = torch.FloatTensor(features.drop(columns=[col for col in ["file_name", "label"] if col in features.columns]).values.tolist())
        edge_index = torch.LongTensor([source, target])
        # Edge weight = weight_base ** (t_source - t_target + 1).
        edge_weight = np.power(weight_base, (features["timestamp"].loc[source].values - features["timestamp"].loc[target].values + 1))
        edge_attr = torch.FloatTensor(edge_weight)

        graph = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=torch.LongTensor([label]))

        graphs.append(graph)

    # The cache file is written even when ``cache`` is False; the flag only
    # controls whether an existing file is read.
    save_pickle((graphs, idx), f"cache/{os.path.basename(dataset)}.pkl")

    return graphs, idx


def separate_data(graphs, fold_idx, seed=3407, flod=5):
    """Return the train/validation graphs of one stratified K-fold split.

    Args:
        graphs: Sequence of graphs whose ``y`` tensor holds the class label
            as its first element.
        fold_idx: Index of the fold to return, ``0 <= fold_idx < flod``.
        seed: Random state of the shuffled ``StratifiedKFold``.
        flod: Number of folds.

    Returns:
        ``(train_graphs, test_graphs)`` for the requested fold.

    Raises:
        AssertionError: If ``fold_idx`` is outside ``[0, flod)``.
    """
    assert 0 <= fold_idx < flod, f"fold_idx must be from 0 to {flod - 1}."

    labels = [graph.y.cpu().numpy().tolist()[0] for graph in graphs]

    skf = StratifiedKFold(n_splits=flod, shuffle=True, random_state=seed)

    # Only the labels drive a stratified split, so a dummy X is sufficient.
    folds = list(skf.split(np.zeros(len(labels)), labels))

    train_idx, test_idx = folds[fold_idx]
    train_graphs = [graphs[i] for i in train_idx]
    test_graphs = [graphs[i] for i in test_idx]

    return train_graphs, test_graphs

In [4]:
def process(file, source, target, weight_base=1.05):
    """Turn one pickled sample into a PyG ``Data`` graph.

    The pickle is unpacked as a ``(features, metadata)`` pair. ``features`` is
    loaded into a frame with the eight raw signal columns; ``metadata`` is read
    with ``.get`` for ``"mileage"`` and ``"label"`` (a missing label counts as
    ``"00"``, i.e. class 0). Every row becomes a node and every column a node
    feature.

    Args:
        file: Path of the pickle file to read.
        source: Source node ids of the edges.
        target: Target node ids of the edges, aligned with ``source``.
        weight_base: Base of the exponential time-decay edge weight.

    Returns:
        ``(graph, file_name)`` where ``file_name`` is the base name of ``file``.
    """
    label_dict = {"00": 0, "10": 1}
    features, metadata = load_pickle(file)

    mileage = metadata.get("mileage")
    label = label_dict[metadata.get("label", "00")]

    raw_cols = ['volt', 'current', 'soc', 'max_single_volt', 'min_single_volt', 'max_temp', 'min_temp']
    features = pd.DataFrame(features, columns=raw_cols + ['timestamp'])
    features["mileage"] = mileage
    # Timestamps become integer offsets from the first sample of the file.
    features["timestamp"] = (features["timestamp"] - features["timestamp"].min()).astype(int)

    # A zero current produces +/-inf or NaN here; all of them are sanitised
    # once, after the feature loop.
    features["resistance"] = features["volt"] / (-features["current"])
    # "pwoer" (sic) is kept as-is: it is a column name used by other cells.
    features["pwoer"] = features["volt"] * (-features["current"])
    # The "consistency_*" columns hold one scalar (a std over the whole file)
    # broadcast to every row.
    features["consistency_volt"] = features["volt"].std()
    features["consistency_soc"] = features["soc"].std()
    features["consistency_temp"] = (features.max_temp - features.min_temp).std()
    features["consistency_resistance"] = (features["volt"] / (-features["current"])).std()
    features["single_volt_range"] = features.max_single_volt - features.min_single_volt
    features["temp_range"] = features.max_temp - features.min_temp

    lag_cols = raw_cols + ["single_volt_range", "temp_range"]

    # Column creation order defines the node-feature order, so the three
    # groups are emitted one after another for every lag.
    for i in [1, 5, 15, 30, 60]:
        # The first i rows have no predecessor; their elapsed time is -1.
        elapsed = features["timestamp"].diff(periods=i).fillna(-1)
        features[f"timestamp_diff_{i}"] = elapsed

        # Change per elapsed time unit.
        for col in raw_cols:
            features[f"{col}_div_{i}"] = features[col].diff(i).fillna(0) / elapsed

        # Relative change.
        for col in raw_cols:
            features[f"{col}_change_{i}"] = features[col].pct_change(periods=i).fillna(0)

        # Lagged value, padded with the value of the first row.
        for col in lag_cols:
            features[f"{col}_lag_{i}"] = features[col].shift(i).fillna(features[col].iloc[0])

    # Both infinities map to the -1 sentinel (previously only +inf was
    # replaced, so -inf leaked into the tensors); NaN becomes 0.
    features = features.replace([np.inf, -np.inf], -1).fillna(0)

    node_frame = features.drop(columns=[col for col in ("file_name", "label") if col in features.columns])
    x = torch.FloatTensor(node_frame.values.tolist())
    edge_index = torch.LongTensor([source, target])

    # Weight decays with the absolute time gap: weight_base ** (1 - |dt|).
    timestamps = features["timestamp"]
    time_gap = np.abs(timestamps.loc[source].values - timestamps.loc[target].values)
    edge_weight = np.power(weight_base, (-time_gap + 1))
    edge_attr = torch.FloatTensor(edge_weight)

    graph = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=torch.LongTensor([label]))

    return graph, os.path.basename(file)


def load_graphs(dataset, edge_lag=1, weight_base=1.05, cache=True, pool=None, no_direction=False):
    """Build (or load from cache) one graph per pickle file in ``dataset``.

    Args:
        dataset: Directory that is globbed for ``*.pkl`` files.
        edge_lag: Nodes ``k`` and ``k + i`` are connected for every
            ``i`` in ``1..edge_lag``.
        weight_base: Base of the exponential edge weight, forwarded to
            ``process``.
        cache: When true and the cache file exists, it is loaded and returned
            instead of rebuilding. The cache file is always (re)written after
            a rebuild.
        pool: Optional executor exposing ``submit``; when given, files are
            processed through it, otherwise sequentially with a progress bar.
        no_direction: Also add the reversed edge for every edge.

    Returns:
        ``(graphs, idx)``: the graphs and the matching source file base names.

    NOTE(review): the cache file name depends only on ``dataset``, so a cache
    built with other ``edge_lag`` / ``weight_base`` / ``no_direction`` values is
    returned unchanged. Delete the file or pass ``cache=False`` after changing
    them.
    """
    cache_file = f"cache/{os.path.basename(dataset)}.pkl"
    if cache and os.path.exists(cache_file):
        return load_pickle(cache_file)

    pickle_files = glob(dataset + "/*.pkl")

    idx = []
    graphs = []

    # Edge endpoints are generated for node ids 0..255.
    # assumes every sample has exactly 256 rows — TODO confirm
    source = []
    target = []
    for i in range(1, edge_lag + 1):
        source.extend(range(256 - i))
        target.extend(range(i, 256))

        if no_direction:
            source.extend(range(i, 256))
            target.extend(range(256 - i))

    if pool:
        # weight_base is forwarded explicitly; it used to be dropped, so
        # process() always ran with its own default.
        tasks = [pool.submit(process, file, source, target, weight_base) for file in pickle_files]

        wait(tasks, return_when=ALL_COMPLETED)

        for task in tasks:
            result = task.result()
            if result is not None:
                graph, file = result
                graphs.append(graph)
                idx.append(file)

    else:
        for file in tqdm(pickle_files, desc=f"load {dataset} data :::"):
            graph, file_name = process(file, source, target, weight_base)
            graphs.append(graph)
            idx.append(file_name)

    # The cache directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    save_pickle((graphs, idx), cache_file)

    return graphs, idx

In [5]:
# parser = argparse.ArgumentParser("UGformer", formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve')
# parser.add_argument("--train", default="../data/Train", help="")
# parser.add_argument("--test", default="../data/Test_A", help="Name of the dataset.")
# parser.add_argument("--learning_rate", default=0.001, type=float, help="Learning rate")
# parser.add_argument("--num_epochs", default=50, type=int, help="Number of training epochs")
# parser.add_argument("--model_name", default='PTC', help="")
# parser.add_argument("--dropout", default=0.5, type=float, help="")
# parser.add_argument("--num_hidden_layers", default=1, type=int, help="")
# parser.add_argument("--nhead", default=1, type=int, help="")
# parser.add_argument("--num_timesteps", default=1, type=int, help="Number of self-attention layers within each UGformer layer")
# parser.add_argument("--ff_hidden_size", default=256, type=int, help="The hidden size for the feedforward layer")
# parser.add_argument('--fold_idx', type=int, default=1, help='The fold index. 0-9.')
# parser.add_argument("--seed", type=int, default=3407, help="The random seed.")
# args = parser.parse_args()

# print(args)

In [6]:
# Run configuration shared by the cells below.
train = "data/Train"   # directory with the training pickles
test = "data/Test_A"   # directory with the test pickles
batch_size = 2048      # graphs per training batch
learning_rate = 3e-4
num_epochs =25
dropout = 0.5
seed = 3407
num_flods = 5          # number of cross-validation folds
# Run on the GPU with index 2 when CUDA is available, otherwise on the CPU.
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")

In [7]:
# Seed all random number generators before any data is loaded or model built.
seed_everything(seed=seed)

In [8]:
print("Loading data...")

num_classes = 2

# Graphs are built sequentially (or read from cache/); the process pool is disabled.
# with ProcessPoolExecutor(max_workers=64) as pool:
pool = None
# no_direction=True adds the reversed edge for every edge.
_train_graphs, _train_index = load_graphs(train, cache=True, pool=pool, no_direction=True)
# _train_graphs = []
# for i in range(1, 4):
#     _train_graphs.extend(load_pickle(f"cache/{os.path.basename(train)}_{i}.pkl"))
_test_graphs, test_index = load_graphs(test, cache=True, pool=pool, no_direction=True)

# Width of the node-feature matrix of the first training graph.
num_features = _train_graphs[0].x.shape[1]

print("Loading data... finished!")

Loading data...
Loading data... finished!


In [9]:
# Positions of the node-feature columns that are kept for the GNN.
# NOTE(review): `process` in this notebook never sets a `features` attribute
# on the graphs and creates `timestamp_diff_<lag>` columns rather than
# `timestamp_diff`; presumably the cached pickles come from another version
# of the preprocessing — confirm before rebuilding the cache.
cols = _train_graphs[0].features
cols_index= [cols.index(col) for col in ['volt', 'current', 'soc', 'max_single_volt', 'min_single_volt', 'max_temp', 'min_temp', 'timestamp', 'mileage', 'timestamp_diff', 'pwoer']]

In [10]:
# Rebuild every graph with only the selected feature columns; edges, edge
# weights and labels are reused. Attributes other than x / edge_index /
# edge_attr / y are not carried over to the new graphs.
train_graphs = [Data(x=graph.x[:, cols_index], edge_index=graph.edge_index, edge_attr=graph.edge_attr, y=graph.y) for graph in _train_graphs]
test_graphs = [Data(x=graph.x[:, cols_index], edge_index=graph.edge_index, edge_attr=graph.edge_attr, y=graph.y) for graph in _test_graphs]

In [37]:
import lightgbm as lgb
import xgboost as xgb

In [38]:
# Scratch check: length of graph-level attributes concatenated with the last node's features.
# NOTE(review): `graph_attr` is not set by `process` in this notebook —
# presumably it comes from the cached pickles; confirm.
np.append(_test_graphs[0].graph_attr, _test_graphs[0].x[-1, :]).shape

(39,)

In [None]:
def lgb_features(graph):
    """Return the feature vector of one graph for the boosted-tree models.

    The vector is the global ``model``'s pooled embedding
    (``model(batch, embedding=True)``) concatenated with the graph's
    ``graph_attr``.

    The embedding is computed in eval mode and without gradient tracking.
    Otherwise a model that is still in training mode would apply its input
    noise and dropout, making the features random. The model's previous mode
    is restored afterwards.

    Args:
        graph: A single PyG ``Data`` object carrying a ``graph_attr`` tensor.

    Returns:
        A 1-D NumPy array.
    """
    # return graph.graph_attr.numpy()
    # return np.append(graph.graph_attr.numpy(), graph.x[-1, :])
    single_loader = DataLoader(GraphaDataset([graph]), batch_size=1, shuffle=False)
    batch = next(iter(single_loader)).to(device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            embedding = model(batch, embedding=True).cpu()[0]
    finally:
        model.train(was_training)

    return torch.cat([embedding, batch.graph_attr.cpu().detach()]).detach().numpy()

In [None]:
# Names of the top-20 features (by gain) of every LightGBM fold, accumulated
# across calls to cv_model.
importance_features = []

def cv_model(clf, clf_name, cv=5, seed=2022):
    """Cross-validate a boosted-tree model on the graph feature vectors.

    Reads the module-level ``_train_graphs`` / ``_test_graphs`` and uses
    ``lgb_features`` and ``separate_data``.

    Args:
        clf: The ``lightgbm`` or ``xgboost`` module.
        clf_name: ``"lgb"`` or ``"xgb"``; selects the parameters and training call.
        cv: Number of stratified folds.
        seed: Random state of the fold split.

    Returns:
        Test-set scores averaged over the ``cv`` fold models.

    Raises:
        ValueError: If ``clf_name`` is neither ``"lgb"`` nor ``"xgb"``.
    """
    if clf_name not in ("lgb", "xgb"):
        raise ValueError(f"clf_name must be 'lgb' or 'xgb', got {clf_name!r}")

    test_x = np.array([lgb_features(graph) for graph in _test_graphs])
    # NOTE(review): the test set is standardised with its own statistics while
    # each fold below uses the training statistics — confirm this is intended.
    _mean = np.mean(test_x, axis=0)
    _std = np.std(test_x, axis=0)
    test_x = (test_x - _mean) / (_std + 1e-4)

    # Running average of the fold predictions on the test set.
    test = np.zeros(len(test_x))

    cv_scores = []

    for i in range(cv):
        print('************************************ {} ************************************'.format(str(i+1)))
        trin_graphs, vail_graphs = separate_data(_train_graphs, i, seed=seed, flod=cv)
        trn_x, trn_y = np.array([lgb_features(graph) for graph in trin_graphs]), np.array([graph.y.numpy()[0] for graph in trin_graphs])
        val_x, val_y = np.array([lgb_features(graph) for graph in vail_graphs]), np.array([graph.y.numpy()[0] for graph in vail_graphs])

        # Standardise with the training-fold statistics only.
        _mean = np.mean(trn_x, axis=0)
        _std = np.std(trn_x, axis=0)
        trn_x = (trn_x - _mean) / (_std + 1e-4)
        val_x = (val_x - _mean) / (_std + 1e-4)

        # One name per column of the matrix handed to the booster, so that
        # zip() with the importances below does not silently truncate.
        features = [f"x_{j}" for j in range(trn_x.shape[1])]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 4,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'lambda_l1': 0.2,
                'feature_fraction': 0.7,
                'bagging_fraction': 0.7,
                'bagging_freq': 10,
                'learning_rate': 0.001,
                'max_bin': 8,
                'min_data_in_leaf': 256,
                'seed': 3048,
                'n_jobs':-1,
                'verbose': -1,
            }

            # NOTE(review): verbose_eval / early_stopping_rounds are keyword
            # arguments of older LightGBM releases; newer ones expect callbacks.
            booster = clf.train(params, train_matrix, num_boost_round=2500, valid_sets=[train_matrix, valid_matrix],
                                categorical_feature=[], verbose_eval=100, early_stopping_rounds=200)
            val_pred = booster.predict(val_x, num_iteration=booster.best_iteration)
            test_pred = booster.predict(test_x, num_iteration=booster.best_iteration)

            top_features = sorted(zip(features, booster.feature_importance("gain")), key=lambda x: x[1], reverse=True)[:20]
            importance_features.extend([name for name, _ in top_features])

            print(top_features)

        else:
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)

            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'min_child_weight': 1.5,
                      'max_depth': 4,
                      'max_bin': 10,
                      'gamma': 1,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.0001,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': -1,
                      'verbosity': 1,
                      }

            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]

            booster = clf.train(params, train_matrix, num_boost_round=1500, evals=watchlist, verbose_eval=100, early_stopping_rounds=200)
            val_pred  = booster.predict(valid_matrix, ntree_limit=booster.best_ntree_limit)
            test_pred = booster.predict(test_matrix , ntree_limit=booster.best_ntree_limit)

        # Accumulate the fold average; the previous `test = ...` assignment
        # discarded the earlier folds.
        test += test_pred / cv
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)

    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))

    # Return the fold-averaged prediction rather than the last fold's.
    return test

In [None]:
def lgb_model():
    """Run ``cv_model`` with LightGBM and return its test-set scores."""
    return cv_model(lgb, "lgb")

def xgb_model():
    """Run ``cv_model`` with XGBoost and return its test-set scores."""
    return cv_model(xgb, "xgb")

In [None]:
# Cross-validated LightGBM run; `pred` holds the returned test-set scores.
pred = lgb_model()

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# Min-max scaling: returns the data rescaled to the [0, 1] interval
# Standard_data=MinMaxScaler().fit_transform(data)

In [None]:
# import itertools
# cols = ['volt','current','soc','max_single_volt','min_single_volt','max_temp', 'min_temp', 'resistance', 'pwoer', 'single_volt_range', 'temp_range']
# funcs = ["sum", "min", "max", "std", "median", "skew", "kurt", "mad"]
# columns = ["mileage"] + [{c: f} for f, c in itertools.product(funcs, cols)]
# print([columns[int(x[2:])] for x in sorted(set(importance_features))])

In [None]:
# Write the LightGBM submission: one (file_name, score) row per test file.
# Relies on `pred` being in the same order as `test_index`.
pd.DataFrame(list(zip(test_index, pred)), columns=["file_name", "score"]).to_csv("lgb_submit.csv", index=False)

In [None]:
# pd.read_csv("lgb_submit.csv")['score'].mean()

In [None]:
# pd.read_csv("results_cv1/flod_5_epoch_11submit.csv")["score"].mean()

In [None]:
# pred = 0.3 * MinMaxScaler().fit_transform(pd.read_csv("lgb_submit.csv")[['score']])[:, 0] + 0.7 * MinMaxScaler().fit_transform(pd.read_csv("results_cv1/flod_5_epoch_11submit.csv")[['score']])[:, 0]

In [11]:
class GraphaDataset(InMemoryDataset):
    """In-memory PyG dataset built directly from a list of ``Data`` graphs."""

    def __init__(self, data_list):
        super(GraphaDataset, self).__init__()
        # Collate the graphs into one big storage object plus slice offsets.
        collated, slice_map = self.collate(data_list)
        self.data = collated
        self.slices = slice_map

In [12]:
# Wrap the column-reduced graph lists into in-memory datasets.
dateset_train = GraphaDataset(train_graphs)
dateset_test = GraphaDataset(test_graphs)

In [13]:
# Training batches are shuffled; the test loader keeps the dataset order and
# yields one graph at a time, so predictions stay aligned with test_index.
loader_train = DataLoader(dateset_train, batch_size=batch_size, shuffle=True)
loader_test = DataLoader(dateset_test, batch_size=1, shuffle=False)

In [14]:
def model_summary(model):
    """Print a torchinfo summary of ``model`` using the first training batch.

    Nothing is printed when ``loader_train`` yields no batch.
    """
    first_batch = next(iter(loader_train), None)
    if first_batch is not None:
        print(summary(model, input_data=first_batch))

In [15]:
class GCNNet(torch.nn.Module):
    """Three-layer GCN graph classifier with mean-pool readout.

    Edge attributes are passed to every ``GCNConv`` as edge weights. The
    forward pass returns raw, unnormalised class scores.

    Args:
        hidden_channels: Width of every hidden layer.
        dataset: Object providing ``num_node_features`` and ``num_classes``.
            Defaults to the module-level ``dateset_train``, which keeps
            ``GCNNet(hidden_channels)`` working as before.
        dropout: Dropout probability applied before the classifier.
    """

    def __init__(self, hidden_channels, dataset=None, dropout=0.5):
        super(GCNNet, self).__init__()
        if dataset is None:
            # Backward-compatible fallback to the notebook-level training set.
            dataset = dateset_train
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)
        self.dropout = dropout

    def forward(self, data):
        """Return per-graph class scores for a (batched) ``Data`` object."""
        x, edge_index, batch, edge_attr = data.x, data.edge_index, data.batch, data.edge_attr
        # 1. Node embeddings
        x = self.conv1(x, edge_index, edge_attr)
        x = self.bn1(x)
        x = x.relu()

        x = self.conv2(x, edge_index, edge_attr)
        x = x.relu()

        x = self.conv3(x, edge_index, edge_attr)

        # 2. Readout layer
        x = global_mean_pool(x, batch)

        # 3. Classifier
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lin(x)

        return x

In [16]:
class GCNTest(torch.nn.Module):
    """GCN graph classifier with input noise, input dropout and mean pooling.

    Args:
        dataset: Object providing ``num_features`` and ``num_classes``.
        num_layers: Total number of ``GCNConv`` layers (first layer included).
        hidden: Width of every hidden layer.
    """

    def __init__(self, dataset, num_layers, hidden):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_features, hidden)
        self.bn1 = BatchNorm(hidden)
        self.convs = torch.nn.ModuleList(
            [GCNConv(hidden, hidden) for _ in range(num_layers - 1)]
        )
        # self.lin1 = Linear(hidden, hidden)
        self.lin2 = Linear(hidden, dataset.num_classes)

    def reset_parameters(self):
        """Re-initialise every learnable layer."""
        for layer in [self.bn1, self.conv1, *self.convs, self.lin2]:
            layer.reset_parameters()

    def forward(self, data, embedding=False):
        """Return per-graph log-probabilities, or the pooled embedding.

        With ``embedding=True`` the mean-pooled node representation is
        returned before dropout and the final linear layer.
        """
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch

        # Input perturbation is applied only in training mode.
        if self.training:
            x = self.add_noise(x)
        x = F.dropout(x, p=0.2, training=self.training)

        hidden_state = self.bn1(self.conv1(x, edge_index)).relu()
        for layer in self.convs:
            hidden_state = layer(hidden_state, edge_index).relu()
        pooled = global_mean_pool(hidden_state, batch)

        if embedding:
            return pooled

        pooled = F.dropout(pooled, p=0.5, training=self.training)
        logits = self.lin2(pooled)
        return F.log_softmax(logits, dim=-1)

    @staticmethod
    def add_noise(x, perturb_noise=0.025):
        """Add uniform noise from ``[-perturb_noise, perturb_noise)`` to ``x``."""
        noise = torch.empty_like(x)
        noise.uniform_(-perturb_noise, perturb_noise)
        return x + noise

    def __repr__(self):
        return type(self).__name__

In [17]:
class GCNWithJK(torch.nn.Module):
    """GCN graph classifier that merges all layer outputs via JumpingKnowledge.

    Args:
        dataset: Object providing ``num_features`` and ``num_classes``.
        num_layers: Total number of ``GCNConv`` layers (first layer included).
        hidden: Width of every hidden layer.
        mode: JumpingKnowledge aggregation mode; ``'cat'`` concatenates the
            layer outputs, so the first linear layer is widened accordingly.
    """

    def __init__(self, dataset, num_layers, hidden, mode='cat'):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_features, hidden)
        self.bn1 = BatchNorm(hidden)
        self.convs = torch.nn.ModuleList(
            [GCNConv(hidden, hidden) for _ in range(num_layers - 1)]
        )
        self.jump = JumpingKnowledge(mode)
        jk_width = num_layers * hidden if mode == 'cat' else hidden
        self.lin1 = Linear(jk_width, hidden)
        self.lin2 = Linear(hidden, dataset.num_classes)

    def reset_parameters(self):
        """Re-initialise every learnable layer."""
        self.conv1.reset_parameters()
        self.bn1.reset_parameters()
        for layer in self.convs:
            layer.reset_parameters()
        for layer in (self.jump, self.lin1, self.lin2):
            layer.reset_parameters()

    def forward(self, data):
        """Return per-graph log-probabilities for a (batched) ``Data`` object."""
        edge_index, batch = data.edge_index, data.batch

        first = F.relu(self.conv1(data.x, edge_index))
        # The un-normalised first-layer output is what JumpingKnowledge sees;
        # batch norm only feeds the following convolutions.
        layer_outputs = [first]
        x = self.bn1(first)
        for layer in self.convs:
            x = F.relu(layer(x, edge_index))
            layer_outputs.append(x)

        x = self.jump(layer_outputs)
        x = global_mean_pool(x, batch)
        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        return F.log_softmax(self.lin2(x), dim=-1)

    def __repr__(self):
        return type(self).__name__
In [18]:
from torch_geometric.nn import JumpingKnowledge, SAGEConv, global_mean_pool


class GraphSAGE(torch.nn.Module):
    """GraphSAGE graph classifier with noise injection and mean pooling.

    Args:
        dataset: Object providing ``num_features`` and ``num_classes``.
        num_layers: Total number of ``SAGEConv`` layers (first layer included).
        hidden: Width of every hidden layer.
    """

    def __init__(self, dataset, num_layers, hidden):
        super().__init__()
        self.conv1 = SAGEConv(dataset.num_features, hidden)
        self.bn1 = BatchNorm(hidden)
        self.convs = torch.nn.ModuleList()
        for i in range(num_layers - 1):
            self.convs.append(SAGEConv(hidden, hidden))
        # self.lin1 = Linear(hidden, hidden)
        self.lin2 = Linear(hidden, dataset.num_classes)

    def reset_parameters(self):
        """Re-initialise every learnable layer."""
        self.bn1.reset_parameters()
        self.conv1.reset_parameters()
        for conv in self.convs:
            conv.reset_parameters()
        # `lin1` is not created in __init__ (it is commented out there), so it
        # must not be reset here; doing so raised AttributeError.
        self.lin2.reset_parameters()

    def forward(self, data):
        """Return per-graph log-probabilities for a (batched) ``Data`` object."""
        x, edge_index, batch = data.x, data.edge_index, data.batch
        # Noise is added to the inputs and to the first hidden layer, but only
        # in training mode.
        if self.training:
            x = self.add_noise(x)
        x = F.relu(self.bn1(self.conv1(x, edge_index)))
        if self.training:
            x = self.add_noise(x)
        for conv in self.convs:
            x = F.relu(conv(x, edge_index))
        x = global_mean_pool(x, batch)
        # x = F.relu(self.lin1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin2(x)
        return F.log_softmax(x, dim=-1)

    @staticmethod
    def add_noise(x, perturb_noise=0.05):
        """Add uniform noise from ``[-perturb_noise, perturb_noise)`` to ``x``."""
        perturb = torch.empty_like(x).uniform_(-perturb_noise, perturb_noise)
        return x + perturb

    def __repr__(self):
        return self.__class__.__name__

In [19]:
# Model, optimiser, loss and EMA tracker used by train() / test() / inference().
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
# model = GCNNet(hidden_channels=128).to(device)
# 4 GCN layers with 256 hidden units.
model = GCNTest(dateset_train, 4, 256).to(device)
# model = GCNTest(dateset_train, 4, 128).to(device)
# NOTE(review): lr is hard-coded here; the `learning_rate` constant from the
# configuration cell is not used.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.02)
# GCNTest.forward already returns log-softmax output. CrossEntropyLoss applies
# log-softmax once more, which leaves log-probabilities unchanged.
criterion = torch.nn.CrossEntropyLoss()
ema = EMA(model.parameters(), decay=0.995)

In [20]:
# model_summary(model)

In [21]:
# t_total = len(dateset_trin) // batch_size * num_epochs
# # scheduler = get_default_cosine_schedule_with_warmup(optimizer, t_total, warmup_ratio=0.1)
# Halve the learning rate at each of the first three scheduler steps.
# scheduler.step() is called from the epoch loop, not once per batch.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1, 2, 3], gamma=0.5)

In [22]:
def train(loader):
    """Run one training epoch of the global ``model`` over ``loader``.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``.
    Gradients are clipped to a global norm of 5.0 before every optimiser
    step. The LR scheduler and the EMA tracker are not updated here.
    """
    model.train()

    progress = tqdm(loader, desc="training :::")
    for batch in progress:
        batch.to(device)
        optimizer.zero_grad()

        logits = model(batch)
        batch_loss = criterion(logits, batch.y)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

In [23]:
def test(loader, mode="train"):
    """Evaluate the global ``model`` on ``loader`` and return the ROC AUC.

    Args:
        loader: Iterable of labelled PyG batches.
        mode: Unused; kept so existing calls keep working.

    Returns:
        ROC AUC of the positive-class probability against the true labels.
    """
    model.eval()
    preps = []
    trues = []

    # ema.store(model.parameters())
    # ema.copy_to(model.parameters())

    with torch.no_grad():
        for data in tqdm(loader, desc="evaluate :::"):
            data.to(device)
            trues.extend(data.y.cpu().numpy().tolist())
            out = model(data)
            # Normalise over the class axis explicitly; relying on the
            # implicit dimension is deprecated.
            pred = F.softmax(out, dim=-1)[:, 1]
            preps.extend(pred.cpu().numpy().tolist())

    # ema.restore(model.parameters())

    return roc_auc_score(trues, preps)

In [24]:
def inference(loader, submit="results/submit.csv", download=True, use_ema=False):
    """Score every batch of ``loader`` with the notebook-level ``model``.

    Args:
        loader: Iterable of batches exposing ``.to(device)``. It is iterated
            directly (the previous version ignored it and always read the
            global ``loader_test``).
        submit: CSV path for the predictions. When falsy, nothing is written
            and the scores are returned instead.
        download: When a file is written, also print an scp command hint.
        use_ema: Temporarily swap the EMA weights into ``model`` while scoring.

    Returns:
        ``None`` when ``submit`` is truthy, otherwise the list of class-1 scores.

    Note:
        The CSV pairs the scores with the global ``test_index`` via ``zip``, so
        ``loader`` must yield the test samples in that order; a length mismatch
        is silently truncated.
    """
    model.eval()
    results = []

    if use_ema:
        ema.store(model.parameters())
        ema.copy_to(model.parameters())

    try:
        with torch.no_grad():
            for data in tqdm(loader, desc="inference :::"):
                data = data.to(device)
                out = model(data)
                # Explicit class axis; the implicit-dim form of softmax is deprecated.
                results.extend(F.softmax(out, dim=-1)[:, 1].cpu().numpy().tolist())
    finally:
        # Always put the raw weights back, even if the forward pass failed.
        if use_ema:
            ema.restore(model.parameters())

    if not submit:
        return results

    pd.DataFrame(list(zip(test_index, results)), columns=["file_name", "score"]).to_csv(submit, index=False)

    if download:
        print(f"sshpass -p '' scp vloong/{submit} results/")

In [25]:
# Train for at most 14 epochs, writing test predictions from epoch 10 onwards.
# `count` tracks whether the one-off LR decay has fired. It must live outside
# the loop: resetting it every epoch (as before) made the `count == 0` guard
# always true, so the scheduler stepped on every epoch above the threshold.
count = 0
for epoch in range(14):
    train(loader_train)
    train_auc = test(loader_train, mode="train")
    if epoch >= 10:
        inference(loader_test, submit=f"results/{epoch}_result.csv", download=False)
        # inference(loader_test, submit=f"results/{epoch}_ema_result.csv", download=False, use_ema=True)

    print(f'epoch: {epoch:03d}, train auc: {train_auc:.4f}')

    # Decay the learning rate once, the first time the train AUC passes 0.93.
    if count == 0 and train_auc > 0.93:
        scheduler.step()
        count += 1

    # Stop early once the train AUC is high enough.
    if train_auc > 0.935:
        break

training :::: 100%|██████████| 14/14 [00:08<00:00,  1.62it/s]
evaluate :::: 100%|██████████| 14/14 [00:02<00:00,  5.73it/s]


epoch: 000, train auc: 0.8150


training :::: 100%|██████████| 14/14 [00:06<00:00,  2.11it/s]
evaluate :::: 100%|██████████| 14/14 [00:02<00:00,  5.67it/s]


epoch: 001, train auc: 0.8628


training :::: 100%|██████████| 14/14 [00:06<00:00,  2.12it/s]
evaluate :::: 100%|██████████| 14/14 [00:02<00:00,  5.79it/s]


epoch: 002, train auc: 0.8761


training :::: 100%|██████████| 14/14 [00:06<00:00,  2.13it/s]
evaluate :::: 100%|██████████| 14/14 [00:02<00:00,  5.71it/s]


epoch: 003, train auc: 0.8807


training :::: 100%|██████████| 14/14 [00:06<00:00,  2.15it/s]
evaluate :::: 100%|██████████| 14/14 [00:02<00:00,  5.92it/s]


epoch: 004, train auc: 0.8828


training :::: 100%|██████████| 14/14 [00:06<00:00,  2.18it/s]
evaluate :::: 100%|██████████| 14/14 [00:02<00:00,  5.77it/s]


epoch: 005, train auc: 0.8855


training :::: 100%|██████████| 14/14 [00:06<00:00,  2.14it/s]
evaluate :::: 100%|██████████| 14/14 [00:02<00:00,  5.91it/s]


epoch: 006, train auc: 0.8879


training :::: 100%|██████████| 14/14 [00:06<00:00,  2.17it/s]
evaluate :::: 100%|██████████| 14/14 [00:02<00:00,  5.95it/s]


epoch: 007, train auc: 0.8914


training :::: 100%|██████████| 14/14 [00:06<00:00,  2.18it/s]
evaluate :::: 100%|██████████| 14/14 [00:02<00:00,  5.95it/s]


epoch: 008, train auc: 0.8972


training :::: 100%|██████████| 14/14 [00:05<00:00,  2.43it/s]
evaluate :::: 100%|██████████| 14/14 [00:03<00:00,  4.57it/s]


epoch: 009, train auc: 0.8978


training :::: 100%|██████████| 14/14 [00:05<00:00,  2.44it/s]
evaluate :::: 100%|██████████| 14/14 [00:03<00:00,  4.51it/s]
inference :::: 100%|██████████| 6234/6234 [00:23<00:00, 269.75it/s]


epoch: 010, train auc: 0.9096


training :::: 100%|██████████| 14/14 [00:06<00:00,  2.10it/s]
evaluate :::: 100%|██████████| 14/14 [00:02<00:00,  5.94it/s]
inference :::: 100%|██████████| 6234/6234 [00:22<00:00, 274.26it/s]


epoch: 011, train auc: 0.9181


training :::: 100%|██████████| 14/14 [00:05<00:00,  2.39it/s]
evaluate :::: 100%|██████████| 14/14 [00:03<00:00,  4.18it/s]
inference :::: 100%|██████████| 6234/6234 [00:22<00:00, 271.17it/s]


epoch: 012, train auc: 0.9228


training :::: 100%|██████████| 14/14 [00:05<00:00,  2.36it/s]
evaluate :::: 100%|██████████| 14/14 [00:03<00:00,  4.15it/s]
inference :::: 100%|██████████| 6234/6234 [00:22<00:00, 274.57it/s]

epoch: 013, train auc: 0.9248





In [28]:
# Write the current model's test predictions to results/submit.csv
# (download defaults to True, so an scp command hint is printed as well).
inference(loader_test, submit="results/submit.csv")

inference :::: 100%|██████████| 6234/6234 [00:22<00:00, 274.80it/s]


sshpass -p '<REDACTED>' scp share@113.31.111.86:/data/lpzhang/vloong/results/submit.csv /Users/lubberit/Downloads/results/


In [None]:
# K-fold cross-validation: train a fresh model per fold and write that fold's
# test predictions to results_cv/.
# Create the output directory once, without the check-then-create race.
os.makedirs("results_cv", exist_ok=True)

for i in range(num_flods):
    trin_graphs, vail_graphs = separate_data(train_graphs, i, seed=seed, flod=num_flods)

    # Build the datasets for this fold.
    dateset_trin = GraphaDataset(trin_graphs)
    dateset_vail = GraphaDataset(vail_graphs)

    # Build the data loaders for this fold.
    loader_trin = DataLoader(dateset_trin, batch_size=batch_size, shuffle=True)
    loader_vail = DataLoader(dateset_vail, batch_size=batch_size, shuffle=True)

    # Build the model and optimizer. train()/test()/inference() read these
    # notebook-level names, so they are rebound for every fold.
    # NOTE(review): GCNTest receives the full `dateset_train`, not this fold's
    # `dateset_trin` -- presumably only dataset metadata is read; confirm.
    # model = GCNNet(hidden_channels=128).to(device)
    model = GCNTest(dateset_train, 4, 256).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.01)
    criterion = torch.nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1, 2, 3], gamma=0.4)

    # Tracks whether the one-off LR decay has fired for this fold. Previously it
    # was reset every epoch and never incremented, so the guard below was dead.
    count = 0

    # Train the model.
    for epoch in range(num_epochs):
        logger.info(f'{"/" * 50} flod : {i + 1}   epoch : {epoch + 1} {"/" * 50}')

        train(loader_trin)

        # `mode` is a label, not an epoch number; test() ignores it.
        train_auc = test(loader_trin, mode="train")
        test_auc = test(loader_vail, mode="valid")

        logger.info(f'flod {i + 1}, epoch: {epoch:03d}, train auc: {train_auc:.4f}, val auc: {test_auc:.4f}')

        # Decay the learning rate once, the first time validation AUC passes 0.925.
        if count == 0 and test_auc > 0.925:
            scheduler.step()
            count += 1
        # Stop this fold early once validation AUC is high enough.
        if test_auc > 0.935:
            break

    # `epoch` holds the last epoch that ran for this fold.
    inference(loader_test, submit=f"results_cv/flod_{i+1}_epoch_{epoch + 1}submit.csv", download=False)

    # Release this fold's objects before the next fold allocates new ones.
    # After the last fold, `model` and `optimizer` no longer exist at notebook level.
    del model, optimizer, loader_trin, loader_vail, dateset_trin, dateset_vail, trin_graphs, vail_graphs
    gc.collect()

In [None]:
# Six per-model scores, normalised into blend weights that sum to 1.
# The blending cell uses indices 0-4 for the five fold files and the last
# entry for results_cv1/submit.csv.
scores = np.array([0.8576, 0.8488, 0.8588, 0.8564, 0.8595, 0.8561])
scores /= scores.sum()

In [None]:
# Weighted blend of the stored submissions: start from results_cv1/submit.csv
# scaled by the last weight, then add each fold file scaled by its own weight
# (fold k uses scores[k - 1]). Rows are combined by DataFrame index.
fold_submit_files = [
    'results_cv1/flod_1_epoch_32submit.csv',
    'results_cv1/flod_2_epoch_13submit.csv',
    'results_cv1/flod_3_epoch_11submit.csv',
    'results_cv1/flod_4_epoch_10submit.csv',
    'results_cv1/flod_5_epoch_11submit.csv',
]

final_submit = pd.read_csv('results_cv1/submit.csv')
final_submit["score"] = final_submit["score"] * scores[-1]

for idx, f in enumerate(fold_submit_files):
    submit = pd.read_csv(f)
    weighted_fold = submit["score"] * scores[idx]
    final_submit["score"] = final_submit["score"] + weighted_fold

In [None]:
# Add the predictions from results/submit.csv to the blend, unweighted
# (the blend built above already uses weights from `scores`).
# NOTE(review): this cell updates final_submit from itself, so running it a
# second time adds the same scores again -- rebuild final_submit before re-running.
submit = pd.read_csv("results/submit.csv")
final_submit["score"] = final_submit["score"] + submit["score"]

In [None]:
# Persist the blended predictions, without the DataFrame index column.
final_submit.to_csv("submit_cv.csv", index=False)

In [None]:
# Optimal ROC threshold search
def Find_Optimal_Cutoff(TPR, FPR, threshold):
    """Pick the threshold that maximises ``TPR - FPR``.

    Args:
        TPR: Array of true-positive rates, one per candidate threshold.
        FPR: Array of false-positive rates, aligned with ``TPR``.
        threshold: Candidate thresholds, aligned with ``TPR`` and ``FPR``.

    Returns:
        ``(optimal_threshold, [fpr, tpr])`` at the maximising position. The
        chosen threshold is also printed.
    """
    best = np.argmax(TPR - FPR)
    cutoff = threshold[best]
    print(cutoff)
    return cutoff, [FPR[best], TPR[best]]

def acu_curve(index_name, y, prob):
    """Plot the ROC curve for ``prob`` against ``y`` and mark the best cutoff.

    The cutoff is the one returned by ``Find_Optimal_Cutoff``. The figure is
    drawn on the current pyplot figure and shown with ``plt.show()``. The
    seaborn font scale and the matplotlib font family are changed globally
    as a side effect.

    Args:
        index_name: Text appended to the plot title (must be a ``str``).
        y: Ground-truth binary labels.
        prob: Predicted scores for the positive class.
    """
    sns.set(font_scale=1.2)
    plt.rc('font', family='Times New Roman')

    fpr, tpr, thresholds = roc_curve(y, prob)
    roc_auc = auc(fpr, tpr)
    lw = 2

    # ROC curve plus the chance diagonal.
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

    optimal_th, optimal_point = Find_Optimal_Cutoff(TPR=tpr, FPR=fpr, threshold=thresholds)
    print(optimal_point)

    # Mark the chosen operating point and annotate it twice: its rounded
    # (FPR, TPR) coordinates to the lower left, the threshold to the upper right.
    plt.plot(optimal_point[0], optimal_point[1], marker='o', color='r')
    plt.text(
        optimal_point[0],
        optimal_point[1],
        (
            float('%.2f' % optimal_point[0]),
            float('%.2f' % optimal_point[1])
        ),
        ha='right',
        va='top',
        fontsize=12
    )
    plt.text(optimal_point[0], optimal_point[1], f'Threshold:{optimal_th:.2f}', fontsize=12)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.title('ROC analysis of ' + index_name, fontsize=14)
    plt.legend(loc="lower right", fontsize=12)
    plt.show()

In [None]:
def search_threshold(loader, model):
    """Score ``loader`` with ``model``, plot the ROC curve, and return the AUC.

    Args:
        loader: Iterable of batches exposing ``.to(device)`` and labels in
            ``.y``; the labels are required to build the ROC curve.
        model: Model to evaluate; it is switched to eval mode.

    Returns:
        ``roc_auc_score`` of the labels against the score of class index 1.
    """
    model.eval()
    preps = []
    trues = []

    with torch.no_grad():
        for data in tqdm(loader, desc="evaluate :::"):
            data = data.to(device)
            trues.extend(data.y.cpu().numpy().tolist())
            out = model(data)
            # Explicit class axis; the implicit-dim form of softmax is deprecated.
            pred = F.softmax(out, dim=-1)[:, 1]
            preps.extend(pred.cpu().numpy().tolist())

    # Draws the ROC curve and prints the chosen cutoff.
    acu_curve("vloong", trues, preps)

    return roc_auc_score(trues, preps)

In [None]:
# search_threshold requires the model as its second argument; the previous
# call omitted it and raised TypeError.
# NOTE(review): search_threshold reads `data.y`, so the loader must carry
# labels -- confirm that loader_test does, otherwise pass a labelled loader.
search_threshold(loader_test, model)