In [None]:
import os
import gc
import re
import math
import toad
import json
import torch
import pickle
import random
import argparse
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
import matplotlib.pyplot as plt
from toad.metrics import KS, AUC
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")


def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and configures cuDNN for
    reproducible runs.

    Args:
        seed: Seed value applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one; this is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from run
    # to run, which works against the deterministic flag set above.
    torch.backends.cudnn.benchmark = False

def load_pickle(file):
    """Return the object deserialized from the pickle file at ``file``."""
    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    with open(file, mode="rb") as stream:
        payload = pickle.load(stream)
    return payload

def save_pickle(obj, file):
    """Pickle ``obj`` and write it to ``file``, replacing any existing content."""
    serialized = pickle.dumps(obj)
    with open(file, mode="wb") as stream:
        stream.write(serialized)

In [None]:
import torch
from torch.nn import Linear, Dropout
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset
from torch_geometric.loader import DataLoader

from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import Linear, HeteroConv, GraphConv, GAT, RGCNConv, BatchNorm, GCN
from torch_geometric.nn.models import JumpingKnowledge

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score, auc

In [None]:
# Run configuration (first variant).
# NOTE(review): the cell that follows assigns batch_size, pseudo_ratio and
# num_epoches again, so when the notebook runs top to bottom the values set
# here (2048 / 13) are replaced before they are used.
batch_size = 2048
pseudo_ratio = 0.4
num_epoches = 13
seed_everything(seed=3407)
pd.set_option('display.max_rows', 256)

In [None]:
# Run configuration (second variant). These assignments replace the values
# from the preceding cell when the notebook runs top to bottom.
batch_size = 256     # graphs per mini-batch for the training/validation loaders
pseudo_ratio = 0.4   # fraction of the test set considered for pseudo-labelling
num_epoches = 6      # number of passes of the training loop
seed_everything(seed=3407)
pd.set_option('display.max_rows', 256)

In [None]:
def read_pickle(file, f, pseudo=0):
    """Load one sample from ``data/{file}/{f}`` as a labelled DataFrame.

    Args:
        file: Sub-directory of ``data/`` holding the sample.
        f: File name inside that directory; only names ending in ".pkl" are read.
        pseudo: Label used when the sample's metadata has no truthy "label" entry.

    Returns:
        A DataFrame with the eight signal columns plus "mileage", "label" and
        "file_name" (each constant over all rows), or ``None`` when ``f`` does
        not end with ".pkl".
    """
    if not f.endswith(".pkl"):
        return None

    records, metadata = load_pickle(f"data/{file}/{f}")
    signal_columns = [
        'volt', 'current', 'soc',
        'max_single_volt', 'min_single_volt',
        'max_temp', 'min_temp',
        'timestamp',
    ]
    frame = pd.DataFrame(records, columns=signal_columns)
    # Earlier experiments (power, a weighted consistency score and
    # thermal-risk range features) are disabled; only raw signals are kept.

    frame["mileage"] = metadata.get("mileage")

    # The label string "10" maps to 1 and any other truthy label to 0;
    # samples without a label fall back to the caller-supplied pseudo label.
    raw_label = metadata.get("label")
    if raw_label:
        frame["label"] = 1 if raw_label == "10" else 0
    else:
        frame["label"] = pseudo

    frame["file_name"] = f
    return frame

In [None]:
# Load every sample of both splits.
# * Only ".pkl" names are passed on: read_pickle returns None for anything
#   else, and a None entry would break the label/file-name lookups below.
# * Names are sorted because os.listdir returns entries in arbitrary order,
#   which would otherwise make sample order differ between machines.
train_files = sorted(name for name in os.listdir("data/Train/") if name.endswith(".pkl"))
test_files = sorted(name for name in os.listdir("data/Test_A/") if name.endswith(".pkl"))
train = [read_pickle("Train", f) for f in tqdm(train_files, desc="load train :::")]
test = [read_pickle("Test_A", f) for f in tqdm(test_files, desc="load  test :::")]

In [None]:
# File name of each test sample, in the same order as `test`. Every row of a
# sample carries the same "file_name", so the row labelled 0 is read.
test_index = [data["file_name"][0] for data in test]

In [None]:
# How many test samples to pseudo-label per class: the train-set share of each
# class, scaled by pseudo_ratio and the test-set size, truncated to an int.
n_train = len(train)
n_positive = sum(d["label"][0] for d in train)
bad_index = int(n_positive / n_train * pseudo_ratio * len(test))
good_index = int((n_train - n_positive) / n_train * pseudo_ratio * len(test))

In [None]:
# Pseudo-labelling: rank the test files by the score stored in result.csv
# (ascending) and add the highest-scoring ones to the training set as label 1.
# NOTE: result.csv is written by the final prediction cell of this notebook,
# so it must already exist from a previous run.
pred_score = pd.read_csv("result.csv").sort_values("score").reset_index(drop=True)
ranked_files = pred_score["file_name"].tolist()
good_filename = ranked_files[:good_index]
# `lst[-0:]` is the whole list, so the zero case has to be handled explicitly.
bad_filename = ranked_files[-bad_index:] if bad_index > 0 else []
pseudo = [read_pickle("Test_A", f, pseudo=1) for f in tqdm(bad_filename, desc="load pseudo  bad :::")]
# Low-score ("good") pseudo-labels are currently disabled:
# pseudo += [read_pickle("Test_A", f, pseudo=0) for f in tqdm(good_filename, desc="load pseudo good :::")]
# NOTE: this extends `train` in place; re-running the cell appends the pseudo
# samples a second time.
train.extend(pseudo)

In [None]:
def data_split(full_list, ratio, shuffle=False):
    """Split a list into two consecutive parts.

    Args:
        full_list: Items to split. The list itself is never modified.
        ratio: Fraction of the items placed in the first part; the first part
            holds ``int(len(full_list) * ratio)`` items.
        shuffle: When True, the items are shuffled with ``random.shuffle``
            before being split.

    Returns:
        A ``(sublist_1, sublist_2)`` tuple. When the input is empty or the
        first part would be empty, ``([], full_list)`` is returned, with
        ``full_list`` being the caller's own list object.
    """
    n_total = len(full_list)
    offset = int(n_total * ratio)

    if n_total == 0 or offset < 1:
        return [], full_list

    # Work on a copy so shuffling does not reorder the caller's list.
    items = list(full_list)
    if shuffle:
        random.shuffle(items)

    return items[:offset], items[offset:]

In [None]:
# Edge endpoints shared by every graph: node k is connected to node k + i for
# each hop distance i in the range below (currently only i = 1), over 256 nodes.
source, target = [], []
for i in range(1, 2):
    source.extend(range(256 - i))
    target.extend(range(i, 256))

In [None]:
def process_graph(datasets, mode="train"):
    """Convert a list of per-sample DataFrames into graph ``Data`` objects.

    For every frame the timestamps are rebased to start at 0, a log-scaled
    ``timestamp_diff`` column is added, and all columns except "file_name"
    and "label" become the node feature matrix. Edges come from the
    module-level ``source`` / ``target`` lists.

    Args:
        datasets: Iterable of DataFrames as produced by ``read_pickle``.
        mode: Used in the progress-bar text; when equal to "test" no ``y``
            label is attached to the graphs.

    Returns:
        A list with one ``Data`` object per input frame.
    """
    # rations = set()
    graphs = []
    for row_data in tqdm(datasets, desc=f"process {mode} graph :::"):
        # Work on a copy so the caller's frame keeps its original timestamps.
        data = row_data.copy()
        # rations.update(data["timestamp"].diff().fillna(0).astype(int).unique())
        data["timestamp"] = (data["timestamp"] - data["timestamp"].min()).astype(int)
        # Base-60 log of (gap + 1) between consecutive rows; the first row gets gap 0.
        # NOTE(review): math.log raises for gaps <= -1, so this presumably
        # relies on non-decreasing timestamps; confirm.
        data["timestamp_diff"] = data["timestamp"].diff().fillna(0).apply(lambda x: math.log(x+1, 60))
        x = torch.FloatTensor(data.drop(columns=[col for col in ["file_name", "label"] if col in data.columns]).values.tolist())
        edge_index = torch.LongTensor([source, target]) # ([[i for i in range(255)], [i + 1 for i in range(255)]])
        # Edge attribute = timestamp difference between the two endpoint rows.
        # The lookups are label-based (.loc); assumes every frame's index
        # contains all labels listed in source/target. TODO confirm
        edge_weight = data["timestamp"].loc[target].values - data["timestamp"].loc[source].values
        edge_attr = torch.FloatTensor(edge_weight)
        # edge_attr = torch.FloatTensor(data["timestamp"].diff().fillna(0).apply(lambda x: math.log(x+1, 60)).tolist()[1:])

        if mode != "test":
            # The label is constant within a sample, so the row labelled 0 is read.
            y = torch.LongTensor([data["label"][0]])
            
            # Disabled experiment: oversample positive samples to augment the data.
            # if data["label"][0] == 1:
            #     for i in range(1):
            #         _data = row_data.sample(frac=0.95).sort_index().reset_index(drop=True)
            #         _data = pd.concat([row_data[:6], _data, row_data[-(len(row_data) - len(_data) - 6):]]).reset_index(drop=True)
            #         _data["timestamp"] = (_data["timestamp"] - _data["timestamp"].min()).astype(int)
            #         _data["timestamp_diff"] = _data["timestamp"].diff().fillna(0).apply(lambda x: math.log(x+1, 60) if x > 0 else 0)
            #         _x = torch.FloatTensor(_data.drop(columns=[col for col in ["file_name", "label"] if col in _data.columns]).values.tolist())
            #         _edge_index = torch.LongTensor([[i for i in range(len(_data))], [i + 1 for i in range(len(_data))]])
            #         _edge_attr = torch.FloatTensor(_data["timestamp"].diff().fillna(0).apply(lambda x: math.log(x+1, 60) if x > 0 else 0).tolist()[1:])
            #         _grpha = Data(x=_x, edge_index=_edge_index, edge_attr=_edge_attr, y=y)
            #         graphs.append(_grpha)

            grpha = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
        else:
            grpha = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
            
        graphs.append(grpha)
    # print(rations, len(rations))
    return graphs

In [None]:
# Build the graph lists: training graphs carry a label `y`, test graphs do not.
graph_train = process_graph(train, mode="train")
graph_test = process_graph(test, mode="test")

In [None]:
# Random 70/30 split of the training graphs into a fitting part (graph_trin)
# and a hold-out part (graph_vail).
graph_trin, graph_vail = data_split(graph_train, ratio=0.7, shuffle=True)

In [None]:
class GraphaDataset(InMemoryDataset):
    """In-memory dataset built directly from a list of graph objects."""

    def __init__(self, data_list):
        # The base class is initialised without any arguments.
        super().__init__()
        # Store the collated form of `data_list` in the `data` / `slices` attributes.
        self.data, self.slices = self.collate(data_list)

In [None]:
# Wrap each graph list in a dataset: the full training set, its 70/30
# fitting / hold-out parts, and the test set.
dateset_train = GraphaDataset(graph_train)
dateset_trin = GraphaDataset(graph_trin)
dateset_vail = GraphaDataset(graph_vail)
dateset_test = GraphaDataset(graph_test)

In [None]:
# Mini-batch loaders. The test loader yields one graph at a time, unshuffled,
# so its predictions come out in the same order as the test dataset.
loader_train = DataLoader(dateset_train, batch_size=batch_size, shuffle=True)
loader_trin = DataLoader(dateset_trin, batch_size=batch_size, shuffle=True)
loader_vail = DataLoader(dateset_vail, batch_size=batch_size, shuffle=True)
loader_test = DataLoader(dateset_test, batch_size=1, shuffle=False)

In [None]:
class GCNTest(torch.nn.Module):
    """Graph-level classifier: stacked GCN layers, mean pooling, linear head.

    Args:
        dataset: Object exposing ``num_node_features`` and ``num_classes``.
        num_layers: Total number of GCN layers (the input layer plus
            ``num_layers - 1`` hidden layers).
        hidden: Channel width of every GCN layer.
    """

    def __init__(self, dataset, num_layers, hidden):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, hidden)
        self.bn1 = BatchNorm(hidden)
        self.convs = torch.nn.ModuleList(
            [GCNConv(hidden, hidden) for _ in range(num_layers - 1)]
        )
        # `Linear` is whichever import of that name ran last in the notebook.
        self.lin2 = Linear(hidden, dataset.num_classes)

    def reset_parameters(self):
        """Re-initialise the parameters of every sub-module."""
        for module in (self.bn1, self.conv1, *self.convs, self.lin2):
            module.reset_parameters()

    def forward(self, data):
        """Return per-graph log-probabilities for a batch of graphs.

        Reads ``x``, ``edge_index`` and ``batch`` from ``data``; edge
        attributes are not used.
        """
        feats = data.x
        edges = data.edge_index
        graph_ids = data.batch

        # Input regularisation: uniform noise (training only), then dropout.
        if self.training:
            feats = self.add_noise(feats)
        feats = F.dropout(feats, p=0.2, training=self.training)

        # Batch normalisation is applied after the first convolution only.
        feats = self.conv1(feats, edges)
        feats = self.bn1(feats)
        feats = F.relu(feats)
        for layer in self.convs:
            feats = F.relu(layer(feats, edges))

        # One vector per graph, then the classification head.
        pooled = global_mean_pool(feats, graph_ids)
        pooled = F.dropout(pooled, p=0.5, training=self.training)
        logits = self.lin2(pooled)
        return F.log_softmax(logits, dim=-1)

    @staticmethod
    def add_noise(x, perturb_noise=0.05):
        """Return ``x`` plus noise drawn uniformly from ``[-perturb_noise, perturb_noise]``."""
        noise = torch.empty_like(x)
        noise.uniform_(-perturb_noise, perturb_noise)
        return x + noise

    def __repr__(self):
        return type(self).__name__

In [None]:
# Device selection: keep the original preference for GPU index 3, but fall
# back to GPU 0 when the host exposes fewer than four GPUs (requesting a
# non-existent "cuda:3" fails), and to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    gpu_index = 3 if torch.cuda.device_count() > 3 else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

# 4 GCN layers with 256 hidden channels.
model = GCNTest(dateset_train, 4, 256).to(device)
# NOTE: OneCycleLR below takes over the learning rate (it starts at
# max_lr / div_factor), so lr=3e-4 is not the rate actually used.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.025)
# Upper bound on the number of optimiser steps: batches per epoch (rounded up,
# plus one when the dataset size is an exact multiple) times the epoch count.
t_total = (len(dateset_train) // batch_size + 1) * num_epoches
# Disabled alternatives:
# scheduler = get_default_cosine_schedule_with_warmup(optimizer, t_total, warmup_ratio=0.1)
# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1, 2, 3], gamma=0.5)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, final_div_factor=1000, div_factor=10, max_lr=5e-4, total_steps=t_total, pct_start=0.4)
# Class weights: index 0 gets 0.4, index 1 gets 0.6.
criterion = torch.nn.CrossEntropyLoss(weight=torch.FloatTensor([0.4, 0.6]).to(device))

In [None]:
def train(loader):
    """Run one optimisation pass over ``loader``.

    Uses the module-level ``model``, ``optimizer``, ``scheduler``,
    ``criterion`` and ``device``. The scheduler is stepped once per batch.

    NOTE: defining this function rebinds the name ``train``, which earlier
    cells use for the list of training DataFrames.

    Args:
        loader: Iterable of graph batches exposing ``.to(device)`` and ``.y``.
    """
    model.train()
    progress = tqdm(loader, desc="pyg model training :::")

    for batch in progress:
        batch.to(device)
        optimizer.zero_grad()

        loss = criterion(model(batch), batch.y)
        loss.backward()

        # Clip the global gradient norm to 10 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0)

        optimizer.step()
        scheduler.step()

        progress.set_postfix(loss=f'{loss.cpu().item():.4f}')

In [None]:
def test(loader, epoch, mode="train"):
    """Return the ROC AUC of the module-level ``model`` over ``loader``.

    Args:
        loader: Iterable of labelled graph batches.
        epoch: Not used by the body; kept so existing callers keep working.
        mode: Text shown in the progress-bar description.

    Returns:
        ROC AUC computed from the true labels and the predicted probability
        of class 1.
    """
    model.eval()
    preps = []
    trues = []
    with torch.no_grad():
        for data in tqdm(loader, desc=f"pyg {mode} evaluate :::"):
            trues.extend(data.y.cpu().numpy().tolist())
            data.to(device)
            out = model(data)
            # Pass the class dimension explicitly: calling softmax without
            # `dim` relies on deprecated implicit dimension selection.
            pred = F.softmax(out, dim=-1)[:, 1]
            preps.extend(pred.cpu().numpy().tolist())

    return roc_auc_score(trues, preps)

In [None]:
def inference(loader):
    """Return the predicted class-1 probability for every graph in ``loader``.

    Uses the module-level ``model`` and ``device``.

    Args:
        loader: Iterable of graph batches (labels are not required).

    Returns:
        A flat list of floats, in the order the loader yields the graphs.
    """
    model.eval()

    results = []
    with torch.no_grad():
        for data in tqdm(loader, desc="pyg model inference :::"):
            data.to(device)
            out = model(data)
            # Pass the class dimension explicitly: calling softmax without
            # `dim` relies on deprecated implicit dimension selection.
            results.extend(F.softmax(out, dim=-1)[:, 1].cpu().numpy().tolist())

    return results

In [None]:
# Main training loop: fit on the full training set (including pseudo-labelled
# samples) and report the AUC measured on that same training set.
for epoch in range(num_epoches):
    train(loader_train)
    train_auc = test(loader_train, epoch, mode="train")

    # From the fourth epoch on, save a snapshot of the test predictions per epoch.
    if epoch >= 3:
        pd.DataFrame(list(zip(test_index, inference(loader_test))), columns=["file_name", "score"]).to_csv(f"{epoch}_result.csv", index=False)
    
    print(f'epoch: {epoch:03d}, train auc: {train_auc:.4f}')
    
    # count = 0
    # if count == 0 and train_auc > 0.925:
    #     scheduler.step()
    #     count += 1

# Disabled variant: fit on the 70% part and validate on the 30% hold-out.
#     # train(loader_trin)
#     # train_auc = test(loader_trin, epoch, mode="train")
#     # test_auc = test(loader_vail, epoch, mode=" vail")
#     # print(f'epoch: {epoch:03d}, train auc: {train_auc:.4f}, val auc: {test_auc:.4f}')

In [None]:
# for epoch in range(num_epoches):
#     # model.train()
#     train(loader_trin)

#     # model.eval()
#     train_auc = test(loader_trin, epoch)
#     test_auc = test(loader_vail, epoch)
#     print(f'epoch: {epoch:03d}, train auc: {train_auc:.4f}, val auc: {test_auc:.4f}')

In [None]:
# Final class-1 probabilities for the test set, in loader (= test dataset) order.
pred = inference(loader_test)

In [None]:
# Persist the final scores. NOTE: the pseudo-labelling cell near the top of
# the notebook reads this same "result.csv" file.
pd.DataFrame(list(zip(test_index, pred)), columns=["file_name", "score"]).to_csv("result.csv", index=False)

In [None]:
# Test files ranked by ascending score, built from the fresh predictions.
pred_score = pd.DataFrame(
    list(zip(test_index, pred)), columns=["file_name", "score"]
).sort_values("score").reset_index(drop=True)
# NOTE(review): the next statement immediately replaces the frame built above
# with the contents of "submit.csv", so the fresh predictions are not used
# from here on. Confirm which source is intended.
pred_score = pd.read_csv("submit.csv").sort_values("score").reset_index(drop=True)

In [None]:
# Lowest-scoring files are candidate "good" pseudo-labels, highest-scoring
# ones candidate "bad" pseudo-labels.
ranked_files = pred_score["file_name"].tolist()
good_filename = ranked_files[:good_index]
# `lst[-0:]` is the whole list, so the zero case has to be handled explicitly.
bad_filename = ranked_files[-bad_index:] if bad_index > 0 else []