In [1]:
# %%
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.functional as f
from torch.nn import Parameter
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence 
import itertools
from sklearn.metrics import f1_score


from prettytable import PrettyTable
import datetime

import os
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import time

import warnings
warnings.filterwarnings("ignore")  # Ignore all warnings
import math
import time
import pandas as pd
import numpy as np
import progressbar
from joblib import Parallel, delayed
import multiprocessing
from scipy.stats import skew, kurtosis
from numpy.lib.stride_tricks import as_strided as stride
from geopy.distance import geodesic
import _pickle as cPickle
import argparse
import random
from utils import *
from sklearn.model_selection import train_test_split
print('lmy/7_model_gps_road.py')

lmy/7_model_gps_road.py


In [2]:
filenames =get_filenames()

# %%
args = get_parser()
args.transformer = 1
args.gpu = 0
args.STAT_NET_input_road = np.array([ 8, 11, 20, 21, 27,  6])#train_data.stat_data.loc[:,get_road_name()].max().values+1#最大值加1

args.device = 'cuda:{}'.format(args.gpu) if (args.gpu>=0) & torch.cuda.is_available() else 'cpu'
#args.now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f==') #时间
args.model_save = '/DATA2/lvxiaoling/limengyuan/SHL2023/save/models2023-06-19 23_59_29_759129==.pth'

args.CONV_SENSORS_input_dim = 12
args.CONV_GEO_num_feat = 6
args.STAT_NET_input_sensors = 195
args.STAT_NET_input_geo = 26


In [3]:

class  dset(Dataset):
    def __init__(self, data_sensor,data_gps,stat_data):
        super(dset, self).__init__()
        self.data_sensor = data_sensor
        self.data_gps = data_gps
        self.stat_data = stat_data
        self.name_dict = {name:i for i,name in enumerate(stat_data)}
        self.stat_sensors = [self.name_dict[i] for i in get_sensors_name()]
        self.stat_gps = [self.name_dict[i] for i in get_gps_name()]
        self.stat_road = [self.name_dict[i] for i in get_road_name()]
    def __len__(self):
        return len(self.data_gps)
    def  __getitem__(self, i):
        stat_value = self.stat_data[self.stat_data['idx'].isin(self.data_gps[i][args.L1-1:,-3])].values
        if len(stat_value)!= len(self.data_gps[i][args.L1-1:,-3]):
            print(i)
        return torch.tensor(self.data_sensor[i]),torch.tensor(self.data_gps[i]),torch.tensor(stat_value[:,self.stat_sensors]),torch.tensor(stat_value[:,self.stat_gps]),torch.tensor(stat_value[:,self.stat_road]).long()
    #返回的10分钟内的数据
    #data_sensor,data_gps,label,idx

# %% [markdown]
# ## 加载数据

# %%
def collate_batch(batch):
    data_sensor_list,data_gps_list,lengths,stat_list1,stat_list2,stat_list3 =  [], [],[],[], [],[]
    batch.sort(key=lambda x: len(x[0]), reverse=True)#按照长度的大小进行排序
    for (data_sensor_,data_gps_,stat_list1_i,stat_list2_i,stat_list3_i) in batch:



        data_sensor_list.append(data_sensor_)
        data_gps_list.append(data_gps_)
        
        lengths.append(data_gps_.shape[0])
        stat_list1.append(stat_list1_i)
        stat_list2.append(stat_list2_i)
        stat_list3.append(stat_list3_i)

    data_sensor_list = pad_sequence(data_sensor_list, padding_value=args.pad_value, batch_first=True)#进行填充，每个batch中的句子需要有相同的长度
    data_gps_list = pad_sequence(data_gps_list, padding_value=args.pad_value, batch_first=True)#进行填充，每个batch中的句子需要有相同的长度
    
    stat_list1 = pad_sequence(stat_list1, padding_value=args.pad_value, batch_first=True)#进行填充，每个batch中的句子需要有相同的长度
    stat_list2 = pad_sequence(stat_list2, padding_value=args.pad_value, batch_first=True)#进行填充，每个batch中的句子需要有相同的长度
    stat_list3 = pad_sequence(stat_list3, padding_value=args.pad_value, batch_first=True)#进行填充，每个batch中的句子需要有相同的长度
    roads = []
    for i in range(6):
        road_i = stat_list3[:,:,[i]]
        road_i[road_i==args.pad_value] = args.STAT_NET_input_road[i]
        roads.append(road_i)
    stat_list3 = torch.cat(roads,dim=-1)

        #stat_list3[:,:,i][stat_list3[:,:,i]==args.pad_value] = args.STAT_NET_input_road[i]

    label =  torch.tensor(data_gps_list)[:,:,-4].long()
    label[label>0] = label[label>0]-1
    #data_sensor,data_gps,stat_sensors,stat_gps,stat_road, label,idx,trip_idx, lengths in train_dataloader
    return data_sensor_list.float(),data_gps_list[:,:,:-4].float(),\
        stat_list1.float(),stat_list2.float(),stat_list3.long(),\
        label,torch.tensor(data_gps_list)[:,:,-3].long(),torch.tensor(data_gps_list)[:,:,-2].long(), lengths
        #'label','idx','trajectory_id','label_idx'\



In [4]:

# %%
class CONV_SENSORS(nn.Module):
    def __init__(self,input_dim=3, num_filter = 64,kernel_size = 500, stride=100):
        super(CONV_SENSORS,self).__init__()
        self.conv = nn.Conv1d(input_dim, num_filter, kernel_size, stride = stride)

    def forward(self,data_sensor):# traj:batch_size*seq_len*17
        # 地理卷积
        data_sensor = data_sensor.permute(0,2,1)#batch_size,seq_len,num
        data_sensor = F.elu(self.conv(data_sensor)).permute(0,2,1)# L*seq_len'*num_filter
        return data_sensor


# %%
class CONV_GEO(nn.Module):
    def __init__(self,kernel_size=5,num_filter=64,num_feat = 6):
        super(CONV_GEO,self).__init__()
        self.process_coords = nn.Linear(2,16)
        self.conv1 = nn.Conv1d(16,num_filter,kernel_size)
        self.conv2 = nn.Conv1d(num_feat,num_filter,kernel_size)

    def forward(self,data_gps):# traj:batch_size*seq_len*17
        # 地理卷积
        
        lngs_lats = data_gps[:,:,:2] #batch_size*seq_len*2
        locs1 = torch.tanh(self.process_coords(lngs_lats))# batch_size*seq_len*16
        locs1 =locs1.permute(0,2,1)# batch_size*16*seq_len
        conv_locs1 = F.elu(self.conv1(locs1)).permute(0,2,1)# L*seq_len'*num_filter
        
        # 特征卷积
        features = data_gps[:,:,2:]# batch_size*seq_len*14
        locs2 = features.permute(0,2,1)# batch_size*14*seq_len
        conv_locs2 = F.elu(self.conv2(locs2)).permute(0,2,1)# L*seq_len'*num_filter
        
        return torch.concat([conv_locs1,conv_locs2],dim=2)#地理、特征、时间
        ## L*seq_len'*num_filter

# %%
class STAT_NET(nn.Module):
    def __init__(self,args=args,
                 input_road = [3,4,5,6,7,8],road_embedding =16, 
                 input_sensors=125,sensors_embedding = 64,
                 input_geo=125,geo_embedding = 64):
        super(STAT_NET, self).__init__()
        self.pad_value = args.pad_value
        self.args = args
        self.input_road = input_road
        self.emb = nn.ModuleList([nn.Embedding(i+1,road_embedding,padding_idx=i)    for  i in input_road])
        self.fc_sensors = nn.Linear(input_sensors,sensors_embedding)
        self.fc_geo = nn.Linear(input_geo,geo_embedding)
        #embedding层
        

    def forward(self, stat_sensors,stat_gps,stat_road):
        stat_sensors = self.fc_sensors(stat_sensors)
        stat_gps = self.fc_geo(stat_gps)

        roads = []
        for i,layer in enumerate(self.emb):
            road_i = stat_road[:,:,i]#batch_size,sqe_len,feat_num
            #road_i[road_i==args.pad_value] = self.input_road[i]
            road_i = layer(road_i)
            roads.append(road_i)
        roads = torch.cat(roads,dim=-1)
        return roads

# %%
class BILSTM(torch.nn.Module):
    def __init__(self,args=args,input_dim=64+128, d_model = 128,out_dim=8):
        super(BILSTM, self).__init__()
        self.pad_value = args.pad_value
        self.args = args
        #embedding层
        self.lstm = nn.LSTM(input_dim,d_model//2, num_layers = args.lstm_layer, bidirectional = True,
                                dropout=args.dropout, batch_first=True)
        self.projection = nn.Linear(d_model, out_dim)

    def forward(self, x, lengths):
        lengths = torch.tensor(lengths)-(self.args.L1-1)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)#lstm层
        # hidden = [n layers *2, batch size, hidden dim]最后一个step的hidden
        # cell = [n layers * 2, batch size, hidden dim]最终一个step的cell
        x, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        x = self.projection(x)
        return x
if args.transformer:
    class BILSTM(torch.nn.Module):
        def __init__(self,args=args,input_dim=64+128, d_model = 128,out_dim=8):
            super(BILSTM, self).__init__()
            self.pad_value = args.pad_value
            self.args = args
            #embedding层
            self.lstm = nn.LSTM(input_dim,d_model//2, num_layers = args.lstm_layer, bidirectional = True,
                                    dropout=args.dropout, batch_first=True)
            self.encoder_layer = nn.TransformerEncoderLayer(d_model * args.nheads, args.nheads, args.ff_size, args.dropout, batch_first=True)
            self.encoder = nn.TransformerEncoder(self.encoder_layer, args.n_layers)
            self.fc = nn.Linear(d_model * args.nheads, d_model)

            self.projection = nn.Linear(d_model, out_dim)

        def forward(self, x, lengths):
            lengths = torch.tensor(lengths)-(self.args.L1-1)
            packed_embedded = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)        
            packed_output, (hidden, cell) = self.lstm(packed_embedded)#lstm层
            # hidden = [n layers *2, batch size, hidden dim]最后一个step的hidden
            # cell = [n layers * 2, batch size, hidden dim]最终一个step的cell
            x, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
            mask = self.get_mask(lengths,x.shape[0])
            x = self.encoder(x.repeat(1, 1, self.args.nheads), src_key_padding_mask=mask)
            x = self.fc(x)
            x = self.projection(x)


            return x
        def get_mask(self,sentence_lengths,batch_len):
            # 计算最大句子长度
            max_length = max(sentence_lengths)

            # 创建一个零填充的张量，大小为(batch_size, max_length)
            src_key_padding_mask = torch.zeros((batch_len, max_length), dtype=torch.bool)

            # 对每个句子进行遍历，根据句子长度进行填充
            for i, length in enumerate(sentence_lengths):
                src_key_padding_mask[i, :length] = 1

# %%
if False:
    data_sensor,data_gps,stat_sensors,stat_gps,stat_road, label,idx,trip_idx, length = next(iter(train_loader))

    m1 = CONV_SENSORS(input_dim=args.CONV_SENSORS_input_dim, num_filter =args.num_filter)
    r1 = m1(data_sensor)
    m2 = CONV_GEO(num_feat = args.CONV_GEO_num_feat,num_filter =args.num_filter)
    r2 = m2(data_gps)
    m3 = STAT_NET(  input_road = args.STAT_NET_input_road,road_embedding =args.STAT_NET_road_embedding, 
                    input_sensors=args.STAT_NET_input_sensors,sensors_embedding = args.STAT_NET_sensors_embedding,
                    input_geo=args.STAT_NET_input_geo,geo_embedding = args.STAT_NET_geo_embedding)
    r3 = m3(stat_sensors,stat_gps,stat_road)
    r = torch.cat([r1,r2,r3],dim=-1)

    m3 = BILSTM(input_dim=3*args.num_filter +args.STAT_NET_geo_embedding+len( args.STAT_NET_input_road)*args.STAT_NET_road_embedding,args=args)
    r4 = m3(r,length)


def evaluate(model_s,model_g,model_stat,model,loss_fn,train_dataloader):
    model_s.eval()
    model_g.eval()
    model_stat.eval()
    model.eval()

    
    losses = 0
    correct = 0
    with torch.no_grad():
        for data_sensor,data_gps,stat_sensors,stat_gps,stat_road, label,idx,trip_idx, lengths in train_dataloader:
            data_sensor = data_sensor.to(args.device)
            data_gps = data_gps.to(args.device)
            stat_sensors,stat_gps,stat_road = stat_sensors.to(args.device),stat_gps.to(args.device),stat_road.to(args.device)
            label = label.to(args.device)[:,(args.L1-1):]

            output_s = model_s(data_sensor)
            output_g = model_g(data_gps)
            output_stat = model_stat(stat_sensors,stat_gps,stat_road)
            out = model(torch.cat([output_s,output_g,output_stat],dim=-1),lengths)

            
            loss = loss_fn( out.reshape(-1, out.shape[-1]), label.reshape(-1))
           
            losses += loss.item() 
            
            mask = label ==args.pad_value
            pred = torch.argmax(out, dim=2)
            correct +=  (pred ==label ).masked_fill(mask,0).sum().item() / (~mask).sum()
            
    model_s.train()
    model_g.train()
    model_stat.train()
    model.train()
    return losses / len(train_dataloader),correct/len(train_dataloader)

# %%
def weight_init(m):
    if isinstance(m, nn.Linear):
        
        nn.init.xavier_normal_(m.weight)
        nn.init.constant_(m.bias, 0)
    # 也可以判断是否为conv2d，使用相应的初始化方式 
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
     # 是否为批归一化层
    elif isinstance(m, nn.BatchNorm2d):
        nn.init.constant_(m.weight, 1)
        nn.init.constant_(m.bias, 0)

# %%


# %%
model_s = CONV_SENSORS(input_dim=args.CONV_SENSORS_input_dim, num_filter =args.num_filter).to(args.device).apply(weight_init)
model_g = CONV_GEO(num_feat = args.CONV_GEO_num_feat,num_filter =args.num_filter).to(args.device).apply(weight_init)
model_stat = STAT_NET(input_road = args.STAT_NET_input_road,road_embedding =args.STAT_NET_road_embedding, 
                 input_sensors=args.STAT_NET_input_sensors,sensors_embedding = args.STAT_NET_sensors_embedding,
                 input_geo=args.STAT_NET_input_geo,geo_embedding = args.STAT_NET_geo_embedding).to(args.device).apply(weight_init)
model = BILSTM(input_dim=3*args.num_filter+len( args.STAT_NET_input_road)*args.STAT_NET_road_embedding,args=args).to(args.device).apply(weight_init)


if args.weightloss:
    weight = torch.tensor([1.0048, 1.0022, 2.9012, 1.0453, 0.7708, 0.8641, 0.7827, 1.0274]).to(args.device)
    criterion  = nn.CrossEntropyLoss(ignore_index=args.pad_value,weight=weight)
else:
    criterion  = nn.CrossEntropyLoss(ignore_index=args.pad_value)


#args.model_save = '/DATA1/EvolveGCN/limengyuan/datas/models/models2023-06-16 09:35:17:728874==.pth'
checkpoint = torch.load( args.model_save)
model_s.load_state_dict(checkpoint['model_s'])
model_g.load_state_dict(checkpoint['model_g'])
model_stat.load_state_dict(checkpoint['model_stat'])
model.load_state_dict(checkpoint['model'])
print(args.model_save)

/DATA2/lvxiaoling/limengyuan/SHL2023/save/models2023-06-19 23_59_29_759129==.pth


In [5]:

def prediction_val(model_s,model_g,model_stat,model,loss_fn,train_dataloader):
    model_s.eval()
    model_g.eval()
    model_stat.eval()
    model.eval()


    losses = 0
    correct = 0
    preds = []
    idxs = []
    labels = []
    outs= []
    trip_idxs = []
    with torch.no_grad():
        for data_sensor,data_gps,stat_sensors,stat_gps,stat_road, label,idx,trip_idx, lengths in train_dataloader:
            data_sensor = data_sensor.to(args.device)
            data_gps = data_gps.to(args.device)
            stat_sensors,stat_gps,stat_road = stat_sensors.to(args.device),stat_gps.to(args.device),stat_road.to(args.device)
            label = label.to(args.device)[:,(args.L1-1):]

            output_s = model_s(data_sensor)
            output_g = model_g(data_gps)
            output_stat = model_stat(stat_sensors,stat_gps,stat_road)
            out = model(torch.cat([output_s,output_g,output_stat],dim=-1),lengths)

            loss = loss_fn( out.reshape(-1, out.shape[-1]), label.reshape(-1))

            losses += loss.item() 

            mask = label ==args.pad_value
            pred = torch.argmax(out, dim=2)
            correct +=  (pred ==label ).masked_fill(mask,0).sum().item() / (~mask).sum()

            preds.append(pred)
            idxs.append(idx[:,(args.L1-1):])
            labels.append(label)
            outs.append(F.softmax(out,dim=-1))
            trip_idxs.append(trip_idx)

    model_s.train()
    model_g.train()
    model.train()
    model_stat.train()
    return torch.cat(preds),torch.cat(idxs),torch.cat(trip_idxs),torch.cat(labels),torch.cat(outs),losses / len(train_dataloader),correct/len(train_dataloader)



In [7]:

def val_dataset(loc='Hand'):


    train_hand = pd.read_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hand/raw_data.pkl')
    stat_data = pd.read_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hand/data.pkl').fillna(0).drop_duplicates(keep='first')   

    data_x = np.arange(len(train_hand[0]))#trajectory id
    data_y = [int(i[0][0,-4]) for i in train_hand[1]]
    val_sensor = [j for i in data_x for j in train_hand[0][i]]
    val_gps = [j for i in data_x for j in train_hand[1][i]]
    val_data = dset(val_sensor,val_gps,stat_data)


    test_loader1 = torch.utils.data.DataLoader(
        dataset=val_data,
        batch_size=args.batch_size,
        shuffle=False,
        collate_fn=collate_batch,
        num_workers = args.num_workers
        )
    return test_loader1
val_loader = val_dataset(loc='Hand')


test_loss,test_acc  = evaluate(model_s,model_g,model_stat,model, criterion, val_loader)
print('test loss',test_loss,'test acc',test_acc.item())


preds,idxs,trip_idxs,labels,outs,_,_ = prediction_val(model_s,model_g,model_stat,model, criterion, val_loader)


idxs_1 = idxs.reshape(-1,1).cpu().numpy()
labels_1 = labels.reshape(-1,1).cpu().numpy()
preds_1 = preds.reshape(-1,1).cpu().numpy()
outs_1 = outs.reshape(-1,outs.shape[2]).cpu().numpy()

results = pd.DataFrame(np.concatenate((idxs_1,labels_1,preds_1,outs_1), axis =1))
results.columns = ['idx','label_true','preds'] + [i for i in range(8)]


#results_new = results.groupby('id')[[i for i in range(8)]].mean()
results_new = results.groupby('idx')[['label_true']].min().reset_index().astype(int)
results_new['preds'] = results.groupby('idx')[['preds']].agg(lambda x: x.value_counts().index[0]).values.astype(int)
results_new[['out_{}'.format(i) for i in range(8)]] = results.groupby('idx')[[i for i in range(8)]].mean().values
results_new['out_pred'] = results_new[['out_{}'.format(i) for i in range(8)]].idxmax(axis=1).str.replace('out_','').astype(int).values


results_new = results_new[results_new['idx']>=0]
print('acc_pred',(results_new['label_true']==results_new['preds']).mean())
print('acc_outs',(results_new['label_true']==results_new['out_pred']).mean())


y_true = results_new['label_true']
y_pred = results_new['out_pred']
label_sort = ['Still','Walk','Run','Bike','Car','Bus','Train', 'Subway']
print(classification_report(y_true, y_pred, target_names = label_sort))


test loss 0.7691177040338516 test acc 0.8454403281211853
acc_pred 0.8579065271855568
acc_outs 0.8616122211133831
              precision    recall  f1-score   support

       Still       0.93      0.94      0.93     29676
        Walk       0.95      0.95      0.95     25879
         Run       0.98      1.00      0.99      2754
        Bike       0.81      0.82      0.82     12001
         Car       1.00      0.40      0.57     20438
         Bus       0.38      0.88      0.53      9138
       Train       1.00      0.97      0.99     21763
      Subway       0.95      0.98      0.96     21644

    accuracy                           0.86    143293
   macro avg       0.88      0.87      0.84    143293
weighted avg       0.91      0.86      0.86    143293



In [8]:
f1 = f1_score(y_true, y_pred, average='macro')
print(f1)

0.8429934120478508


In [9]:
results_new.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put2/val_dl2.pkl')


In [None]:
a = pd.read_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hand/data_m.pkl')

In [8]:
m = results_new[results_new['idx'].isin(a['idx'])]
m

Unnamed: 0,idx,label_true,preds,out_0,out_1,out_2,out_3,out_4,out_5,out_6,out_7,out_pred
972,98151959,2,2,6.812557e-10,0.000016,0.999497,4.834754e-04,3.061111e-06,1.079840e-08,6.627248e-09,7.514964e-11,2
973,98152059,2,2,1.117635e-09,0.000022,0.999392,5.822952e-04,3.624860e-06,1.461321e-08,7.341805e-09,1.216021e-10,2
974,98152159,2,2,1.428232e-09,0.000035,0.999253,7.084942e-04,4.146106e-06,1.660578e-08,7.957381e-09,1.642354e-10,2
975,98152259,2,2,1.738339e-09,0.000047,0.999216,7.321134e-04,4.364371e-06,1.686876e-08,7.958896e-09,1.999083e-10,2
976,98152359,2,2,1.895418e-09,0.000059,0.999245,6.913923e-04,4.295467e-06,1.617337e-08,6.955311e-09,2.326166e-10,2
...,...,...,...,...,...,...,...,...,...,...,...,...
130332,111147513,0,1,1.754528e-01,0.821100,0.000005,3.233377e-07,3.206798e-11,4.019823e-08,3.440872e-03,3.214815e-07,1
130333,111147613,0,1,1.537386e-01,0.842450,0.000006,3.394254e-07,3.330260e-11,3.256310e-08,3.804275e-03,2.659943e-07,1
130334,111147713,0,1,1.680085e-01,0.826683,0.000007,3.873066e-07,4.646695e-11,2.810959e-08,5.300879e-03,2.992055e-07,1
130335,111147813,0,1,2.387138e-01,0.753047,0.000007,3.962014e-07,6.450293e-11,3.752051e-08,8.231108e-03,3.968620e-07,1


In [9]:
print(classification_report(m['label_true'].values, m['out_pred'].values, target_names = label_sort))

              precision    recall  f1-score   support

       Still       0.96      0.94      0.95      3661
        Walk       0.93      1.00      0.96      3743
         Run       1.00      1.00      1.00       310
        Bike       1.00      1.00      1.00      1840
         Car       1.00      0.89      0.94      6710
         Bus       0.30      1.00      0.47       236
       Train       1.00      1.00      1.00      7370
      Subway       1.00      1.00      1.00      2167

    accuracy                           0.96     26037
   macro avg       0.90      0.98      0.91     26037
weighted avg       0.98      0.96      0.97     26037



# 加载原始的训练集

In [10]:
def val_dataset(loc='Hand'):


    train_hand = cPickle.load(open('/DATA2/lvxiaoling/limengyuan/SHL2023/train/{}/raw_data_m.pkl'.format(loc),'rb'))
    stat_data = cPickle.load(open('/DATA2/lvxiaoling/limengyuan/SHL2023/train/{}/data_m.pkl'.format(loc),'rb')).fillna(0).drop_duplicates(keep='first')   

    data_x = np.arange(len(train_hand[0]))#trajectory id
    data_y = [int(i[0][0,-4]) for i in train_hand[1]]
    val_sensor = [j for i in data_x for j in train_hand[0][i]]
    val_gps = [j for i in data_x for j in train_hand[1][i]]
    val_data = dset(val_sensor,val_gps,stat_data)


    test_loader1 = torch.utils.data.DataLoader(
        dataset=val_data,
        batch_size=args.batch_size,
        shuffle=False,
        collate_fn=collate_batch,
        num_workers = args.num_workers
        )
    return test_loader1
val_loader = val_dataset(loc='Hand')


test_loss,test_acc  = evaluate(model_s,model_g,model_stat,model, criterion, val_loader)
print('test loss',test_loss,'test acc',test_acc.item())

test loss 0.06333301598226235 test acc 0.991351842880249


In [11]:
preds,idxs,trip_idxs,labels,outs,_,_ = prediction_val(model_s,model_g,model_stat,model, criterion, val_loader)

idxs_1 = idxs.reshape(-1,1).cpu().numpy()
labels_1 = labels.reshape(-1,1).cpu().numpy()
preds_1 = preds.reshape(-1,1).cpu().numpy()
outs_1 = outs.reshape(-1,outs.shape[2]).cpu().numpy()

results = pd.DataFrame(np.concatenate((idxs_1,labels_1,preds_1,outs_1), axis =1))
results.columns = ['idx','label_true','preds'] + [i for i in range(8)]


#results_new = results.groupby('id')[[i for i in range(8)]].mean()
results_new = results.groupby('idx')[['label_true']].min().reset_index().astype(int)
results_new[['out_{}'.format(i) for i in range(8)]] = results.groupby('idx')[[i for i in range(8)]].mean().values
results_new['out_pred'] = results_new[['out_{}'.format(i) for i in range(8)]].idxmax(axis=1).str.replace('out_','').astype(int).values


results_new = results_new[results_new['idx']>=0]
print('acc_outs',(results_new['label_true']==results_new['out_pred']).mean())
results_new.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put/train_dl2.pkl')


y_true = results_new['label_true']
y_pred = results_new['out_pred']
label_sort = ['Still','Walk','Run','Bike','Car','Bus','Train', 'Subway']
print(classification_report(y_true, y_pred, target_names = label_sort))

acc_outs 0.991591031742138
              precision    recall  f1-score   support

       Still       1.00      0.98      0.99    120940
        Walk       0.99      1.00      1.00    117175
         Run       1.00      0.95      0.98     35805
        Bike       0.98      1.00      0.99    110793
         Car       1.00      1.00      1.00    146300
         Bus       1.00      1.00      1.00    121841
       Train       0.99      0.99      0.99    147554
      Subway       0.98      0.99      0.99    116827

    accuracy                           0.99    917235
   macro avg       0.99      0.99      0.99    917235
weighted avg       0.99      0.99      0.99    917235



# 加载原始验证集

In [12]:
def val_dataset(loc='Hand'):


    train_hand = cPickle.load(open('/DATA2/lvxiaoling/limengyuan/SHL2023/valid/{}/raw_data_m.pkl'.format(loc),'rb'))
    stat_data = cPickle.load(open('/DATA2/lvxiaoling/limengyuan/SHL2023/valid/{}/data_m.pkl'.format(loc),'rb')).fillna(0).drop_duplicates(keep='first')   

    data_x = np.arange(len(train_hand[0]))#trajectory id
    data_y = [int(i[0][0,-4]) for i in train_hand[1]]
    val_sensor = [j for i in data_x for j in train_hand[0][i]]
    val_gps = [j for i in data_x for j in train_hand[1][i]]
    val_data = dset(val_sensor,val_gps,stat_data)


    test_loader1 = torch.utils.data.DataLoader(
        dataset=val_data,
        batch_size=args.batch_size,
        shuffle=False,
        collate_fn=collate_batch,
        num_workers = args.num_workers
        )
    return test_loader1
val_loader = val_dataset(loc='Hand')


test_loss,test_acc  = evaluate(model_s,model_g,model_stat,model, criterion, val_loader)
print('test loss',test_loss,'test acc',test_acc.item())

test loss 0.6045635119080544 test acc 0.9281517863273621


In [13]:
preds,idxs,trip_idxs,labels,outs,_,_ = prediction_val(model_s,model_g,model_stat,model, criterion, val_loader)

idxs_1 = idxs.reshape(-1,1).cpu().numpy()
labels_1 = labels.reshape(-1,1).cpu().numpy()
preds_1 = preds.reshape(-1,1).cpu().numpy()
outs_1 = outs.reshape(-1,outs.shape[2]).cpu().numpy()

results = pd.DataFrame(np.concatenate((idxs_1,labels_1,preds_1,outs_1), axis =1))
results.columns = ['idx','label_true','preds'] + [i for i in range(8)]


#results_new = results.groupby('id')[[i for i in range(8)]].mean()
results_new = results.groupby('idx')[['label_true']].min().reset_index().astype(int)
results_new[['out_{}'.format(i) for i in range(8)]] = results.groupby('idx')[[i for i in range(8)]].mean().values
results_new['out_pred'] = results_new[['out_{}'.format(i) for i in range(8)]].idxmax(axis=1).str.replace('out_','').astype(int).values


results_new = results_new[results_new['idx']>=0]
print('acc_outs',(results_new['label_true']==results_new['out_pred']).mean())
results_new.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put/val_dl2.pkl')


y_true = results_new['label_true']
y_pred = results_new['out_pred']
label_sort = ['Still','Walk','Run','Bike','Car','Bus','Train', 'Subway']
print(classification_report(y_true, y_pred, target_names = label_sort))

acc_outs 0.9332412002222998
              precision    recall  f1-score   support

       Still       0.99      0.96      0.98     30422
        Walk       0.95      1.00      0.97     30804
         Run       1.00      0.86      0.93      9000
        Bike       0.99      0.99      0.99     18040
         Car       0.81      0.94      0.87     32418
         Bus       0.92      0.71      0.81     28475
       Train       1.00      0.97      0.98     30079
      Subway       0.90      1.00      0.95     24091

    accuracy                           0.93    203329
   macro avg       0.95      0.93      0.93    203329
weighted avg       0.94      0.93      0.93    203329



# 加载原始测试集

In [14]:
def val_dataset(loc='Hand'):


    train_hand = cPickle.load(open('/DATA2/lvxiaoling/limengyuan/SHL2023/test/raw_data.pkl'.format(loc),'rb'))
    stat_data = cPickle.load(open('/DATA2/lvxiaoling/limengyuan/SHL2023/test/data.pkl'.format(loc),'rb')).fillna(0).drop_duplicates(keep='first')   

    data_x = np.arange(len(train_hand[0]))#trajectory id
    data_y = [int(i[0][0,-4]) for i in train_hand[1]]
    val_sensor = [j for i in data_x for j in train_hand[0][i]]
    val_gps = [j for i in data_x for j in train_hand[1][i]]
    val_data = dset(val_sensor,val_gps,stat_data)


    test_loader1 = torch.utils.data.DataLoader(
        dataset=val_data,
        batch_size=args.batch_size,
        shuffle=False,
        collate_fn=collate_batch,
        num_workers = args.num_workers
        )
    return test_loader1
val_loader = val_dataset(loc='Hand')


test_loss,test_acc  = evaluate(model_s,model_g,model_stat,model, criterion, val_loader)
print('test loss',test_loss,'test acc',test_acc.item())

test loss 0.0 test acc nan


In [15]:
preds,idxs,trip_idxs,labels,outs,_,_ = prediction_val(model_s,model_g,model_stat,model, criterion, val_loader)

idxs_1 = idxs.reshape(-1,1).cpu().numpy()
labels_1 = labels.reshape(-1,1).cpu().numpy()
preds_1 = preds.reshape(-1,1).cpu().numpy()
outs_1 = outs.reshape(-1,outs.shape[2]).cpu().numpy()

results = pd.DataFrame(np.concatenate((idxs_1,labels_1,preds_1,outs_1), axis =1))
results.columns = ['idx','label_true','preds'] + [i for i in range(8)]


#results_new = results.groupby('id')[[i for i in range(8)]].mean()
results_new = results.groupby('idx')[['label_true']].min().reset_index().astype(int)
results_new[['out_{}'.format(i) for i in range(8)]] = results.groupby('idx')[[i for i in range(8)]].mean().values
results_new['out_pred'] = results_new[['out_{}'.format(i) for i in range(8)]].idxmax(axis=1).str.replace('out_','').astype(int).values


results_new = results_new[results_new['idx']>=0]

results_new.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put/test_dl2.pkl')

print('acc_outs',(results_new['label_true']==results_new['out_pred']).mean())


acc_outs 0.0


In [21]:
m = 400
results_new['out_pred'].values[m:m+200]

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5])

In [26]:
results_new['out_pred']

1         3
2         3
3         3
4         3
5         3
         ..
462266    1
462267    1
462268    1
462269    1
462270    1
Name: out_pred, Length: 462270, dtype: int64