# 数据读取
读取csv数据，初次得到的数据结构如下：
```
data=
[
    {
        'date',
        'time',
        'data':[
            第road_id行，存放road_id的数据：[status,speed]
        ]
        不同时间片的data长度可能不同（取决于读取该文件时已出现过的道路数量）
    }
]
```

In [4]:
import csv
import os
# Collect the available time stamps for every day, keyed by the 'MM-DD' part
# of the file name (names look like 'ts_2021-MM-DD HH:MM:SS.csv').
file_path = 'data'
file_list = sorted(os.listdir(file_path))
date_time = {}
for fname in file_list:
    # Skip hidden entries such as '.DS_Store'.
    if fname.startswith('.'):
        continue

    date, time = fname[8:13], fname[14:22]

    # Only days from 10-02 through 10-31 are used; the zero-padded strings
    # compare correctly in lexicographic order.
    if '10-02' <= date <= '10-31':
        date_time.setdefault(date, []).append(time)
# Make every day cover the same number of time slots by dropping the slots
# at the end of the longer days.
# 216 is the starting upper bound (presumably the expected number of slots
# per day -- confirm); the real value is the shortest day, if shorter.
min_time_num = min([216] + [len(times) for times in date_time.values()])
for date, times in date_time.items():
    date_time[date] = times[:min_time_num]

# road_dict maps a road key (column 0, column 2 of a CSV row) to {'id': n},
# where n is the order in which the road was first seen across all files.
road_dict = {}
# data collects one {'date', 'time', 'data'} record per CSV file.
data = []

for date, times in date_time.items():
    for time in times:
        fname = 'ts_2021-' + date + ' ' + time + '.csv'
        # Start with a [0, 0] placeholder for every road known so far; roads
        # missing from this file keep the placeholder.
        date_data = [[0, 0] for _ in range(len(road_dict))]
        # newline='' is what the csv module expects for file objects; the
        # with-statement closes the file, so no explicit close() is needed.
        with open(os.path.join(file_path, fname), 'r', newline='') as f:
            for row in csv.reader(f):
                # csv.reader yields [] for blank lines; skip them instead of
                # failing with IndexError.
                if not row:
                    continue
                key = (row[0], row[2])
                status, speed = int(row[1]), int(row[4])
                entry = road_dict.get(key)
                if entry is None:
                    # First time this road is seen: its id equals its index
                    # in date_data, so the reading is appended at that index.
                    road_dict[key] = {'id': len(road_dict)}
                    date_data.append([status, speed])
                else:
                    date_data[entry['id']][0] = status
                    date_data[entry['id']][1] = speed
        # Alternative (disabled): encode the time as minutes since 05:00:
        # time=(int(fname[14:16])-5)*60+int(fname[17:19])
        data.append({'date': date, 'time': time, 'data': date_data})

# 数据清洗
二次处理读到的csv数据，使其转化为光栅(Raster)数据
```
dataset=[
    for each day
    [
        for each road
        [
            for each time
            [
                [status,speed]
            ]
        ]
    ]
]
```
记每天为一个样本组，天数=样本数=N，每个样本记录了E个道路信息，每个道路包含T个时间片的监测数据，每个数据有C个特征

In [5]:
import numpy as np
from collections import Counter
# Regroup the per-file records into one raster per day:
# dataset[day][road_id][time_slot] == [status, speed].
dataset = []
raster = [[] for _ in range(len(road_dict))]
last_index = len(data) - 1
for index, item in enumerate(data):
    readings = item['data']
    for info in road_dict.values():
        road_id = info['id']
        # Files read before a road was first seen are shorter than road_id;
        # pad those slots with the [0, 0] placeholder.
        if road_id >= len(readings):
            raster[road_id].append([0, 0])
        else:
            raster[road_id].append(readings[road_id])

    # Close the current day when the next record belongs to another date
    # (or when there is no next record).
    day_finished = index == last_index or item['date'] != data[index + 1]['date']
    if day_finished:
        dataset.append(raster)
        raster = [[] for _ in range(len(road_dict))]
dataset = np.array(dataset)

将车速归一化

In [6]:
# Bring the feature axis to the front so that regular[1] is the speed
# channel for every (day, road, time slot); astype() returns a float copy.
regular = dataset.transpose(3, 0, 1, 2).astype('float')
speed = regular[1]

# Highest speed of each road on each day, kept broadcastable over time.
peak = speed.max(axis=2, keepdims=True)

# Min-max scaling was rejected: it would map the lowest speed of a road that
# is free-flowing all day to 0 and the highest speed of a road that is
# congested all day to 1, which is clearly unreasonable.
# Assumption used instead: no road is congested for a whole day, so the
# daily maximum speed stands in for the road's speed limit.
# Series whose maximum is not positive are set to 0.0.
regular[1] = np.divide(speed, peak, out=np.zeros_like(speed), where=peak > 0)

dataset = regular.transpose(1, 2, 3, 0)

将每一天的数据拼接
```
dataset=[
    for each road
    [
        for all time
        [
            [status,speed]
        ]
    ]
]
```
样本记录了E个道路信息，每个道路包含N*T个时间片的监测数据，每个数据有C个特征

In [7]:
# Join the per-day arrays along their time axis (axis 1 of each day's array),
# so each road ends up with one continuous series covering all days.
per_day = list(dataset)
dataset = np.concatenate(per_day, axis=1)

将数据缺失率在5%以上的路段剔除<br>
road_zip是之后代码块使用的数据源

In [8]:
# Pair every road key with its full time series. zip() relies on road_dict
# iterating in insertion order, which is also the order of the 'id' values
# used as the first axis of dataset.
road_zip = dict(zip(road_dict.keys(), dataset))

# A road is dropped when its status (feature 0) equals 0 in more than 5% of
# its time slots; 0 is the placeholder written for slots without a record.
# The loop variable is deliberately not called `data`, so the module-level
# `data` list built by the loader is not overwritten.
delete_roads = [
    road
    for road, series in road_zip.items()
    if np.count_nonzero(series[:, 0] == 0) > len(series) * 0.05
]
for road in delete_roads:
    road_zip.pop(road)

# Split each remaining series back into days:
# (all time slots, 2) -> (days, min_time_num, 2).
road_zip_keys = list(road_zip.keys())
road_zip_values = np.array(list(road_zip.values())).reshape(len(road_zip), -1, min_time_num, 2)
road_zip = dict(zip(road_zip_keys, road_zip_values))

保存road_zip

In [9]:
import pickle
# Persist road_zip to 'road_zip.pkl' in the working directory so later code
# can load it without repeating the steps above.
with open('road_zip.pkl', mode='wb') as pkl_file:
    pickle.dump(road_zip, pkl_file)