In [1]:
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

### 坐标转换

In [2]:
x_pi = 3.14159265358979324 * 3000.0 / 180.0
# π
pi = 3.1415926535897932384626
# 长半轴
a = 6378245.0
# 偏心率平方
ee = 0.00669342162296594323

def wgs84_to_gcj02(lng, lat):
    """
    WGS84转GCJ02(火星坐标系)
    :param lng:WGS84坐标系的经度
    :param lat:WGS84坐标系的纬度
    :return:
    """
    if out_of_china(lng, lat):  # 判断是否在国内
        return [lng, lat]
    dlat = _transformlat(lng - 105.0, lat - 35.0)
    dlng = _transformlng(lng - 105.0, lat - 35.0)
    radlat = lat / 180.0 * pi
    magic = math.sin(radlat)
    magic = 1 - ee * magic * magic
    sqrtmagic = math.sqrt(magic)
    dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
    dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
    mglat = lat + dlat
    mglng = lng + dlng
    return [mglng, mglat]


def out_of_china(lng, lat):
    """
    判断是否在国内，不在国内不做偏移
    :param lng:
    :param lat:
    :return:
    """
    return not (lng > 73.66 and lng < 135.05 and lat > 3.86 and lat < 53.55)

def _transformlat(lng, lat):
    ret = -100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + \
          0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng))
    ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 *
            math.sin(2.0 * lng * pi)) * 2.0 / 3.0
    ret += (20.0 * math.sin(lat * pi) + 40.0 *
            math.sin(lat / 3.0 * pi)) * 2.0 / 3.0
    ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 *
            math.sin(lat * pi / 30.0)) * 2.0 / 3.0
    return ret


def _transformlng(lng, lat):
    ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + \
          0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng))
    ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 *
            math.sin(2.0 * lng * pi)) * 2.0 / 3.0
    ret += (20.0 * math.sin(lng * pi) + 40.0 *
            math.sin(lng / 3.0 * pi)) * 2.0 / 3.0
    ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 *
            math.sin(lng / 30.0 * pi)) * 2.0 / 3.0
    return ret

### 读取.csv文件

In [3]:
reader = pd.read_csv('day20170215.csv', iterator=True)
#每个表都没有达到1亿的数据量,这里为了方便直接填了1亿
try:
    data = reader.get_chunk(100000000)
except Exception as e:
    print(e)

### 剔除异常数据和熄火状态的车辆

In [4]:
data = data.drop(data[data.CAR_STAT1 == 7].index)
data = data.drop(data[data.LONGITUDE == 0].index)
data = data.drop(data[data.LATITUDE == 0].index)
data = data.reset_index(drop=True)
data

Unnamed: 0,LICENSEPLATENO,LONGITUDE,LATITUDE,CAR_STAT1
0,粤AD733P,113.32040,23.13620,5
1,粤AQ4Q70,113.27130,23.20610,4
2,粤AP0P54,113.28370,23.20990,5
3,粤AZ7E21,113.34840,23.12760,5
4,粤A6KJ26,113.26330,22.99510,4
...,...,...,...,...
69514745,粤AD23S0,113.31841,23.03823,5
69514746,粤AW5Q04,113.32805,23.12357,5
69514747,粤AA71E4,113.29580,23.14821,4
69514748,粤AB74Y0,113.34416,23.13844,4


In [5]:
data_ = np.array(data)

### 提取轨迹

In [6]:
taxi = {}
with tqdm(total=len(data_)) as bar:
    for i in range(len(data_)):
        bar.update(1)
        temp = [data_[i][1], data_[i][2], data_[i][3]]
        if data_[i][0] not in taxi.keys():
            taxi[data_[i][0]] = []
        taxi[data_[i][0]].append(temp)

100%|██████████████████████████████████████████████████████████████████| 69514750/69514750 [04:53<00:00, 237078.26it/s]


In [7]:
num = {}
for key in taxi.keys():
    num[key] = len(taxi[key])
#对轨迹点数进行排序
num = sorted(num.items(), key=lambda x:x[1], reverse=False)
del num[0:1500]
del num[-2000:-1]
sample = random.sample(num, 800)

In [8]:
taxi_ = {}
for item in sample:
    taxi_[item[0]] = taxi[item[0]] 

In [9]:
empty_trajs = []
passger_trajs = []
with tqdm(total=len(taxi_)) as bar:
    for key in taxi_.keys():
        bar.update(1)
        split = 0
        for i in range(len(taxi_[key])-1):
            if (taxi_[key][i][2] == 4) and (taxi_[key][i+1][2] == 5):
                empty_trajs.append(np.array(taxi_[key])[split:i+1, 0:2])
                split = i+1
            if (taxi_[key][i][2] == 5) and (taxi_[key][i+1][2] == 4):
                passger_trajs.append(np.array(taxi_[key])[split:i+1, 0:2])
                split = i+1

100%|████████████████████████████████████████████████████████████████████████████████| 800/800 [15:07<00:00,  1.13s/it]


In [11]:
trajs = []
for traj in passger_trajs:
    if len(traj) > 100:
        trajs.append(traj)

### 进行坐标转换

In [14]:
final_trajs = {}
with tqdm(total=len(trajs)) as bar:
    for i in range(len(trajs)):
        bar.update(1)
        locations = []
        for lng, lat in trajs[i]:
            bar.update(1)
            locations.append(wgs84_to_gcj02(lng, lat))
        final_trajs[i] = locations

713404it [00:10, 66173.23it/s]                                                                                         


### 存入json文件

In [15]:
import json
trajs_day_20170215 = json.dumps(final_trajs, ensure_ascii = False)
fp1 = open('trajs_day_20170215.json', 'w+')
fp1.write(trajs_day_20170215)
fp1.close()