# 240G出租车数据处理

In [1]:
# Import relevant libraries
import pandas as pd # Pandas (pd) offers data structures and operations for manipulating numerical data
import numpy as np # Numpy(np) supports operations on large multi-dimensional arrays and matrices
import matplotlib.pyplot as plt #Matplotlib supports plotting data in Python
import seaborn as sns #Seaborn is a Python data visualization library based on matplotlib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score # Machine learning library for the Python programming language
import pickle
import os
from tqdm import tqdm
import sys 
sys.path.append('../')

In [2]:
# Open the raw taxi GPS file for 2023-03-10. The file has no header row,
# so pandas labels the columns with integers until names are assigned.
file = "../Datasets/GPS_data/taxi_gps/2023-03-10"
df = pd.read_csv(
    file,
    header=None,
    sep=",",
)

# Summarise column dtypes and the in-memory size of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222682944 entries, 0 to 222682943
Data columns (total 17 columns):
 #   Column  Dtype  
---  ------  -----  
 0   0       object 
 1   1       object 
 2   2       float64
 3   3       float64
 4   4       object 
 5   5       float64
 6   6       float64
 7   7       float64
 8   8       int64  
 9   9       int64  
 10  10      float64
 11  11      float64
 12  12      float64
 13  13      float64
 14  14      float64
 15  15      float64
 16  16      int64  
dtypes: float64(11), int64(3), object(3)
memory usage: 28.2+ GB


In [3]:
# Preview the raw frame; its columns still carry the default integer labels.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2023-03-10 00:00:00,粤BDG2073,114.098065,22.558573,2023-03-09 22:16:19,,31.0,86.0,768,268435456,,,,,539613.0,0.0,83
1,2023-03-10 00:00:00,粤BDH1151,114.024942,22.649313,2023-03-09 22:15:01,,1.0,184.0,256,268435456,,,,,373740.0,0.0,82
2,2023-03-10 00:00:00,粤BDG0239,113.949077,22.555780,2023-03-09 22:25:37,,47.0,70.0,768,268435456,,,,,660153.0,0.0,74
3,2023-03-10 00:00:00,粤BDR7946,114.060970,22.563798,2023-03-09 22:10:35,,10.0,178.0,256,268435456,,,,,564894.0,0.0,81
4,2023-03-10 00:00:00,粤BD85069,113.905293,22.556068,2023-03-09 22:15:48,,21.0,316.0,768,268435456,,,,,664636.0,0.0,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222682939,2023-03-10 23:59:59,粤BAK8549,113.871813,22.588615,2023-03-10 21:37:33,,0.0,20.0,768,131072,,,,,47599.9,0.0,0
222682940,2023-03-10 23:59:59,粤BDE5749,114.022322,22.593372,2023-03-10 21:31:27,,51.0,74.0,768,0,,,,,85143.3,0.0,0
222682941,2023-03-10 23:59:59,粤BAX0156,114.054240,22.563628,2023-03-10 21:36:47,,29.0,178.0,256,0,,,,,10906.4,0.0,0
222682942,2023-03-10 23:59:59,粤BD80185,114.063403,22.537698,2023-03-10 21:32:00,,0.0,260.0,768,268435456,,,,,626486.0,0.0,60


In [4]:
# Add column names: one label per positional column of the raw file.
header = [
    'time', 'id', 'lon', 'lat', 'ptime', 'no_1', 'speed', 'no_2', 'no_7',
    'cardno', 'no_3', 'no_4', 'no_5', 'no_6', 'mileage', 'status', 'direction',
]
df.columns = header

In [5]:
# Check the column values: how often each distinct 'no_7' value occurs.
a = df['no_7'].value_counts().to_frame()
a

Unnamed: 0_level_0,count
no_7,Unnamed: 1_level_1
256,110656913
768,84551394
257,5223564
770,3958472
260,3540659
...,...
676,2
4897,2
672,1
290,1


In [6]:
# Data cleaning: remove the columns that are not needed.
df = df.drop(
    columns=[
        'ptime', 'no_1', 'no_2', 'no_3', 'no_4', 'no_5', 'no_6', 'no_7',
        'cardno', 'status',
    ]
)

In [7]:
# Coordinate conversion
# Convert the taxi GPS points from WGS84 coordinates to Web Mercator
# coordinates in the GCJ02 system.
from coordinate_transform import wgs_to_gcj02_to_mercator

# Write results straight into preallocated float64 arrays. Collecting one
# Python tuple per row and then unpacking with zip(*...) would hold several
# full-length Python containers in memory at once, and it fails on an empty
# frame because there is nothing to unpack.
n_points = len(df)
lon_mercator = np.empty(n_points, dtype=np.float64)
lat_mercator = np.empty(n_points, dtype=np.float64)

# NOTE(review): the helper is called once per point because its support for
# array inputs is not visible here; if it is vectorised, pass the whole
# columns in a single call instead.
# Assumes the helper returns a (lon, lat) pair of plain numbers - TODO confirm.
for i, (lon, lat) in enumerate(zip(df['lon'].values, df['lat'].values)):
    lon_mercator[i], lat_mercator[i] = wgs_to_gcj02_to_mercator(lon, lat)

# Overwrite the original columns with the converted coordinates.
df['lon'] = lon_mercator
df['lat'] = lat_mercator
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222682944 entries, 0 to 222682943
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   time       object 
 1   id         object 
 2   lon        float64
 3   lat        float64
 4   speed      float64
 5   mileage    float64
 6   direction  int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 11.6+ GB


In [8]:
# Preview the frame after the column drop and the coordinate conversion.
df

Unnamed: 0,time,id,lon,lat,speed,mileage,direction
0,2023-03-10 00:00:00,粤BDG2073,1.270191e+07,2.578399e+06,31.0,539613.0,83
1,2023-03-10 00:00:00,粤BDH1151,1.269376e+07,2.589328e+06,1.0,373740.0,82
2,2023-03-10 00:00:00,粤BDG0239,1.268530e+07,2.578023e+06,47.0,660153.0,74
3,2023-03-10 00:00:00,粤BDR7946,1.269778e+07,2.579025e+06,10.0,564894.0,81
4,2023-03-10 00:00:00,粤BD85069,1.268042e+07,2.578052e+06,21.0,664636.0,79
...,...,...,...,...,...,...,...
222682939,2023-03-10 23:59:59,粤BAK8549,1.267670e+07,2.581978e+06,0.0,47599.9,0
222682940,2023-03-10 23:59:59,粤BDE5749,1.269347e+07,2.582580e+06,51.0,85143.3,0
222682941,2023-03-10 23:59:59,粤BAX0156,1.269703e+07,2.579003e+06,29.0,10906.4,0
222682942,2023-03-10 23:59:59,粤BD80185,1.269805e+07,2.575880e+06,0.0,626486.0,60


In [9]:
# Save the processed frame to disk as a pickle file.
with open(
    "../Datasets/GPS_data/taxi_new/taxi_big10.pkl",
    mode='wb',
) as f:
    pickle.dump(df, f)