In [1]:
import numpy as np
import pandas as pd
import utm
import math
import os
import sys
sys.path.append('./src')
import matplotlib
import matplotlib.pyplot as plt
import display_osm
import display_gm
import plot
%matplotlib inline

In [61]:
# Dataset name and radio type of this run; every path below is derived from the pair.
dataset, datatype = 'siping', '4g'
data_path = './data/%s_%s/' % (dataset, datatype)
disp_path = './display/%s_%s/' % (dataset, datatype)
# The merged CSV lives inside the per-dataset data folder.
data_file = data_path + 'data_%s.csv' % datatype

# 数据预处理
### 数据范围
* key_form: 只保留首条记录中包含指定网络制式 (GSM 或 LTE) 的文件
* bounding_box: 任一记录的 UTM 坐标超出 (x0, x1, y0, y1) 范围时, 丢弃整个文件

### 数据错误
* (Mcc, Mnc)
    - (460, 0) 移动(程序初始化问题)
    - (460, 1) 联通
* 4G数据只有主基站信息

### 数据乱序
* Timestamp 乱序

### 数据重复
* First Tower 和第一个 Connected Tower重复
* 完全重复的数据

In [3]:
# Column groups of one measurement record. A raw row starts with the basic
# groups (TimeStamp .. Geomagnetism) in this order.
TimeStamp = ['MRTime']
Location = ['Longitude', 'Latitude', 'Altitude', 'Accuracy', 'Speed']
SignalRecord = ['Dbm', 'isGsm', 'SignalToNoiseRatio', 'EvdoEcio', 'Level']
Battery = ['BLevel', 'BCapacity']
Geomagnetism = ['X', 'Y', 'Z', 'X_theta', 'Y_theta', 'Z_theta']
# Per-tower fields; get_properties() suffixes them with '_<tower number>'.
Towers = [
    'Mcc', 'Mnc', 'RNCID', 'CellID', 'Arfcn', 'Basic_psc_pci',
    'Lon', 'Lat', 'AsuLevel', 'SignalLevel', 'Dbm', 'Type',
]
# Identity columns that load_file() prepends to every row.
Others = ['TrajID', 'IMEI', 'IMSI']

# Number of basic (non-tower) fields at the start of a raw row.
Basic_prop_len = sum(len(group) for group in (TimeStamp, Location, SignalRecord, Battery, Geomagnetism))
# Raw-row index of the connected-tower count: it follows the basic fields and
# one tower-sized group of fields.
NumConnLoc = Basic_prop_len + len(Towers)
# Raw-row index of the Mnc field of the first tower listed after the count.
MncLoc = NumConnLoc + 1 + Towers.index('Mnc')
len(Towers), Basic_prop_len

(12, 19)

In [12]:
def get_properties(num_tower):
    """Build the ordered list of column names for a frame with `num_tower` towers.

    The list starts with the identity columns (Others), continues with the
    basic groups (TimeStamp, Location, SignalRecord, Battery, Geomagnetism),
    then 'Num_Connected', and finally one copy of the Towers fields per tower,
    each suffixed with '_1', '_2', ... up to `num_tower`.
    """
    columns = list(Others) + TimeStamp + Location + SignalRecord + Battery + Geomagnetism
    columns.append('Num_Connected')
    for tower_no in range(1, num_tower + 1):
        suffix = '_' + str(tower_no)
        columns += [field + suffix for field in Towers]
    return columns

def load_data(folder, bounding_box, key_form, min_num=20, remove_discarded=True):
    """Load every numbered trajectory file in `folder` via load_file().

    Files are processed in ascending order of the integer before the first
    '.' in their name. Entries whose name does not start with an integer
    (for example hidden files) are ignored instead of aborting the load.

    Parameters
    ----------
    folder : str
        Directory that contains the trajectory files.
    bounding_box, key_form, min_num
        Forwarded unchanged to load_file().
    remove_discarded : bool
        When True (the default, matching the historical behaviour) every file
        that load_file() rejects is deleted from disk after the scan. Pass
        False to keep rejected files.

    Returns
    -------
    list
        One load_file() result per accepted file. An empty list is returned
        when `folder` does not exist, so callers can always iterate the result.
    """
    if not os.path.exists(folder):
        print('not exist')
        return []

    numbered = []
    for name in os.listdir(folder):
        if not os.path.isfile(os.path.join(folder, name)):
            continue
        try:
            numbered.append((int(name.split('.')[0]), name))
        except ValueError:
            # Not a '<number>.<ext>' file; leave it alone.
            continue
    numbered.sort(key=lambda pair: pair[0])

    data = []
    to_del = []
    for _, f in numbered:
        path = os.path.join(folder, f)
        coors = load_file(path, bounding_box, key_form, min_num=min_num)
        if isinstance(coors, str):
            # load_file() reports a rejection as a string holding the reason.
            print('discard filename: %s  for reason: %s' % (f, coors))
            to_del.append(path)
            continue
        data.append(coors)

    # Deletion happens only after the whole folder was scanned.
    if remove_discarded:
        for filename in to_del:
            os.remove(filename)
    return data

def outside(x, y, bounding_box):
    """Return a truthy value when point (x, y) lies strictly outside the box.

    `bounding_box` is unpacked as (x_min, x_max, y_min, y_max); points exactly
    on an edge count as inside.
    """
    x_min, x_max, y_min, y_max = bounding_box
    beyond_x = x < x_min or x > x_max
    return beyond_x or y < y_min or y > y_max
    
def load_file(filename, bounding_box, key_form, min_num=20):
    """Parse one raw trajectory file into a DataFrame of measurement records.

    The trajectory id is the integer before the first '.' of the file's base
    name. Line 0 and line 1 of the file carry the IMEI and IMSI after a ':',
    line 7 holds the number of records, and the records start at line 8 as
    one comma-separated row per line.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    bounding_box : tuple
        (x0, x1, y0, y1) limits checked with outside() against the first two
        values returned by utm.from_latlon for every record.
    key_form : str
        Text that must occur in the first record line (for example 'LTE');
        otherwise the file is rejected.
    min_num : int
        Accepted for call compatibility; this function does not use it.

    Returns
    -------
    str or pandas.DataFrame
        A string naming the rejection reason ('not' + key_form, 'mnc error'
        or 'outside'), or a DataFrame with the columns of get_properties(),
        sorted by 'MRTime', with missing values replaced by -1 and a fresh
        0..n-1 index. An empty DataFrame is returned when no row was kept.
    """
    tr_id = int(os.path.basename(filename).split('.')[0])
    with open(filename) as fp:
        data = fp.readlines()

    imei = int(data[0].strip().split(':')[-1])
    imsi = int(data[1].strip().split(':')[-1])
    total = int(data[7].strip())
    if key_form not in data[8]:
        return 'not' + key_form

    max_num_tower = 0
    result = []
    last_stamp = 0
    for line in data[8:8 + total]:
        mr = line.strip().split(',')
        lat, lng = float(mr[2]), float(mr[1])
        x, y, _, _ = utm.from_latlon(lat, lng)
        # Timestamp field divided by 1000 (presumably ms -> s); '//' keeps the
        # integer result on both Python 2 and Python 3.
        stamp = int(mr[0]) // 1000
        try:
            mcc, mnc = int(mr[MncLoc - 1]), int(mr[MncLoc])
        except (IndexError, ValueError):
            # Row too short or Mcc/Mnc not numeric: skip just this row.
            continue
        num_tower = int(mr[NumConnLoc])
        if mnc == 11:
            return 'mnc error'
        if stamp == last_stamp:
            # Same second as the last kept record: treat as a duplicate.
            continue
        if outside(x, y, bounding_box):
            # A single point outside the box rejects the whole file.
            return 'outside'
        if mcc != 460 or num_tower == 0:
            print(filename)
            continue
        assert (len(mr) - Basic_prop_len - 1) % len(Towers) == 0
        max_num_tower = max(max_num_tower, num_tower)
        # Drop the tower-sized group that sits between the basic fields and
        # the tower count (the notebook notes say it repeats the first
        # connected tower).
        mr = mr[:Basic_prop_len] + mr[NumConnLoc:]
        result.append(tuple([tr_id, imei, imsi] + mr))
        last_stamp = stamp

    if not result:
        return pd.DataFrame()

    properties = get_properties(max_num_tower)
    # NOTE(review): the row values are still strings, so this sort is
    # lexicographic; it matches numeric order only while all timestamps have
    # the same number of digits.
    coors = pd.DataFrame(result, columns=properties).sort_values(by=['MRTime'])
    coors = coors[coors['RNCID_1'].astype(int) != -1]
    # Reset the index AFTER filtering so that row label 0 always exists for
    # callers that read the first row.
    return coors.fillna(-1).reset_index(drop=True)

In [53]:
# Bounding box as (x_min, x_max, y_min, y_max); outside() compares it with the
# first two values returned by utm.from_latlon. Alternative area kept for reference:
# bounding_box = (328000, 332000, 3461000, 3464000)
bounding_box = (356000, 359000, 3460000, 3463000)
# Network type a file's first record must mention to be accepted by load_file().
key_form = 'LTE'
# Numeric code per network type (not referenced by the cells shown here).
key_form_map = dict(GSM=2, LTE=4)

In [52]:
# Dry run: parse upload files 0..23 one by one and report which ones
# load_file() rejects. Nothing is deleted here.
for i in range(24):
    print('index= %d' % i)
    fname = './data/uploads/%d.txt' % i
    coors = load_file(fname, bounding_box, key_form, min_num=20)
    if isinstance(coors, str):
        print('discard filename: %s  for reason: %s' % (fname, coors))

index= 0
index= 1
index= 2
index= 3
index= 4
index= 5
index= 6
index= 7
index= 8
index= 9
index= 10
index= 11
index= 12
index= 13
index= 14
index= 15
index= 16
index= 17
index= 18
index= 19
index= 20
index= 21
index= 22
index= 23


In [43]:
# Show the identity fields of the first two towers for rows whose first tower reports Mnc 0.
ff = coors[['Mcc_1', 'Mnc_1', 'RNCID_1', 'CellID_1', 'Mcc_2', 'Mnc_2', 'RNCID_2', 'CellID_2']]
# load_file() keeps the comma-split fields as strings, so cast before comparing;
# a direct `== 0` never matches a text value and always yields an empty frame.
ff[ff['Mnc_1'].astype(int) == 0]

Unnamed: 0,Mcc_1,Mnc_1,RNCID_1,CellID_1,Mcc_2,Mnc_2,RNCID_2,CellID_2


In [69]:
# Load every trajectory from the uploads folder. Note that load_data() deletes
# the files it discards from disk.
# points = load_data(data_path + 'trajs/', bounding_box, key_form, min_num=20)
points = load_data(folder='./data/uploads/', bounding_box=bounding_box, key_form=key_form, min_num=20)

# Display

In [71]:
# Pick up edits to the local display_osm module without restarting the kernel.
display_osm = reload(display_osm)
# Write one HTML page per trajectory into <disp_path>/raw/.
for traj in points:
    if len(traj) == 0:
        # load_file() returns an empty frame when no record was kept.
        continue
    # Positional access: the frame's index labels need not start at 0.
    traj_id = int(traj['TrajID'].iloc[0])
    display_osm.df_to_html(traj, disp_path + 'raw/%d.html' % traj_id, more_info=True)

In [70]:
# Print "<trajectory id> <number of records>" for every loaded trajectory.
for traj in points:
    if len(traj) == 0:
        # load_file() returns an empty frame when no record was kept.
        continue
    # Positional access: the frame's index labels need not start at 0.
    print('%s %d' % (traj['TrajID'].iloc[0], len(traj)))

0 957
1 144
2 212
3 193
4 180
5 161
6 51
7 122
8 162
9 197
10 122
11 149


In [73]:
# Merge all trajectories into one table with a fixed 7-tower column layout.
# reindex() creates any tower column that no trajectory provides instead of
# raising KeyError, and drops columns beyond the seventh tower.
export_columns = get_properties(7)
# Gaps introduced by the merge get the same -1 sentinel load_file() uses.
data = pd.concat(points, axis=0).reindex(columns=export_columns).fillna(-1)
data.to_csv(data_file, index=False)

In [74]:
# Number of measurement rows written to the CSV.
data.shape[0]

2650

In [75]:
# Pick up edits to the local plot module without restarting the kernel.
plot = reload(plot)
fig_width = 20
axis = bounding_box
# Height that keeps the bounding box's aspect ratio; float() avoids integer
# truncation under Python 2.
fig_height = fig_width * float(axis[3] - axis[2]) / (axis[1] - axis[0])
# savefig() does not create missing directories.
raw_dir = disp_path + 'raw/'
if not os.path.isdir(raw_dir):
    os.makedirs(raw_dir)

for traj in points:
    if len(traj) == 0:
        # load_file() returns an empty frame when no record was kept.
        continue
    # Positional access: the frame's index labels need not start at 0.
    tr_id = int(traj['TrajID'].iloc[0])
    # Convert every (lat, lng) pair to the planar coordinates used by the box.
    trajs = []
    for lat, lng in zip(traj['Latitude'], traj['Longitude']):
        x, y = utm.from_latlon(float(lat), float(lng))[:2]
        trajs.append((x, y))
    plt.figure(figsize=(fig_width, fig_height))
    ca = plt.gca()
    plot.draw_traj_on_cells(trajs, ca, axis, 'r')
    plt.axis(axis)
    plt.title('traj id = ' + str(tr_id))
    plt.savefig(raw_dir + '%d.png' % tr_id)
    # Close each figure so a long run does not accumulate open figures.
    plt.close()

# Rename file

In [67]:
def renamebytime(folder):
    """Renumber the files in `folder` as 0.txt, 1.txt, ... in ascending order.

    The sort key is the integer before the first '.' of each file name.
    Entries whose name does not start with an integer are left untouched.
    Files that already carry their final name are not moved. All other files
    are first moved to a temporary 'renaming_<n>.txt' name and only then to
    their final name, so a rename can never overwrite a file that is still
    waiting to be renamed (for example '00.txt' next to '0.txt').

    Prints 'not exist' and returns None when `folder` is missing.
    """
    if not os.path.exists(folder):
        print('not exist')
        return

    timeline = []
    for f in os.listdir(folder):
        if not os.path.isfile(os.path.join(folder, f)):
            continue
        try:
            record_time = int(f.strip().split('.')[0])
        except ValueError:
            # Not a '<number>.<ext>' file; leave it alone.
            continue
        timeline.append((f, record_time))
    timeline.sort(key=lambda item: item[1])

    # Phase 1: park every file that has to move under a temporary name that
    # cannot be mistaken for a numbered file.
    staged = []
    for idx, (filename, _) in enumerate(timeline):
        final_name = '%d.txt' % idx
        if filename == final_name:
            continue
        tmp_path = os.path.join(folder, 'renaming_%d.txt' % idx)
        os.rename(os.path.join(folder, filename), tmp_path)
        staged.append((tmp_path, os.path.join(folder, final_name)))

    # Phase 2: every target name is free now, so move the files into place.
    for tmp_path, new_name in staged:
        os.rename(tmp_path, new_name)

In [68]:
# Renumber the upload files as 0.txt, 1.txt, ... (the execution counts show
# this cell was run before the load_data() cell).
renamebytime(folder='./data/uploads/')