In [11]:
%matplotlib inline
import pandas as pd
import numpy as np
import os, math
from pprint import pprint  # pretty-printer
from collections import defaultdict
from gensim.models import CoherenceModel, LdaModel, LdaMulticore


In [14]:
filepath = "../../data/chengdu_taxi/"
filename = filepath + os.path.join(filepath, '20140803_train.txt')
dataset = pd.read_csv(filename, header=None, names=["ID", "lat", "lng", "passengers", "timestamp"])
print(dataset.head())
print(dataset.info())
dataset['timestamp'] = pd.to_datetime(dataset['timestamp'])
dataset['hour'] = dataset['timestamp'].dt.hour

   ID        lat         lng  passengers          timestamp
0   1  30.624806  104.136604           1  2014/8/3 21:18:46
1   1  30.624809  104.136612           1  2014/8/3 21:18:15
2   1  30.624811  104.136587           1  2014/8/3 21:20:17
3   1  30.624811  104.136596           1  2014/8/3 21:19:16
4   1  30.624811  104.136619           1  2014/8/3 21:17:44
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53045407 entries, 0 to 53045406
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   ID          int64  
 1   lat         float64
 2   lng         float64
 3   passengers  int64  
 4   timestamp   object 
dtypes: float64(2), int64(2), object(1)
memory usage: 2.0+ GB
None


# 清空变量内存

In [13]:
# import gc
# del [dataset]
# gc.collect()

505

# 划分区域 生成区域

In [36]:
# split gird
# 1经度111KM

def gird_by_span():
    #根据框个数划分
    column_num, row_num = 200, 200
    gird_lat_span = (LATMAX - LATMIN)/row_num
    gird_lng_span = (LNGMAX - LNGMIN)/column_num
    return gird_lat_span, gird_lng_span, column_num, row_num

def gird_by_distance(lng_dis = 0.5, lat_dis = 0.4):
    #根据距离划分
    #纬度跨度0.4km 经度跨度0.5km
    #经纬度距离估算https://blog.csdn.net/weixin_35301706/article/details/112527068

    gird_lat_span = 1 / (111 / lat_dis)
    gird_lng_span = 1 / ((111 * math.cos( (LATMAX + LATMIN) / 2)) / lng_dis)
    row_num = round((LATMAX - LATMIN) / gird_lat_span)
    column_num = round((LNGMAX - LNGMIN) / gird_lng_span)
    return gird_lat_span, gird_lng_span, column_num, row_num

LATMAX, LATMIN, LNGMAX, LNGMIN = np.max(dataset['lat']), np.min(dataset['lat']), np.max(dataset['lng']), np.min(dataset['lng'])
print(LATMAX, LATMIN, LNGMAX, LNGMIN)

# gird_lat_span, gird_lng_span, column_num, row_num = gird_by_span()
gird_lat_span, gird_lng_span, column_num, row_num = gird_by_distance()


def LngLat2GirdID(lat,lng):
    if lng < LNGMIN or lng > LNGMAX or lat < LATMIN or lat > LATMAX:
        return -1

    return int((lat-LATMIN)/gird_lat_span) + int((lng-LNGMIN)/gird_lng_span) * column_num 


def GirdID2LngLat(gird_id):
    curr_row = int(gird_id/row_num)
    curr_column = gird_id % column_num

    return [LATMIN + gird_lat_span * (curr_row + 0.5 ), LNGMIN + gird_lng_span * (curr_column + 0.5)]

#划分区域id

31.032468 30.290675 104.609693 103.269638


In [37]:
dataset['gird_id'] = dataset.apply(lambda x: LngLat2GirdID(x['lat'], x['lng']), axis=1)

# 生成用于[轨迹-小时-序列]，用于后续生成文档

In [52]:
hour_list = np.unique(dataset['hour'])
out_filename_part = '20140803_train_'
alltraSequence = pd.DataFrame(columns=['TraID', 'hour', 'sequence'])

for i, v in enumerate(hour_list):
    traSequence = []
    outfile = out_filename_part + str(v) + '.csv'
    dataset_sub = dataset.loc[dataset['hour'] == v] 
    dataset_sub = dataset_sub.groupby([dataset_sub['ID'], dataset_sub['hour']])
    
    for index, group in dataset_sub:
        group_sorted = group.sort_values(['timestamp'], ascending = [True])
        traSequence.append([index[0], index[1], group_sorted.gird_id.tolist()])
    
    traSequence = pd.DataFrame(traSequence, columns=['TraID', 'hour', 'sequence'])
    traSequence.to_csv(os.path.join(filepath, outfile), index=False)
    
    alltraSequence = pd.concat([alltraSequence, traSequence], ignore_index=True)

In [None]:
alltraSequence.to_csv(os.path.join(filepath, '20140803_sequence_full.csv'), index=False) #保存在本地 下次不用再次执行上边代码

# 第二次运行时可以从这里开始
免去数据组织

In [1]:
#load data from disk
import pandas as pd
import numpy as np
import os, math
from pprint import pprint  # pretty-printer
from collections import defaultdict
import ast
filepath = "../../data/chengdu_taxi/"
traSequence = pd.read_csv(os.path.join(filepath, '20140803_sequence_full.csv'))
traSequence['sequence'] = traSequence['sequence'].apply(lambda x: ast.literal_eval(x))

# construct documents texts dicionary corpus

In [4]:
# construct documents
documents = []
for seq in zip(traSequence['sequence']):
    my_list = list(seq)[0]
    my_list = [str(current) + '-' + str(my_list[idx + 1]) if idx < len(my_list) - 1 else None for idx, current in enumerate(my_list)][: -1]
    join_s = " "
    join_s = join_s.join(my_list)
    documents.append(join_s)

# construct texts
# remove words that appear only once
texts = [[word for word in document.lower().split()] for document in documents]
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text] for text in texts]

## dictionary 
from gensim.corpora.dictionary import Dictionary
dictionary = Dictionary(texts)
dictionary.save('./tmp/20140803_full.dict')  # store the dictionary, for future reference

corpus = [dictionary.doc2bow(text) for text in texts]


In [5]:
from gensim.models import LdaSeqModel

#find hour corresponding list
time_slice = [traSequence.index[traSequence['hour'] == hour][0] for i, hour in enumerate(np.unique(traSequence.hour).tolist())]

ldaseq = LdaSeqModel(corpus=corpus, time_slice=time_slice, num_topics=18, chunksize=100)

  convergence = np.fabs((bound - old_bound) / old_bound)


In [7]:
pprint(ldaseq.print_topics(time=0))

[[('9', 0.13409381204548854),
  ('10', 0.11466000998754254),
  ('11', 0.09137169443299875),
  ('2', 0.07918483111432532),
  ('7', 0.07907639383611106),
  ('5', 0.07826225346973789),
  ('4', 0.07351501667722589),
  ('6', 0.07344995495497192),
  ('1', 0.07179158725343673),
  ('3', 0.07066024824961405),
  ('8', 0.0670171042817487),
  ('0', 0.06691709369679869)],
 [('9', 0.13700349156547825),
  ('10', 0.1024057730781501),
  ('11', 0.08621747235151137),
  ('7', 0.08298758428169366),
  ('1', 0.08098021999830733),
  ('5', 0.07846665965686862),
  ('4', 0.07846599097325184),
  ('3', 0.07587932552738036),
  ('2', 0.07467687658173018),
  ('0', 0.07312390221441539),
  ('8', 0.07249535749462568),
  ('6', 0.05729734627658735)],
 [('1', 0.22452412555633425),
  ('2', 0.22452412555633425),
  ('0', 0.13771693207346059),
  ('9', 0.047335506369430166),
  ('3', 0.04573741380555508),
  ('4', 0.04573741380555508),
  ('5', 0.04573741380555508),
  ('6', 0.04573741380555508),
  ('7', 0.04573741380555508),
  ('8

# dynamic topic modeling analysis
- https://markroxor.github.io/gensim/static/notebooks/ldaseqmodel.html#topic=0&lambda=0.83&term=
- Printing Topics
- Looking for Topic Evolution
- Document - Topic Proportions
- Distances between documents
- Chain Variance
- Topic Coherence for DTM