In [1]:
import math
import operator
from datetime import datetime
from pathlib import Path
from time import time

import gmplot
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
from geopy.geocoders import Nominatim
import overpass
import folium
from folium import plugins
from folium.plugins import HeatMap
# Module-level Nominatim geocoder instance, created with the user agent string "TFG".
geolocator = Nominatim(user_agent="TFG")

In [3]:
# DataFrame with readable (integer-encoded) device ids.
def device_ids_encoder(d):
    """Overwrite the 'Device ID' column of `d` with integer labels.

    The column is cast to str and passed through sklearn's
    LabelEncoder.fit_transform; the resulting codes replace the column.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing a 'Device ID' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object `d`, after the column has been replaced.
    """
    encoder = preprocessing.LabelEncoder()
    # Direct column assignment is deliberate: per the original author's note,
    # a Series.replace-based variant avoids a pandas warning but is too slow.
    d['Device ID'] = encoder.fit_transform(d['Device ID'].astype(str))
    return d

# Mean location of a single user.
def avg_location(d, user):
    """Return the mean [latitude, longitude] over the rows of one user.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with 'Device ID', 'Latitude' and 'Longitude' columns.
        Coordinate values are converted to float before averaging.
    user :
        Value compared with ``==`` against the 'Device ID' column.

    Returns
    -------
    list
        ``[mean_latitude, mean_longitude]`` as Python floats.

    Raises
    ------
    ZeroDivisionError
        If no row of `d` belongs to `user`. This is the exception type the
        previous row-by-row implementation raised (division by a zero row
        count); it is kept so existing callers behave the same.
    """
    # Select the user's rows with one vectorised comparison instead of
    # iterating over every row of the frame in Python.
    user_rows = d.loc[d['Device ID'] == user, ['Latitude', 'Longitude']]
    if user_rows.empty:
        raise ZeroDivisionError("no locations found for user {!r}".format(user))

    # Plain numpy means: a NaN coordinate propagates to the result, as it
    # did when the values were summed one by one.
    latitudes = user_rows['Latitude'].astype(float).to_numpy()
    longitudes = user_rows['Longitude'].astype(float).to_numpy()
    return [float(latitudes.mean()), float(longitudes.mean())]

# Builds a dictionary whose key is the user ID and whose value is a DataFrame
# with that user's locations.
def get_info_by_user(d):
    """Split `d` into one DataFrame per 'Device ID'.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with at least 'Device ID' and 'Time Stamp' columns.

    Returns
    -------
    dict
        Maps each distinct 'Device ID' (in order of first appearance in `d`)
        to that user's rows, sorted by 'Time Stamp' and re-indexed from 0.

    Notes
    -----
    Rows whose 'Device ID' is missing (NaN) are not included: groupby drops
    NaN keys by default.
    """
    # A single groupby pass replaces one full boolean scan of the frame per
    # user; sort=False keeps the keys in order of first appearance, matching
    # the order produced by Series.unique().
    return {
        user_id: rows.sort_values(by='Time Stamp').reset_index(drop=True)
        for user_id, rows in d.groupby('Device ID', sort=False)
    }

# Top n users by number of locations.
def top_n_users(users, n):
    """Return the `n` entries of `users` that have the most locations.

    Parameters
    ----------
    users : dict
        Maps a user ID to a sized collection of that user's locations
        (anything supporting ``len``).
    n : int
        Number of entries to keep. The cut-off only triggers when the running
        count equals `n`, so a value that is never reached keeps every entry.

    Returns
    -------
    dict
        New dictionary ordered from most to fewest locations; users with the
        same count keep their relative order from `users`. The values are the
        same objects as in `users`.
    """
    # Stable descending sort of the keys by their number of locations.
    ranked_ids = sorted(users, key=lambda user_id: len(users[user_id]), reverse=True)

    selected = {}
    for position, user_id in enumerate(ranked_ids):
        if position == n:
            break
        selected[user_id] = users[user_id]
    return selected

### Getting data from csv

In [35]:
# Time the CSV load so the cost of this cell is printed next to its output.
start_time = time()
#-----------------------------------------------------------------
# Read the comma-separated file 'balearesjulio.csv' into df_raw.
df_raw = pd.read_csv('balearesjulio.csv', sep = ',')
#-----------------------------------------------------------------
total_time = time() - start_time
# Print the elapsed wall-clock time in seconds.
print(str(total_time) + " segundos")

1.768639087677002 segundos


In [36]:
# Work on a copy: a plain `df = df_raw` only creates a second name for the same
# object, so any in-place change to df (such as df.insert) would also alter df_raw.
df = df_raw.copy()
# Number of rows loaded from the CSV.
len(df_raw)

1276393

In [37]:
# Build the working frame from df_raw: rows ordered by 'Time Stamp', a fresh
# 0..n-1 index, and a 'Date Time' column inserted at position 1.
# Sorting first returns a new frame, so the insert never touches df_raw and
# this cell can be re-run without "cannot insert Date Time, already exists".
df = df_raw.sort_values(by='Time Stamp').reset_index(drop=True)
# NOTE(review): datetime.fromtimestamp converts to the local time zone of the
# machine running the notebook, so 'Date Time' values depend on where this is
# executed — confirm that is intended.
df.insert(1, 'Date Time', df['Time Stamp'].apply(datetime.fromtimestamp))

In [38]:
# Print a heading, then display the first rows of df as the cell output.
print("\nDataset original:")
df.head()


Dataset original:


Unnamed: 0,Time Stamp,Date Time,Device ID,OS,Latitude,Longitude,Accuracy,Offset
0,1561932000,2019-07-01 00:00:00,66805,0,39.884829,4.258047,16,7200.0
1,1561932000,2019-07-01 00:00:00,113782,0,38.91035,1.425199,1500,7200.0
2,1561932002,2019-07-01 00:00:02,85905,0,39.667625,2.578221,40,7200.0
3,1561932002,2019-07-01 00:00:02,95002,0,39.564881,3.215519,17,7200.0
4,1561932003,2019-07-01 00:00:03,64535,0,39.714935,3.460052,31,7200.0


Número de usuarios distintos:

In [39]:
# Number of distinct values in the 'Device ID' column.
len(df['Device ID'].unique())

13851

In [40]:
# Summary statistics of df; the displayed table is rounded to 3 decimals,
# while describe_table itself keeps the unrounded values.
describe_table = df.describe()
describe_table.round(3)

Unnamed: 0,Time Stamp,Device ID,OS,Latitude,Longitude,Accuracy,Offset
count,1276393.0,1276393.0,1276393.0,1276393.0,1276393.0,1276393.0,1275102.0
mean,1563394000.0,75243.188,0.27,39.428,2.595,572.539,7194.043
std,684889.1,43555.201,0.444,0.39,0.892,12674.042,187.543
min,1561932000.0,5.0,0.0,38.641,1.161,0.0,-25200.0
25%,1562912000.0,37712.0,0.0,38.98,1.489,16.0,7200.0
50%,1563570000.0,74709.0,0.0,39.551,2.672,59.0,7200.0
75%,1563976000.0,113125.0,1.0,39.696,3.148,300.0,7200.0
max,1564459000.0,150958.0,1.0,40.093,4.327,2147482.0,10800.0


### Creamos una carpeta y un csv para cada usuario

In [41]:
# Keep only users with more than MIN_LOCATIONS rows in df.
MIN_LOCATIONS = 850

users = list(df['Device ID'].unique())
# Count the rows of every user in one pass instead of filtering the whole
# frame once per distinct user.
locations_per_user = df['Device ID'].value_counts()
# Iterate over `users` so users_top keeps the order of first appearance in df;
# .get(..., 0) covers IDs that value_counts omits (e.g. NaN).
users_top = [
    userID for userID in users
    if locations_per_user.get(userID, 0) > MIN_LOCATIONS
]
len(users_top)

106

In [42]:
# Write one tab-separated file per selected user and time the whole export.
start_time = time()
#-----------------------------------------------------------------
# Create the output folder on demand: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
output_dir = Path("./users_Baleares")
output_dir.mkdir(parents=True, exist_ok=True)

# Not used by the loop below; kept so `users` is still (re)defined by this cell.
users = list(df['Device ID'].unique())
for userID in users_top:
    print(userID)
    df_user = df[df['Device ID'] == userID]
    df_user.to_csv(output_dir / (str(userID) + ".csv"), sep='\t', index=False)
#-----------------------------------------------------------------
total_time = time() - start_time
print(str(total_time) + " segundos")

104819
38561
107459
37692
50284
47458
86300
34024
66266
111518
119326
69579
73527
10226
123886
70318
112479
67803
87691
116479
36455
105697
120311
136736
4010
627
101468
102671
103768
24303
75367
131727
61103
36243
20169
70609
141043
60464
92331
13067
104290
140263
119278
74982
139360
125967
137043
37058
97586
51016
134356
110792
118617
16706
108104
73204
92418
40311
114512
60266
87916
137267
24541
29787
85822
21159
29157
6792
62304
124819
134610
65551
45415
8678
2215
73049
134424
2294
35273
100549
19108
108635
48672
119282
41441
75646
29205
69079
50518
64681
55340
138442
17000
150769
119509
97229
54931
81419
83358
55054
148809
114904
75063
13797
10560
82169
3.1877830028533936 segundos
