In [28]:
import pandas as ps
import numpy as np
from grid import location_to_cell_id
from math import ceil

## 0. Read Dataframes

In [34]:
pois_df = ps.read_csv('pois.csv', sep=' ')
queries_df = ps.read_csv('queries.csv', sep=' ')

print(pois_df)
print(queries_df)

      poi_id  cell_id     poi_type        lat       lon
0        307        1          bar  46.504486  6.559631
1        331        1          bar  46.500259  6.554721
2        578        1  supermarket  46.506017  6.551165
3        897        1         club  46.504494  6.553889
4        972        1  supermarket  46.502984  6.550519
...      ...      ...          ...        ...       ...
1055     372      100   restaurant  46.563852  6.648454
1056     649      100          bar  46.564047  6.640250
1057     730      100          gym  46.565961  6.642228
1058     828      100         club  46.569177  6.642401
1059     990      100          bar  46.565290  6.640855

[1060 rows x 5 columns]
           ip_address        lat       lon   timestamp poi_type_query
0      34.101.177.245  46.532942  6.591174   14.912448      cafeteria
1      34.101.177.245  46.532942  6.591174   14.912448     restaurant
2      34.101.177.245  46.550342  6.602852   18.024657     restaurant
3      34.101.177.245  

## 1. Data cleaning

In [35]:
#Extract cell_id from which query happened
def get_cell_id(row):
    return location_to_cell_id(row['lat'], row['lon'])

queries_df['cell_id'] = queries_df.apply(lambda row: get_cell_id(row), axis=1)
print(queries_df)

           ip_address        lat       lon   timestamp poi_type_query  cell_id
0      34.101.177.245  46.532942  6.591174   14.912448      cafeteria       45
1      34.101.177.245  46.532942  6.591174   14.912448     restaurant       45
2      34.101.177.245  46.550342  6.602852   18.024657     restaurant       76
3      34.101.177.245  46.550342  6.602852   18.024657      cafeteria       76
4      34.101.177.245  46.532942  6.591174   36.334539      cafeteria       45
...               ...        ...       ...         ...            ...      ...
20438     11.173.13.2  46.524410  6.625246  449.159554    supermarket       38
20439     11.173.13.2  46.527363  6.628705  453.426750    supermarket       38
20440     11.173.13.2  46.527363  6.628705  453.426750            gym       38
20441     11.173.13.2  46.524410  6.625246  464.420041    supermarket       38
20442     11.173.13.2  46.527363  6.628705  464.420041     restaurant       38

[20443 rows x 6 columns]


In [67]:
#Get day of the query and time
def get_day(row):
    return ceil(row['timestamp'] / 24)

def get_hour_of_day(row):
    return int(row['timestamp'] % 24)

queries_df['day'] = queries_df.apply(lambda row: get_day(row), axis=1)
queries_df['time'] = queries_df.apply(lambda row: get_hour_of_day(row), axis=1)

#Get daytime
def get_daytime(row):
    time = row['time']
    if (time > 0 and time < 9):
        return 'Early'
    if (time >= 9 and time < 12):
        return 'Morning'
    if (time >= 12 and time < 17):
        return 'Afternoon'
    if (time >= 17 and time < 20):
        return 'Evening'
    if (time >= 20 and time <= 23):
        return 'Night'
queries_df['daytime'] = queries_df.apply(lambda row: get_daytime(row), axis=1)

print(queries_df)

           ip_address        lat       lon   timestamp poi_type_query  \
0      34.101.177.245  46.532942  6.591174   14.912448      cafeteria   
1      34.101.177.245  46.532942  6.591174   14.912448     restaurant   
2      34.101.177.245  46.550342  6.602852   18.024657     restaurant   
3      34.101.177.245  46.550342  6.602852   18.024657      cafeteria   
4      34.101.177.245  46.532942  6.591174   36.334539      cafeteria   
...               ...        ...       ...         ...            ...   
20438     11.173.13.2  46.524410  6.625246  449.159554    supermarket   
20439     11.173.13.2  46.527363  6.628705  453.426750    supermarket   
20440     11.173.13.2  46.527363  6.628705  453.426750            gym   
20441     11.173.13.2  46.524410  6.625246  464.420041    supermarket   
20442     11.173.13.2  46.527363  6.628705  464.420041     restaurant   

       cell_id  day  time    daytime  
0           45    1    14  Afternoon  
1           45    1    14  Afternoon  
2     

## 2.Statistics

In [68]:
!rm queries_stats.csv
queries_grouped = queries_df.groupby(['ip_address', 'daytime', 'cell_id'])\
    .size().reset_index(name='count')
queries_grouped = queries_grouped.where(queries_grouped['count'] > 4).dropna()
print(queries_grouped)
queries_grouped.to_csv('queries_stats.csv')

         ip_address    daytime  cell_id  count
0       0.98.248.97  Afternoon     18.0    8.0
1       0.98.248.97  Afternoon     63.0   16.0
4       0.98.248.97    Evening     18.0   20.0
5       0.98.248.97    Morning     18.0    5.0
8       0.98.248.97    Morning     63.0   14.0
...             ...        ...      ...    ...
1866  97.138.146.97  Afternoon     53.0   22.0
1868  97.138.146.97    Evening     22.0   10.0
1871  97.138.146.97    Morning     22.0    5.0
1874  97.138.146.97    Morning     53.0    8.0
1877  97.138.146.97      Night     22.0   16.0

[1032 rows x 4 columns]
