# Fine-Scale Prediction of People's Home Location using Social Media Footprints

_**Authors:** Hamdi Kavak, Daniele Vernon-Bido, and Jose Padilla_

_**Submitted:** SBP-BRIMS 2018 on January 11, 2018._

## Data cleaning and feature set generation

#### - Imports

In [1]:
### Home Location Prediction Paper ###########
# Task: Property generation
# Author: Hamdi Kavak
# Created: December 20, 2017
#########################################

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.path as pth

from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from datetime import timedelta

import shapely
import shapely.geometry
import fiona
import networkx as nx


#### - CONSTANTS

In [2]:
# Meters per radian of great-circle arc (mean Earth radius in meters);
# used to turn a distance in meters into a haversine angle in radians.
METER_PER_RADIAN = 6371000.0088
# Spatial resolution in meters: the distance threshold for treating two
# consecutive check-ins as the same place, and the DBSCAN neighborhood radius.
DISTANCE_RESOLUTION_IN_METER = 100
# Time threshold in minutes for treating two consecutive check-ins as repeats.
TIME_RESOLUTION_IN_MINUTES = 60
# Inclusive local-hour bounds of the "midnight" check-in window.
MIDNIGHT_CHECKIN_START = 0
MIDNIGHT_CHECKIN_END = 6
# End-of-day candidates are check-ins with hour >= START or hour <= END.
# Check-ins with hour <= END are attributed to the previous calendar day.
END_OF_DAY_CHECKIN_START = 17
END_OF_DAY_CHECKIN_END = 2

#### - Reading the Chicago zoning shapefile with fiona and collecting residential polygons for shapely point-in-polygon checks

In [3]:
shapes = fiona.open("shapefiles/chi/boundary_zoning/geo_export_2b051fd9-93a5-400e-8399-42ce063b2f93.shp")
print 'Shapefile loaded with ', len(shapes), ' polygons'
shapely_residential_collection = []

for a_shape in shapes:
    poly = shapely.geometry.asShape( a_shape['geometry'] )
    if  a_shape['properties']['zone_class'].startswith('R') == True or \
        a_shape['properties']['zone_class'].startswith('DR') == True or \
        (a_shape['properties']['zone_class'].startswith('PD') == True and a_shape['properties']['pd_prefix'] == 'R'):
        shapely_residential_collection.append(poly)

print len(shapely_residential_collection), ' residential polygons found '

Shapefile loaded with  12287  polygons
4513  residential polygons found 


#### - Data cleaning and feature generation functions

In [5]:
def clean_location_footprints(data):
    """Drop near-duplicate consecutive check-ins from one user's data.

    Rows are compared, in their existing order, against the most recently
    kept row. A row is dropped when all of the following hold:

      * it is less than TIME_RESOLUTION_IN_MINUTES minutes after that row,
      * it is less than DISTANCE_RESOLUTION_IN_METER meters away from it,
      * its ``home_confirmed`` value is ``'f'``.

    Repeated posts from the same place within a short time frame would
    otherwise bias the machine learning model.

    Parameters:
        data: pandas DataFrame with the columns ``local_datetime``,
            ``latitude``, ``longitude`` and ``home_confirmed``.

    Returns:
        ``data`` itself when it has fewer than two rows, otherwise a new
        DataFrame without the dropped rows.
    """
    # making sure there is enough data
    if len(data) < 2:
        return data

    # Pull the needed columns out once instead of materializing a row
    # Series on every iteration.
    timestamps = data['local_datetime'].tolist()
    latitudes = data['latitude'].tolist()
    longitudes = data['longitude'].tolist()
    home_flags = data['home_confirmed'].tolist()

    positions_to_drop = []
    anchor = 0  # position of the most recently kept row

    # Iterate through the LAST row as well: the previous bound of
    # len(data) - 1 never examined the final row, so a two-row frame was
    # returned unchecked.
    for position in range(1, len(data)):
        elapsed = timestamps[position] - timestamps[anchor]
        minutes_apart = elapsed / np.timedelta64(1, 'm')

        anchor_point = (latitudes[anchor], longitudes[anchor])
        current_point = (latitudes[position], longitudes[position])
        meters_apart = great_circle(anchor_point, current_point).meters

        is_near_duplicate = (minutes_apart < TIME_RESOLUTION_IN_MINUTES
                             and meters_apart < DISTANCE_RESOLUTION_IN_METER
                             and home_flags[position] == 'f')

        if is_near_duplicate:
            positions_to_drop.append(position)
        else:
            # a kept row becomes the new reference for the rows after it
            anchor = position

    return data.drop(data.index[positions_to_drop])

def date_string(date):
    """Format ``date`` as 'year/month/day' without zero padding."""
    parts = (date.year, date.month, date.day)
    return '/'.join(str(part) for part in parts)

def point_within_polygons(point_lat, point_lon, polygons):
    """Return 1 if the point lies within any polygon in ``polygons``, else 0."""
    # the shapely Point is built as (x, y), i.e. (longitude, latitude)
    location = shapely.geometry.Point(point_lon, point_lat)
    # any() stops at the first containing polygon, like the early return did
    inside_any = any(location.within(candidate) for candidate in polygons)
    return 1 if inside_any else 0

def _build_transition_graph(data, reverse=False):
    # Build the weighted directed graph of place-to-place transitions that
    # both PageRank variants share; `reverse` flips every edge direction.
    graph = nx.DiGraph()

    # add all locations as nodes, including ones with no transition at all
    graph.add_nodes_from(data.label.unique())

    labels = data['label'].tolist()
    days = data['eod_day'].tolist()

    # Count transitions between consecutive records that satisfy:
    #   1) the move is to a different location (label), and
    #   2) both records share the same `eod_day`.
    transition_counts = {}

    # NOTE(review): the bound len(data) - 1 is kept from the original code, so
    # the final row never ends a transition. This may be a deliberate way to
    # skip a trailing home record or an off-by-one -- confirm before changing.
    for position in range(1, len(data) - 1):
        origin = labels[position - 1]
        destination = labels[position]

        if days[position - 1] == days[position] and origin != destination:
            key = (origin, destination)
            transition_counts[key] = transition_counts.get(key, 0) + 1

    for (origin, destination), count in transition_counts.items():
        if reverse:
            graph.add_edge(destination, origin, weight=count)
        else:
            graph.add_edge(origin, destination, weight=count)

    return graph


def get_pageranks(data):
    """Return the weighted PageRank of each location label in ``data``.

    Edges point from the earlier location to the later one and are weighted
    by the number of observed transitions.

    Parameters:
        data: pandas DataFrame with ``label`` and ``eod_day`` columns.

    Returns:
        The result of ``nx.pagerank`` on the transition graph.
    """
    return nx.pagerank(_build_transition_graph(data), weight='weight')


def get_reverse_pageranks(data):
    """Return the weighted PageRank of each label with every edge reversed.

    Same transition counts as ``get_pageranks``, but each edge points from
    the later location back to the earlier one.

    Parameters:
        data: pandas DataFrame with ``label`` and ``eod_day`` columns.

    Returns:
        The result of ``nx.pagerank`` on the reversed transition graph.
    """
    return nx.pagerank(_build_transition_graph(data, reverse=True), weight='weight')

# ----- 
# original https://stackoverflow.com/a/1518632/80738
def get_most_common(lst):
    """Return the value that occurs most often in ``lst``.

    Raises ValueError when ``lst`` is empty.
    """
    # tally every value in a single pass
    occurrences = {}
    for value in lst:
        occurrences[value] = occurrences.get(value, 0) + 1
    # pick the distinct value with the highest tally
    return max(set(lst), key=occurrences.get)

def _safe_ratio(part, whole):
    # Share of `part` in `whole` as a float; 0.0 when `whole` is zero.
    if whole == 0:
        return 0.0
    return part * 1.0 / whole


def get_features(df_u, home_label, user_id):
    """Build one feature row per location cluster for a single user.

    Features per cluster (label):
      - Check-in rate
      - Check-in rate during midnight
      - Check-in rate at the last destination of a day
      - Check-in rate at the last destination of an inactive day
      - Cluster centroid, residential flag, PageRank and reverse PageRank
      - Distance from the centroid to the most checked-in cluster

    Parameters:
        df_u: pandas DataFrame with one user's check-ins. The columns read
            here are ``label``, ``hour``, ``eod_day``, ``home_confirmed``,
            ``local_datetime``, ``latitude`` and ``longitude``.
        home_label: cluster label regarded as the user's home.
        user_id: value copied into the ``user_id`` field of every row.

    Returns:
        A list of dicts, one per distinct label in ``df_u``.
    """
    # all unique place labels
    all_labels = df_u.label.unique()

    hours = df_u['hour']

    # tweet subsets for different features
    # SUBSET 1: Midnight checkins
    df_midnight_checkins = df_u[(hours >= MIDNIGHT_CHECKIN_START) & (hours <= MIDNIGHT_CHECKIN_END)]

    # SUBSET 2: Last destination checkins -- the last evening/late-night
    # candidate of every end-of-day day
    df_end_of_day_checkin_candidates = df_u[(hours >= END_OF_DAY_CHECKIN_START) | (hours <= END_OF_DAY_CHECKIN_END)]
    df_end_of_day_checkin = df_end_of_day_checkin_candidates.groupby('eod_day', sort=False).tail(1)

    # SUBSET 3: Last destination with inactive midnight -- the last check-in
    # of every day, kept only when it did NOT happen after midnight.
    # The test uses `hour`: `eod_hour` adds 24 to after-midnight hours, so
    # comparing it against END_OF_DAY_CHECKIN_END was always true.
    last_checkin_of_each_day = df_u.groupby('eod_day', sort=False).tail(1)
    df_end_of_inactive_day_checkin = last_checkin_of_each_day[
        last_checkin_of_each_day['hour'] > END_OF_DAY_CHECKIN_END]

    # number of checkins for different features
    num_of_total_checkins = len(df_u.index)
    num_of_total_midnight_checkins = len(df_midnight_checkins.index)
    num_of_total_end_of_day_checkins = len(df_end_of_day_checkin)
    num_of_total_end_of_inactive_day_checkins = len(df_end_of_inactive_day_checkin)

    # calculate number of days covered by the check-ins
    first_checkin = df_u.head(1)
    last_elem = df_u.tail(1)
    is_home_labeled = last_elem['home_confirmed'].values[0] == 't' and last_elem['label'].values[0] == home_label

    if is_home_labeled:
        # use the one before the last element
        # NOTE(review): presumably the trailing home-confirmed row is not a
        # regular check-in -- confirm against the data export.
        last_checkin = df_u.tail(2).head(1)
    else:
        last_checkin = last_elem

    time_diff = last_checkin['local_datetime'].values[0] - first_checkin['local_datetime'].values[0]
    number_of_days = time_diff / np.timedelta64(1, 'D')

    # find checkin rate
    # NOTE(review): number_of_days is 0 when the first and last check-ins
    # share a timestamp; confirm the input guarantees a non-zero span.
    total_check_in_rate = num_of_total_checkins / number_of_days

    normal_pageranks = get_pageranks(df_u)
    reverse_pageranks = get_reverse_pageranks(df_u)

    def count_with_label(frame, wanted_label):
        # number of rows of `frame` that belong to cluster `wanted_label`
        return len(frame.loc[frame['label'] == wanted_label].index)

    dataset_list = []

    # for finding the most checked-in location
    max_checkin_ratio = 0.0
    max_checkin_location_lat = 0
    max_checkin_location_lon = 0

    for label in all_labels:
        all_checkins_with_current_label = df_u.loc[df_u['label'] == label]
        num_of_current_label_checkins = len(all_checkins_with_current_label.index)

        check_in_rate = _safe_ratio(num_of_current_label_checkins, num_of_total_checkins)
        midnight_check_in_rate = _safe_ratio(
            count_with_label(df_midnight_checkins, label),
            num_of_total_midnight_checkins)
        end_of_day_checkin_rate = _safe_ratio(
            count_with_label(df_end_of_day_checkin, label),
            num_of_total_end_of_day_checkins)
        end_of_inactive_day_checkin_rate = _safe_ratio(
            count_with_label(df_end_of_inactive_day_checkin, label),
            num_of_total_end_of_inactive_day_checkins)

        # calculate centroid of the cluster and check whether it is within
        # a residential polygon
        centroid_lat = np.mean(all_checkins_with_current_label.latitude)
        centroid_lon = np.mean(all_checkins_with_current_label.longitude)

        is_residential = point_within_polygons(centroid_lat, centroid_lon, shapely_residential_collection)

        datasetRow = {'checkin_ratio': check_in_rate,
                      'midnight_ratio': midnight_check_in_rate,
                      'end_of_day_ratio': end_of_day_checkin_rate,
                      'end_of_inactive_day_ratio': end_of_inactive_day_checkin_rate,
                      'centroid_lat': centroid_lat, 'centroid_lon': centroid_lon,
                      'is_residential': is_residential,
                      'number_of_checkins_at_this_location': num_of_current_label_checkins,
                      'num_of_total_checkins': num_of_total_checkins,
                      'daily_total_checkin_rate': total_check_in_rate,
                      # both distances are filled in after the loop
                      'meter_distance_to_most_checked_in': 0,
                      'page_rank': normal_pageranks[label],
                      'reverse_page_rank': reverse_pageranks[label],
                      'kilometer_distance_to_most_checked_in': 0,
                      'user_id': user_id,
                      # compare by value: `is` on ints only matches for the
                      # small integers the interpreter happens to cache
                      'is_home': label.item() == home_label}

        dataset_list.append(datasetRow)

        if max_checkin_ratio < check_in_rate:
            max_checkin_ratio = check_in_rate
            max_checkin_location_lat = centroid_lat
            max_checkin_location_lon = centroid_lon

    # calculate distance to most checked-in location for all rows
    most_checked_in_point = (max_checkin_location_lat, max_checkin_location_lon)

    for item in dataset_list:
        cluster_centroid_point = (item['centroid_lat'], item['centroid_lon'])
        distance_in_meters = great_circle(cluster_centroid_point, most_checked_in_point).meters
        item['meter_distance_to_most_checked_in'] = distance_in_meters
        item['kilometer_distance_to_most_checked_in'] = distance_in_meters / 1000.0

    return dataset_list

#### - Load and enhance data

In [6]:
## all_data.csv contains anonymized ~1.2 million tweets from 1268 users
parse_dates = ['utc_datetime', 'local_datetime']
df = pd.read_csv('all_data.csv',low_memory=False, parse_dates=parse_dates)
df["hour"] = [t.hour for t in pd.DatetimeIndex(df.local_datetime)]

# eod means End Of Day
df["eod_hour"] = [t.hour+24 if t.hour <= END_OF_DAY_CHECKIN_END else t.hour for t in pd.DatetimeIndex(df.local_datetime)]
df["eod_day"] = [date_string(t-timedelta(days=1)) if t.hour <= END_OF_DAY_CHECKIN_END else date_string(t) for t in pd.DatetimeIndex(df.local_datetime)]


df.sort_values('id',inplace=True)

## We use only one users' data to generate machine learning model attributes
##   after getting this done, other users will be quite straigtforward.

num_of_rows = len(df.index)
print num_of_rows, ' rows loaded.'

1222971  rows loaded.


#### - Iterate through users - clean data, generate features, and export all

In [7]:
# generate a feature set list
feature_set_list = []

unique_user_ids = df.user_id.unique()

index = 1

total_tweets = len(df.index)
total_tweets_after_cleaning = 0

for selected_user_id in unique_user_ids:
    print 'user #:',index
    
    # get only data from this user
    df_user = df.loc[df['user_id'] == selected_user_id] 
    
    # clean data for repeated entries
    df_user = clean_location_footprints(df_user) 
    
    total_tweets_after_cleaning = total_tweets_after_cleaning + len (df_user.index)
    
    # apply clustering algorithm to enhance data further
    coords = df_user.as_matrix(columns=['latitude', 'longitude'])
    epsilon = DISTANCE_RESOLUTION_IN_METER / METER_PER_RADIAN
    db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_
    
    # add cluster IDs as a property called 'label'
    df_user = df_user.assign(label=pd.Series(cluster_labels).values)
    
    # mark tweets that is confirmed as home
    home_confirmed_tweets = df_user.loc[df_user['home_confirmed'] == 't']
    
    # there is a possibility that home tweets will be assigned to multiple clusters
    # we chose the most commonly seen one
    home_label = get_most_common(home_confirmed_tweets['label'].tolist())
    
    features = get_features(df_user, home_label, index)

    feature_set_list.extend(features)
    index = index + 1
    

df_entire_dataset = pd.DataFrame(feature_set_list)

print 'before_cleaning', total_tweets
print 'after_cleaning', total_tweets_after_cleaning

user #: 1
user #: 2
user #: 3
user #: 4
user #: 5
user #: 6
user #: 7
user #: 8
user #: 9
user #: 10
user #: 11
user #: 12
user #: 13
user #: 14
user #: 15
user #: 16
user #: 17
user #: 18
user #: 19
user #: 20
user #: 21
user #: 22
user #: 23
user #: 24
user #: 25
user #: 26
user #: 27
user #: 28
user #: 29
user #: 30
user #: 31
user #: 32
user #: 33
user #: 34
user #: 35
user #: 36
user #: 37
user #: 38
user #: 39
user #: 40
user #: 41
user #: 42
user #: 43
user #: 44
user #: 45
user #: 46
user #: 47
user #: 48
user #: 49
user #: 50
user #: 51
user #: 52
user #: 53
user #: 54
user #: 55
user #: 56
user #: 57
user #: 58
user #: 59
user #: 60
user #: 61
user #: 62
user #: 63
user #: 64
user #: 65
user #: 66
user #: 67
user #: 68
user #: 69
user #: 70
user #: 71
user #: 72
user #: 73
user #: 74
user #: 75
user #: 76
user #: 77
user #: 78
user #: 79
user #: 80
user #: 81
user #: 82
user #: 83
user #: 84
user #: 85
user #: 86
user #: 87
user #: 88
user #: 89
user #: 90
user #: 91
user #: 

In [9]:
# Write the per-cluster feature table for all users to a CSV file.
df_entire_dataset.to_csv('training_test_set_data_export.csv')