# Fine-Scale Prediction of People's Home Location using Social Media Footprints

_**Authors:** Hamdi Kavak, Daniele Vernon-Bido, and Jose Padilla_

_**Submitted:** SBP-BRIMS 2018 on January 11, 2018._

## Repeated 5-fold validation

#### - Imports

In [1]:
### Home Location Prediction Paper ###########
# Task: Training and test
# Author: Hamdi Kavak
# Created: January 03, 2017
#########################################

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from random import shuffle
from sklearn import metrics

#### - Validation function

In [2]:
def _best_weighted_row(user_rows, weights, feature_columns, class_column):
    """Return ``(score, label)`` for the highest-scoring row of one user.

    The score of a row is ``sum(weights * features)``; the intercept is not
    added. Ties keep the first row encountered. When ``user_rows`` is empty,
    or no row scores above the -999999 sentinel, ``(-999999, False)`` is
    returned.
    """
    best_score = -999999
    best_label = False

    # Pull the values out once instead of using DataFrame.iterrows(), which
    # builds a new Series for every row.
    feature_matrix = user_rows[feature_columns].values.astype(float)
    labels = user_rows[class_column].values

    for features, label in zip(feature_matrix, labels):
        score = sum(weights * features)
        if score > best_score:
            best_score = score
            best_label = label

    return best_score, best_label


def _score_users(frame, ids, weights, treshold, decision_boundary,
                 feature_columns, class_column):
    """Score every user in ``ids`` and collect the predictions that count.

    A user is "covered" when their best row score is >= ``treshold``. For
    covered users the prediction is ``best score >= decision_boundary`` and
    the expected value is the class label of that best row.

    Returns ``(covered_count, expected, predicted, best_scores)`` where
    ``best_scores`` holds the best score of every user, covered or not.
    """
    covered_count = 0
    expected = []
    predicted = []
    best_scores = []

    for uid in ids:
        user_rows = frame.loc[frame['user_id'] == uid]
        score, label = _best_weighted_row(user_rows, weights,
                                          feature_columns, class_column)
        best_scores.append(score)

        if score >= treshold:
            covered_count += 1
            predicted.append(score >= decision_boundary)
            expected.append(label)

    return covered_count, expected, predicted, best_scores


def evaluate(training_df, feature_columns, class_column, number_of_repeats=5, k_fold=5, evaluation_treshold_diff=None):
    """Run repeated k-fold validation of a linear SVC, split by user.

    All rows of every test user are scored (including non-home rows). The
    row with the highest weighted score is taken as that user's candidate,
    and the user is dropped from the metrics when that score is below the
    treshold.

    Parameters:
        training_df: DataFrame with a ``user_id`` column, the feature
            columns and the class column.
        feature_columns: list of column names used as model features.
        class_column: name of the label column.
        number_of_repeats: how many times the k-fold split is repeated, each
            time after reshuffling the user ids.
        k_fold: number of folds per repeat.
        evaluation_treshold_diff: optional offset added to the intercept to
            form the coverage treshold. ``None`` uses the intercept itself.

    Returns:
        dict with two entries:
        - ``'k_fold'``: averages over all folds (``avg_coverage`` in percent,
          ``avg_accuracy``, ``avg_f1``, ``avg_precision``, ``avg_recall``)
          and ``max_point_stat`` with the ``min``/``max``/``median``/``avg``
          of the per-user best scores.
        - ``'avg_model'``: coverage and metrics of the model built from the
          averaged weights and intercept, scored on every user in
          ``training_df`` (so it includes users the folds were trained on),
          plus the averaged ``weights`` and ``intercept``.

    NOTE(review): scores are ``w . x`` without the intercept and are compared
    against the intercept itself, whereas SVC's own decision rule is
    ``w . x + intercept >= 0``. Kept exactly as the original analysis did it;
    confirm this is intended.
    NOTE(review): assumes a binary problem (a single intercept value).
    """
    # get all unique user ids
    user_ids = training_df.user_id.unique()

    # create a k-fold object; shuffling is done manually once per repeat
    kf = KFold(n_splits=k_fold)

    max_points = []
    covarege_percentages = []
    accuracy_list = []
    f1_list = []
    precision_list = []
    recall_list = []
    weights_list = []
    intercept_list = []

    for repeat in range(number_of_repeats):
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('repeat: ' + str(repeat))
        # shuffle user ids for repeated k-fold cross validation
        shuffle(user_ids)

        # splitting is per user, so the folds are built over user ids
        for train_index, test_index in kf.split(user_ids):
            training_ids = user_ids[train_index]
            test_ids = user_ids[test_index]

            # Use the function's own arguments; the original read the
            # notebook globals df / feature_names / class_name here.
            df_training = training_df[training_df.user_id.isin(training_ids)]
            df_test = training_df[training_df.user_id.isin(test_ids)]

            # train the model
            model = SVC(kernel='linear', class_weight='balanced')
            model.fit(df_training[feature_columns], df_training[class_column])

            # Read the fitted parameters once per fold rather than per row.
            fold_weights = model.coef_.ravel()
            fold_intercept = model.intercept_[0]
            weights_list.append(fold_weights)
            intercept_list.append(fold_intercept)

            # The original referenced avg_intercept here, which is only
            # assigned after all folds have run, so any non-None offset
            # raised UnboundLocalError. The fold's own intercept is the
            # value available at this point.
            if evaluation_treshold_diff is None:
                treshold = fold_intercept
            else:
                treshold = fold_intercept + evaluation_treshold_diff

            users_covered, expected_list, predicted_list, fold_scores = _score_users(
                df_test, test_ids, fold_weights, treshold, fold_intercept,
                feature_columns, class_column)
            max_points.extend(fold_scores)

            # NOTE(review): when no test user reaches the treshold these
            # lists are empty and the metrics are computed on empty input.
            accuracy_list.append(metrics.accuracy_score(expected_list, predicted_list))
            f1_list.append(metrics.f1_score(expected_list, predicted_list))
            precision_list.append(metrics.precision_score(expected_list, predicted_list))
            recall_list.append(metrics.recall_score(expected_list, predicted_list))

            covarege_percentages.append(users_covered*100.0/len(test_ids))

    # Average the per-fold models into a single weight vector and intercept.
    avg_weight = np.mean(weights_list, axis=0)
    avg_intercept = np.mean(intercept_list)

    if evaluation_treshold_diff is None:
        treshold = avg_intercept
    else:
        treshold = avg_intercept + evaluation_treshold_diff

    final_users_covered, final_expected_list, final_predicted_list, _ = _score_users(
        training_df, user_ids, avg_weight, treshold, avg_intercept,
        feature_columns, class_column)

    final_return_dict = {}

    final_return_dict['k_fold'] = {'avg_coverage':np.mean(covarege_percentages),
                                   'avg_accuracy':np.mean(accuracy_list),
                                   'avg_f1':np.mean(f1_list),
                                   'avg_precision':np.mean(precision_list),
                                   'avg_recall':np.mean(recall_list),
                                   'max_point_stat':{'min':np.min(max_points),
                                                     'max':np.max(max_points),
                                                     'median':np.median(max_points),
                                                     # was np.max in the original
                                                     'avg':np.mean(max_points)}}

    final_return_dict['avg_model'] = {'coverage':final_users_covered*100.0/len(user_ids),
                                      'accuracy':metrics.accuracy_score(final_expected_list, final_predicted_list),
                                      'f1':metrics.f1_score(final_expected_list, final_predicted_list),
                                      'precision':metrics.precision_score(final_expected_list, final_predicted_list),
                                      'recall':metrics.recall_score(final_expected_list, final_predicted_list),
                                      'weights':avg_weight,
                                      'intercept':avg_intercept}

    return final_return_dict

#### - Prepare feature pairs to be evaluated

In [3]:
# Display name and short code for each feature column, keyed by column name.
feature_dict = {
    'end_of_day_ratio': dict(name='End of Day Ratio', code='EDR'),
    'checkin_ratio': dict(name='Checkin Ratio', code='CR'),
    'end_of_inactive_day_ratio': dict(name='End of Inactive Day Ratio', code='EIDR'),
    'midnight_ratio': dict(name='Midnight Ratio', code='MR'),
    'page_rank': dict(name='PageRank', code='PR'),
    'reverse_page_rank': dict(name='Reverse PageRank', code='RPR'),
    'is_residential': dict(name='Land Use', code='LU'),
    'kilometer_distance_to_most_checked_in': dict(
        name='Kilometer Distance to most checked in', code='KM'),
}

# Feature sets to evaluate, listed in evaluation order.
feature_pairs = [
    # every feature on its own
    ['end_of_day_ratio'],
    ['checkin_ratio'],
    ['end_of_inactive_day_ratio'],
    ['midnight_ratio'],
    ['page_rank'],
    ['reverse_page_rank'],
    ['kilometer_distance_to_most_checked_in'],
    ['is_residential'],

    # each feature combined with land use
    ['end_of_day_ratio', 'is_residential'],
    ['checkin_ratio', 'is_residential'],
    ['end_of_inactive_day_ratio', 'is_residential'],
    ['midnight_ratio', 'is_residential'],
    ['page_rank', 'is_residential'],
    ['reverse_page_rank', 'is_residential'],
    ['kilometer_distance_to_most_checked_in', 'is_residential'],

    # pairs that start with end_of_day_ratio
    ['end_of_day_ratio', 'checkin_ratio'],
    ['end_of_day_ratio', 'end_of_inactive_day_ratio'],
    ['end_of_day_ratio', 'midnight_ratio'],
    ['end_of_day_ratio', 'page_rank'],
    ['end_of_day_ratio', 'reverse_page_rank'],
    ['end_of_day_ratio', 'kilometer_distance_to_most_checked_in'],

    # pairs that start with checkin_ratio
    ['checkin_ratio', 'end_of_inactive_day_ratio'],
    ['checkin_ratio', 'midnight_ratio'],
    ['checkin_ratio', 'page_rank'],
    ['checkin_ratio', 'reverse_page_rank'],
    ['checkin_ratio', 'kilometer_distance_to_most_checked_in'],

    # pairs that start with end_of_inactive_day_ratio
    ['end_of_inactive_day_ratio', 'midnight_ratio'],
    ['end_of_inactive_day_ratio', 'page_rank'],
    ['end_of_inactive_day_ratio', 'reverse_page_rank'],
    ['end_of_inactive_day_ratio', 'kilometer_distance_to_most_checked_in'],

    # pairs that start with midnight_ratio
    ['midnight_ratio', 'page_rank'],
    ['midnight_ratio', 'reverse_page_rank'],

    # all features except land use
    ['end_of_day_ratio', 'checkin_ratio', 'end_of_inactive_day_ratio',
     'midnight_ratio', 'page_rank', 'reverse_page_rank',
     'kilometer_distance_to_most_checked_in'],
]


# Input/output folders and the name of the label column.
training_set_folder = 'training_test_set/'
feature_scores_folder = 'feature_scores/'
class_name = 'is_home'

#### - Evaluate features

In [4]:
# Evaluate every feature set in feature_pairs on each input file, then write
# a CSV summary of the k-fold metrics and append the raw results to a log.
filenames = ['training_test_set......csv']

for afile in filenames:
    df = pd.read_csv(training_set_folder+afile)

    num_of_rows = len(df.index)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(str(num_of_rows) + '  rows loaded.')

    all_results = []
    rows_list = []

    for feature_names in feature_pairs:
        print(feature_names)
        result = evaluate(df, feature_names, class_name)
        all_results.append({'features':feature_names, 'results':result})

        kfold_result = result['k_fold']

        # One output row per feature set: readable label, short code, and
        # the metrics averaged over all folds.
        rows_list.append({
            'Feature': '+'.join(feature_dict[col]['name'] for col in feature_names),
            'Short Form': '+'.join(feature_dict[col]['code'] for col in feature_names),
            'Accuracy': kfold_result['avg_accuracy'],
            'Precision': kfold_result['avg_precision'],
            'Recall': kfold_result['avg_recall'],
            'F1': kfold_result['avg_f1'],
            'Coverage': kfold_result['avg_coverage'],
        })

    # The columns argument fixes the column order of the CSV.
    df_features = pd.DataFrame(rows_list, columns=['Feature', 'Short Form', 'Accuracy',
                                                   'Precision','Recall','F1','Coverage'])

    # write the summary table
    df_features.to_csv(feature_scores_folder + 'weighted_score_all_features_' + afile, index=False)

    # Append the full result dictionaries to the running log; the with-block
    # closes the file even if a write fails.
    with open(feature_scores_folder+"feature_training_outputs_w.txt", "a") as log_file:
        log_file.write("\n\nBy number " + afile+ " \n")
        log_file.write(str(all_results))


78812  rows loaded.
['end_of_day_ratio']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['checkin_ratio']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['end_of_inactive_day_ratio']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['midnight_ratio']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['page_rank']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['reverse_page_rank']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['kilometer_distance_to_most_checked_in']
repeat: 0


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


repeat: 1
repeat: 2
repeat: 3
repeat: 4
['is_residential']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['end_of_day_ratio', 'is_residential']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['checkin_ratio', 'is_residential']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['end_of_inactive_day_ratio', 'is_residential']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['midnight_ratio', 'is_residential']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['page_rank', 'is_residential']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['reverse_page_rank', 'is_residential']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['kilometer_distance_to_most_checked_in', 'is_residential']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['end_of_day_ratio', 'checkin_ratio']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['end_of_day_ratio', 'end_of_inactive_day_ratio']
repeat: 0
repeat: 1
repeat: 2
repeat: 3
repeat: 4
['end_of_day_ratio', 'midnight_ratio']
repeat: 0
repeat: 1
re