# Facebook Check-Ins

In [165]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from sklearn import neighbors, datasets
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier as KNN

In [166]:
#Read data
train = pd.read_csv('~/Documents/School/Kaggle Decal/kaggle_fa16/datasets/facebook/train.csv')
test = pd.read_csv('~/Documents/School/Kaggle Decal/kaggle_fa16/datasets/facebook/test.csv')

In [116]:
#Take a look at the datasets
print("Training data size is" + str(train.shape))
print("Testing data size is" + str(test.shape))

print(train['accuracy'].describe())
print(train.head(3))
print(test.head(3))

Training data size is(29118021, 6)
Testing data size is(8607230, 5)
count    2.911802e+07
mean     8.284912e+01
std      1.147518e+02
min      1.000000e+00
25%      2.700000e+01
50%      6.200000e+01
75%      7.500000e+01
max      1.033000e+03
Name: accuracy, dtype: float64
   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
   row_id       x       y  accuracy    time
0       0  0.1675  1.3608       107  930883
1       1  7.3909  2.5301        35  893017
2       2  8.0978  2.3473        62  976933


In [117]:
train.describe()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
count,29118020.0,29118020.0,29118020.0,29118020.0,29118020.0,29118020.0
mean,14559010.0,4.99977,5.001814,82.84912,417010.4,5493787000.0
std,8405649.0,2.857601,2.887505,114.7518,231176.1,2611088000.0
min,0.0,0.0,0.0,1.0,1.0,1000016000.0
25%,7279505.0,2.5347,2.4967,27.0,203057.0,3222911000.0
50%,14559010.0,5.0091,4.9883,62.0,433922.0,5518573000.0
75%,21838520.0,7.4614,7.5103,75.0,620491.0,7764307000.0
max,29118020.0,10.0,10.0,1033.0,786239.0,9999932000.0


In [167]:
#Separate spatial data into grid for computation in smaller bits
#Code mostly taken from 'Sandro' on Kaggle Kernels

def prepare_data(df, n_cell_x, n_cell_y):
#     """
#     Feature engineering and computation of the grid.
#     """
    #Creating the grid
    size_x = 10. / n_cell_x #divide 10 (x values in [0,10]) by number of cells you wish to generate by x
    size_y = 10. / n_cell_y #divide by 10 (y values in [0, 10]) by number of cells you wish to generate by y
    eps = 0.00001 #why do eps? Just so for values less than this, just do 0 b/c too small anyways? 
    xs = np.where(df.x.values < eps, 0, df.x.values - eps)
    ys = np.where(df.y.values < eps, 0, df.y.values - eps)
    pos_x = (xs / size_x).astype(np.int64) #changing position for x into new grid 
    pos_y = (ys / size_y).astype(np.int64) #changing position for y into new grid
    df['grid_cell'] = (pos_y * n_cell_x + pos_x) #the grid # (how many y's up (so # of rows), then add x (column))
    
    #Feature engineering
    fw = [500, 1000, 4, 3, 1./22., 2, 10] #feature weights (black magic here), to better feature engineer
        #each fw represents x, y, hour, weekday, day, month, year changing weights
        #Note: Changed y fw from 1000 to 750 b/c exceeds int64 and produces negative. like too big
        #thing to keep in mind: n_cell_x can't be too big, then will have negative values due to data type overflow
        
    df.x = df.x.values * fw[0]
    df.y = df.y.values * fw[1]
    initial_date = np.datetime64('2014-01-01T01:01', dtype='datetime64[m]') #arbitrary date
    d_times = pd.DatetimeIndex(initial_date + np.timedelta64(int(mn), 'm') 
                               for mn in df.time.values)    
    df['hour'] = d_times.hour * fw[2]
    df['weekday'] = d_times.weekday * fw[3]
    df['day'] = (d_times.dayofyear * fw[4]).astype(int)
    df['month'] = d_times.month * fw[5]
    df['year'] = (d_times.year - 2013) * fw[6]

    #f = df.drop(['time'], axis=1) #drop time b/c converted into time
    return df

In [168]:
#Also code adapted from 'Sandro'
#jk most of this stuff is mine now

def process_one_cell(df_train, df_test, grid_id, threshold, n_cell_x, n_cell_y):
    """   
    Throw in a training dataset and it will split it into local training and testing sets, and 
    do a KNN classification inside one grid cell.
    """   
    
    #Working on df_train to train ONE CELL (from grid_cell column)
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id] #gets all the data within one grid
    place_counts = df_cell_train.place_id.value_counts()
        #getting counts of places in that cell
    
    mask = (place_counts[df_cell_train.place_id.values] >= threshold).values 
    df_cell_train = df_cell_train.loc[mask] #weeds out ID's with less than threshold# cases
    
    #Preparing data
#     le = LabelEncoder()
#     y = le.fit_transform(df_cell_train.place_id.values)
#      X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
#      X_test = df_cell_test.drop(['grid_cell'], axis = 1).values.astype(int)
    
#     #Applying the classifier
#     clf = KNeighborsClassifier(n_neighbors=25, weights='distance', 
#                                metric='manhattan') #algorithm applied on one grid cell
#     clf.fit(X, y)
#     y_pred = clf.predict_proba(X_test)
#     pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3])    
#     return pred_labels, row_ids

    #Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    
    #Saving row ids of test for our output later, which will be along with top 3 predictions
    row_ids = df_cell_test.index
    
    
    features = ['x','y','hour','day','weekday','month','year','accuracy']
    
    train_y = df_cell_train['place_id']
    train_x = df_cell_train[features]

    test_x = df_cell_test[features]

    
    # KNN algorithm and test accuracy
    knn = KNN(15) #15 nearest neighbors
    knn.fit(train_x, train_y) #classifying data based on 15-nearest neighbors
    all_preds = knn.predict_proba(test_x)
    
    #Saving predictions into preds_per_cell
    preds_per_cell = np.zeros((test_x.shape[0], 3), dtype=int)
    for record in range(len(all_preds)):
        top3_idx = all_preds[record].argsort()[-3:][::-1]
        preds = knn.classes_[top3_idx]
        preds_per_cell[record] = preds
           
    train_acc = knn.score(train_x, train_y) # score KNN on train set
    
    return preds_per_cell, row_ids, train_acc

In [169]:
def process_grid(df_train, df_test, threshold, n_cells, n_cell_x, n_cell_y):
    """
    Iterates over all grid cells, return average training and testing accuracies
    """ 
    preds = np.zeros((df_test.shape[0], 3), dtype=int)
    small_train_acc_sum = 0
    #small_test_acc_sum = 0
    
    for grid_id in range(n_cells):
        if grid_id % 100 == 0:
            print('iter: %s' %(grid_id)) #Print iteration per 100 grids
            print(small_train_acc_sum / (grid_id - 1))
        
        #Applying classifier to one grid cell
        pred_labels, row_ids, small_train_acc = process_one_cell(df_train, df_test,
                                                                 grid_id, threshold, 
                                                                 n_cell_x, n_cell_y)
        
        small_train_acc_sum += small_train_acc #add up each training accuracy
        #small_test_acc_sum += small_test_acc #add up each testing accuracy
        
        #Updating predictions           
        preds[row_ids] = pred_labels

    train_acc_avg = small_train_acc_sum/n_cells
    #test_acc_avg = small_test_acc_sum/n_cells
    
    print('Generating submission file ...')
    
    #Auxiliary dataframe with the 3 best predictions for each sample
    df_aux = pd.DataFrame(preds, dtype=str, columns = ['l1', 'l2', 'l3'])  
    
    #Concatenating the 3 predictions for each sample
    ds_sub = df_aux.l1.str.cat([df_aux.l2, df_aux.l3], sep=' ')
    
    #Writting to csv
    ds_sub.name = 'place_id'
    ds_sub.to_csv('sub_knn.csv', index=True, header=True, index_label='row_id') 
    
    return train_acc_avg#, test_acc_avg

## Notes
All you need to call is the process_grid() method with appropriate parameters

- Inside process_grid(), the process_one_cell() method is used on each cell, and 

- Inside each process_one_cell() call, prepare_data() is used 

In [None]:
#Defining the size of the grid
# n_cell_x = 1500
# n_cell_y = 1500
#Take one cell
# gridded = prepare_data(acc_train, n_cell_x, n_cell_y) #acc_train with the grid_cell column
# subset = gridded[gridded['grid_cell'] == 125] #take some random grid cell
# print(subset.shape)
# print(subset.head(3))

In [None]:
# IGNORE THIS BLOCK FOR NOW
#Extract from datasets to build mini training and testing sets
#small_train = acc_train.sample(frac = 0.1, axis = 0, random_state = 123) #10% randomly taken from the entire train dataset
# small_train = acc_train.loc[acc_train['x'] < 0.2, acc_train['y'] < 0.2] #take sliver of map where x < 0.1 (ranges 0-10)
# #print(small_train.head(3))
# small_train.shape

In [None]:
#Run kNN on small data and test it out

#Train test split on small portion of training data
#local_train, local_test = train_test_split(subset, test_size = 0.2)
# local_train_y = local_train['place_id']
# local_train_x = local_train.drop('place_id', axis=1)

# local_test_y = local_test['place_id']
# local_test_x = local_test.drop('place_id', axis=1)

# th = 6 #Keeping place_ids with more than th samples.   
# process_grid(local_train, local_test, th, n_cell_x*n_cell_y)
# #KNN and test accuracy
# knn = KNN(4)
# knn.fit(local_train_x, local_train_y)

# train_acc = knn.score(local_train_x, local_train_y) # score KNN on train set
# test_acc = knn.score(local_test_x, local_test_y) # score KNN on test set

## Implementation

In [170]:
#Required Variables
n_cell_x = 30
n_cell_y = 30
threshold = 3


In [171]:
#First, feature engineering + separating training data into amplified grids for both train and test
df_train = prepare_data(train, n_cell_x, n_cell_y) 
df_test = prepare_data(test, n_cell_x, n_cell_y)

In [154]:
df_train.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,grid_cell,hour,weekday,day,month,year
0,0,397.05,4540.45,54,470702,8523065625,81623,88,18,14,22,10
1,1,2978.35,2398.4,13,186555,1757726713,43078,56,15,5,10,10
2,2,4153.9,3520.35,74,322648,1137537235,63549,8,6,10,16,10
3,3,3683.25,1258.25,65,704587,6567393236,22720,32,3,5,10,20
4,4,2048.05,565.35,31,472130,7440663949,10022,84,0,14,22,10


In [153]:
df_test.head()

Unnamed: 0,row_id,x,y,accuracy,time,grid_cell,hour,weekday,day,month,year
0,0,83.75,680.4,107,930883,12005,44,12,12,20,20
1,1,3695.45,1265.05,35,893017,22721,16,18,11,18,20
2,2,4048.9,1173.65,62,976933,21242,44,3,14,22,20
3,3,499.5,529.55,62,907285,9329,8,6,12,18,20
4,4,333.5,4862.7,40,914399,87320,4,0,12,18,20


In [172]:
# #Then running classification model through process_grid
process_grid(df_train, df_test, threshold, n_cell_x * n_cell_y, n_cell_x, n_cell_y)

iter: 0
-0.0
iter: 100
0.599774807555
iter: 200
0.593966522817
iter: 300
0.593445928144
iter: 400
0.593596084973
iter: 500
0.592632261681
iter: 600
0.592642362994
iter: 700
0.592400075126
iter: 800
0.592141796336
Generating submission file ...


0.59129363488438125

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949
