In [1]:
import requests, zipfile, StringIO
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
import pickle
from copy import deepcopy
import seaborn as sns
# Plotting Options
# Apply seaborn's "whitegrid" style to the figures drawn later in the notebook.
sns.set_style("whitegrid")
# NOTE(review): no plot exists yet at this point, and this cell's only output is
# an empty Figure repr, so this call appears to have no lasting effect - confirm
# whether it should be moved next to the actual plotting code.
sns.despine()

<matplotlib.figure.Figure at 0x113e14490>

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize


In [3]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation, metrics

In [4]:
from sklearn.metrics import log_loss

In [5]:
import warnings
warnings.filterwarnings("ignore") 
from IPython.core.debugger import Tracer

In [6]:
def read_csv_zip(filename):
    """Load a CSV stored inside the archive ``<filename>.zip``.

    Parameters
    ----------
    filename : str
        Name of the CSV member inside the archive.  The archive itself is
        looked up at ``filename + '.zip'``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.  ``index_col=False`` keeps the first column as a
        regular column instead of promoting it to the index.
    """
    # Context managers make sure both the archive and the member stream are
    # closed even if parsing fails (the handles used to be leaked).
    with zipfile.ZipFile(filename + '.zip') as archive:
        with archive.open(filename) as member:
            return pd.read_csv(member, index_col=False)
# Load the train and test tables from train.csv.zip / test.csv.zip
# (paths are relative to the current working directory).
train = read_csv_zip("train.csv")
test = read_csv_zip("test.csv")
# 1% random sample of the training rows; no seed is set, so it differs per run.
train_sample = train.sample(frac=0.01)

In [7]:
def get_datetime(df):
    """Add calendar features derived from the ``time`` column, in place.

    ``time`` is multiplied by 60 and interpreted as seconds since the Unix
    epoch (i.e. the raw values are treated as minutes).  The following
    columns are added to ``df``: ``time_dt``, ``year``, ``month``,
    ``day_of_week`` (Monday == 0), ``hour`` and ``day_of_year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``time`` column.  It is modified in place.

    Returns
    -------
    None
    """
    timestamps = pd.to_datetime(df["time"] * 60, unit='s')
    df["time_dt"] = timestamps
    # The vectorised .dt accessor yields the same values as a per-row
    # ``apply(lambda ...)`` but avoids one Python call per row and feature.
    fields = timestamps.dt
    df["year"] = fields.year
    df["month"] = fields.month
    df["day_of_week"] = fields.weekday
    df["hour"] = fields.hour
    df["day_of_year"] = fields.dayofyear

In [8]:
# Work on a 10% random sample of the training rows (this rebinds the 1%
# train_sample drawn in the loading cell).
# NOTE(review): no random seed is passed, so the sample changes on every run.
train_sample= train.sample(frac=0.1) #switch to full training data


In [9]:
# 10% random sample of the test rows.
# NOTE(review): unseeded, and only these sampled rows get predictions later on.
test_sample = test.sample(frac=0.1)

In [10]:
# Split the sampled training rows into roughly 80% train / 20% validation using
# a random boolean mask.
# NOTE(review): the mask is unseeded, so the split differs between runs.
test_data = test_sample
msk = np.random.rand(len(train_sample)) < 0.8
# Take explicit copies: later cells add columns to these frames in place, and
# mutating a boolean-mask selection directly is flagged by pandas as a
# chained-assignment hazard.
train_data = train_sample[msk].copy()
val_data = train_sample[~msk].copy()

In [11]:
def unique_place_bins(df,bin_size):
    """Bin check-ins on a regular grid and count distinct places per bin.

    Adds ``x_bin`` and ``y_bin`` columns to ``df`` in place.  Each coordinate
    is scaled by ``bin_size / max(coordinate)`` and floored, so bin indices
    run from 0 to ``bin_size - 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``x``, ``y`` and ``place_id`` columns.  Modified in place.
    bin_size : int
        Number of bins along each axis.

    Returns
    -------
    pandas.DataFrame
        One row per occupied ``(x_bin, y_bin)`` pair with the number of
        distinct ``place_id`` values in that bin (column ``place_id``).
    """
    for axis in ("x", "y"):
        raw_bins = df[axis] * bin_size // df[axis].max()
        # A coordinate equal to the maximum floors to ``bin_size`` itself,
        # which used to create an extra, nearly empty bin; fold it into the
        # last regular bin instead.
        df[axis + "_bin"] = raw_bins.clip(upper=bin_size - 1)
    dfbin = df.groupby(["x_bin","y_bin"],as_index=False).agg({"place_id": "nunique"})
    return dfbin

In [12]:
def tot_checkin_bins(df):
    """Count the rows (check-ins) falling into each ``(x_bin, y_bin)`` pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already carries ``x_bin`` and ``y_bin`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``x_bin``, ``y_bin`` and ``0``; column ``0`` holds the row
        count of the bin.
    """
    # Group with the keys in the index so that ``size()`` always returns a
    # Series.  With ``as_index=False`` newer pandas returns a DataFrame here,
    # and the following ``reset_index()`` would add a spurious ``index``
    # column and rename the counts.
    counts = df.groupby(["x_bin","y_bin"]).size()
    return counts.reset_index()

In [13]:
# Count distinct places per spatial bin (bin_size=100).  The call also adds
# x_bin / y_bin columns to train_data as a side effect.
number_places = unique_place_bins(train_data,100)

In [14]:
# Preview the first per-bin distinct-place counts.
number_places.head()

Unnamed: 0,x_bin,y_bin,place_id
0,0,0,32
1,0,1,39
2,0,2,43
3,0,3,45
4,0,4,34


In [15]:
def plot_heatmaps(number_places,i):
    """Draw a filled contour map of a per-bin statistic.

    Parameters
    ----------
    number_places : pandas.DataFrame
        Long-format frame with ``x_bin``, ``y_bin`` and one value column
        (for example the output of ``unique_place_bins``).
    i : int
        Matplotlib figure number to draw into.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Keyword arguments: positional index/columns are rejected by recent
    # pandas releases.
    grid = number_places.pivot(index="x_bin", columns="y_bin")
    # The pivoted columns are a (value column, y_bin) MultiIndex; level 1
    # holds the y bin labels.  Rows of the pivot are the x bin labels.
    y_bins = grid.columns.levels[1].values
    x_bins = grid.index.values
    values = grid.values
    # Both meshes have shape (len(x_bins), len(y_bins)), matching ``values``.
    y_mesh, x_mesh = np.meshgrid(y_bins, x_bins)
    plt.figure(i,figsize=(8,6))
    plt.contourf(x_mesh, y_mesh, values, alpha=0.7, cmap=plt.cm.jet)
    plt.colorbar()
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.show()



In [16]:
# Add the calendar columns (time_dt, year, month, day_of_week, hour,
# day_of_year) to each split; get_datetime modifies the frames in place.
get_datetime(train_data)
get_datetime(test_data)
get_datetime(val_data)

In [17]:
# Inspect the training split after the bin and calendar columns were added.
train_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,x_bin,y_bin,time_dt,year,month,day_of_week,hour,day_of_year
1320825,1320825,4.5485,4.5476,419,182344,6383875308,45,45,1970-05-07 15:04:00,1970,5,3,15,127
27814796,27814796,1.7018,3.5148,5,297348,1152196899,17,35,1970-07-26 11:48:00,1970,7,6,11,207
28644887,28644887,5.3055,1.2691,56,691349,8187823707,53,12,1971-04-26 02:29:00,1971,4,0,2,116
28755646,28755646,8.3078,5.2137,16,583604,4819761395,83,52,1971-02-10 06:44:00,1971,2,2,6,41
5208006,5208006,3.9429,2.5444,47,712698,2438895977,39,25,1971-05-10 22:18:00,1971,5,0,22,130


In [18]:
def numerical_feature_preprocess(continuous_features,train,test,val):
    """Standardise the continuous columns of all three splits, in place.

    Mean and standard deviation are estimated on ``train`` only and then
    applied to ``train``, ``test`` and ``val``.

    Parameters
    ----------
    continuous_features : list of str
        Names of the numeric columns to scale.
    train, test, val : pandas.DataFrame
        Frames containing those columns.  The columns are overwritten with
        their scaled values.  Empty ``test`` / ``val`` frames are skipped.

    Returns
    -------
    None
    """
    columns = list(continuous_features)
    if not columns:
        return
    scaler = StandardScaler()
    # Fit once on the 2-D block.  StandardScaler computes its statistics per
    # column, so this matches fitting every column separately, and it avoids
    # passing a 1-D Series, which recent scikit-learn versions reject.
    scaler.fit(train[columns].values.astype(float))
    for frame in (train, test, val):
        if len(frame) == 0:
            # Nothing to transform.
            continue
        scaled = scaler.transform(frame[columns].values.astype(float))
        for position, col in enumerate(columns):
            frame[col] = scaled[:, position]
def categorical_feature_preprocess(categorical_feature,dataset):
    """One-hot encode the requested columns and return them side by side.

    Parameters
    ----------
    categorical_feature : list of str
        Columns of ``dataset`` to expand into indicator columns.
    dataset : pandas.DataFrame
        Source frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of ``pd.get_dummies`` applied to each
        requested column, in the order given.  Indicator columns are named
        after the raw category values (no feature prefix is added).
    """
    indicator_blocks = [pd.get_dummies(dataset[name]) for name in categorical_feature]
    return pd.concat(indicator_blocks, axis=1)

In [19]:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
def modelfit(alg, dtrain, predictors,dtest,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an XGBoost classifier and return class probabilities for ``dtest``.

    Parameters
    ----------
    alg : XGBClassifier
        Estimator to fit.  When ``useTrainCV`` is true its ``n_estimators``
        is overwritten with the number of rounds kept by cross-validation.
    dtrain : pandas.DataFrame
        Training rows; must contain ``predictors`` and the integer target
        column ``place_encoded``.
    predictors : list of str
        Feature column names.
    dtest : pandas.DataFrame
        Rows to predict; must contain ``predictors``.
    useTrainCV : bool
        Run ``xgb.cv`` with early stopping first to pick the number of
        boosting rounds.
    cv_folds : int
        Number of cross-validation folds.
    early_stopping_rounds : int
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    Whatever ``alg.predict_proba(dtest[predictors])`` returns (the
    per-class probabilities for every test row).
    """
    labels = dtrain['place_encoded']

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgb_param['num_class'] = labels.nunique()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=labels.values)
        # NOTE(review): ``show_progress`` is the keyword of the xgboost release
        # this notebook was run with; later releases are believed to call it
        # ``verbose_eval`` - confirm against the installed version.
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=['mlogloss'],
                          early_stopping_rounds=early_stopping_rounds,
                          show_progress=False)
        # One row per boosting round that survived early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], labels, eval_metric='mlogloss')

    # Predict training set (hard labels only; the per-class probabilities of
    # the training rows were computed before but never used, so that pass has
    # been dropped).
    dtrain_predictions = alg.predict(dtrain[predictors])

    # Print model report (single-argument print works on Python 2 and 3).
    print("\nModel Report")
    print("Accuracy (Train): %.4g" % metrics.accuracy_score(labels.values, dtrain_predictions))

    test_prediction = alg.predict_proba(dtest[predictors])
    return test_prediction

In [20]:
def process_cell(train_data,val_data,test_data):
    """Train a classifier on one grid cell and predict its test rows.

    Reads the module-level list ``continuous_features`` for the feature
    columns, so that name must be defined before this function is called.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows of the cell; needs ``place_id`` and the feature columns.
    val_data : pandas.DataFrame
        Validation rows of the cell.  They are scaled together with the other
        splits but are not scored.
    test_data : pandas.DataFrame
        Test rows of the cell; needs the feature columns.

    Returns
    -------
    numpy.ndarray
        One row per test row holding the ``place_id`` values of the most
        probable classes, best first.  At most 3 columns; fewer when the
        cell contains fewer than 3 distinct places.
    """
    # Work on private copies so the caller's frames are neither scaled nor
    # extended in place; calling this twice on the same data no longer scales
    # it twice.
    train_data = train_data.copy()
    val_data = val_data.copy()
    test_data = test_data.copy()

    place_encoded = LabelEncoder()
    train_data["place_encoded"] = place_encoded.fit_transform(train_data["place_id"])

    numerical_feature_preprocess(continuous_features,train_data,test_data,val_data)

    # Named ``clf`` so the local does not shadow the ``xgb`` module.
    clf = XGBClassifier(learning_rate = 0.01,n_estimators = 1000, max_depth = 4, min_child_weight = 6, gamma = 0,
                        subsample = 0.8, colsample_bytree=0.8,reg_alpha=0.005,objective= 'multi:softprob',
                        nthread=4, scale_pos_weight=1, seed=27)
    test_prediction = modelfit(clf,train_data,continuous_features,test_data)

    # Column indices of the highest probabilities per row, best first.
    top_classes = np.argsort(test_prediction, axis=1)[:,::-1][:,:3]
    # Map encoded classes back to place ids by indexing ``classes_`` directly;
    # ``inverse_transform`` only accepts 1-D input on recent scikit-learn.
    pred_labels = place_encoded.classes_[top_classes]
    return pred_labels
    
    
    

In [None]:
def _cell_mask(df, x_hi, y_hi, x_step, y_step):
    """Boolean mask of the rows of ``df`` inside the closed cell ending at (x_hi, y_hi)."""
    return ((df["x"] >= (x_hi - x_step)) & (df["x"] <= x_hi) &
            (df["y"] >= (y_hi - y_step)) & (df["y"] <= y_hi))


def process_grid(train_data,val_data,test_data,x_step,y_step,xy_range,output_path="fb_checkin.csv"):
    """Fit one model per spatial grid cell and write the combined predictions.

    The plane is tiled with cells of size ``x_step`` x ``y_step`` whose upper
    edges are the values of ``xy_range`` (used for both axes).  Cells are
    closed on both sides, so a point lying exactly on a shared edge is
    selected for both neighbouring cells and the prediction of the cell
    visited last wins.

    Parameters
    ----------
    train_data, val_data, test_data : pandas.DataFrame
        Splits with ``x`` and ``y`` columns plus everything ``process_cell``
        needs.
    x_step, y_step : float
        Cell width and height.
    xy_range : iterable of float
        Upper cell edges along each axis.
    output_path : str
        Where the submission CSV is written.

    Returns
    -------
    pandas.Series
        Named ``place_id``; each value is the three predicted place ids
        joined by spaces.  It is indexed by the ``row_id`` column of
        ``test_data`` when that column exists, otherwise by row position.
        Test rows whose cell has no training data keep the placeholder
        ``"0 0 0"``.
    """
    preds = np.zeros((test_data.shape[0], 3), dtype=np.int64)
    for x in xy_range:
        for y in xy_range:
            train_data_cell = train_data.loc[_cell_mask(train_data, x, y, x_step, y_step)]
            val_data_cell = val_data.loc[_cell_mask(val_data, x, y, x_step, y_step)]
            test_mask = _cell_mask(test_data, x, y, x_step, y_step)
            test_data_cell = test_data.loc[test_mask]

            # Nothing to learn from or nothing to predict: skip instead of
            # letting the model code fail on an empty frame.
            if len(train_data_cell) == 0 or len(test_data_cell) == 0:
                continue

            # Row positions (not index labels) of this cell's test rows, so
            # the write below is correct whatever index test_data carries.
            test_positions = np.flatnonzero(test_mask.values)
            pred_labels = process_cell(train_data_cell,val_data_cell,test_data_cell)
            preds[test_positions] = pred_labels

    pre_result = pd.DataFrame(preds,columns=["l1","l2","l3"]).astype(str)
    result = pre_result.l1.str.cat([pre_result.l2,pre_result.l3],sep = " ")
    result.name = "place_id"
    # Label rows with their real identifiers.  The positional index only
    # equals row_id when the complete, unshuffled test set is passed in.
    if "row_id" in test_data.columns:
        result.index = test_data["row_id"].values
    result.to_csv(output_path,index=True, header=True, index_label = "row_id")
    return result
    
    

In [None]:
# NOTE(review): hard-coded absolute local path; it is only referenced by the
# commented-out to_csv call in the last cell.
file_path = "/Users/ChiYuan/Documents/python/Kaggle/Facebook Checkin/"
# Feature columns; process_cell reads this list as a module-level global.
continuous_features = ["x","y","time","year","month","hour","day_of_year","day_of_week","accuracy"]
# Grid cells are 0.5 x 0.5; xy_range holds the upper edge of each cell
# (0.5, 1.0, ..., 10.0) and is used for both axes.
x_step = 0.5
y_step = 0.5
xy_range = np.linspace(0.5, 10.0, 20)
# Give every split a fresh 0..n-1 index; process_grid uses the test index
# values as row positions when storing predictions.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
# Trains one model per grid cell and writes fb_checkin.csv.
result = process_grid(train_data,val_data,test_data,x_step,y_step,xy_range)


Will train until cv error hasn't decreased in 50 rounds.



Model Report
Accuracy (Train): 0.6791


Will train until cv error hasn't decreased in 50 rounds.



Model Report
Accuracy (Train): 0.6412


Will train until cv error hasn't decreased in 50 rounds.



Model Report
Accuracy (Train): 0.6652


Will train until cv error hasn't decreased in 50 rounds.



Model Report
Accuracy (Train): 0.6575


Will train until cv error hasn't decreased in 50 rounds.


In [None]:
#result.to_csv(file_path+"fb_checkin.csv",index=True, header=True, index_label = "row_id")
    #print result.head()