In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn import cross_validation as cv
from sklearn import svm
from sklearn import ensemble
from sklearn import linear_model

In [2]:
# Location of the raw training data, relative to the notebook's directory.
TRAIN_CSV = '../data/raw/train.csv'

# Load the whole training set into memory and report its (rows, columns)
# shape as a quick sanity check on the load.
train = pd.read_csv(TRAIN_CSV)
print(train.shape)

(29118021, 6)


In [3]:
# Number of distinct target labels in the training set. `uniq` is reused
# further down as the cluster count for K-Means.
place_ids = train['place_id']
uniq = place_ids.nunique()
print(uniq)

108390


* The number of unique values is huge. This suggests centering basis functions on the centers of clusters discovered in the data. Could we discover those cluster centers via K-Means?

In [4]:
# Materialise the frame as a NumPy array once and slice it twice. Evaluating
# `train.values` separately for each slice builds the full array twice, which
# is costly for a frame of this size.
train_values = train.values

# Inputs: every column except the last one.
# NOTE(review): this still includes the leading `row_id` column, which carries
# no predictive information -- confirm whether it should be dropped here.
train_X = train_values[:, :-1]

# Targets: the last column (`place_id`).
train_t = train_values[:, -1]

print(train_X.shape)
print(train_t.shape)

(29118021L, 5L)
(29118021L,)


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
train.describe()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
count,29118020.0,29118020.0,29118020.0,29118020.0,29118020.0,29118020.0
mean,14559010.0,4.99977,5.001814,82.84912,417010.4,5493787000.0
std,8405649.0,2.857601,2.887505,114.7518,231176.1,2611088000.0
min,0.0,0.0,0.0,1.0,1.0,1000016000.0
25%,7279505.0,2.5347,2.4967,27.0,203057.0,3222911000.0
50%,14559010.0,5.0091,4.9883,62.0,433922.0,5518573000.0
75%,21838520.0,7.4614,7.5103,75.0,620491.0,7764307000.0
max,29118020.0,10.0,10.0,1033.0,786239.0,9999932000.0


In [6]:
# Peek at the first rows to see what the columns look like.
train.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [None]:
# Peek at the last rows of the frame.
train.tail()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
29118016,29118016,6.5133,1.1435,67,399740,8671361106
29118017,29118017,5.9186,4.4134,67,125480,9077887898
29118018,29118018,2.9993,6.368,67,737758,2838334300
29118019,29118019,4.0637,8.0061,70,764975,1007355847
29118020,29118020,7.4523,2.0871,17,102842,7028698129


* Null Hypothesis: the plotted joints are identical

In [None]:
# NOTE(review): both bar plots of per-place check-in counts are disabled.
# Presumably one bar per distinct place_id (108390 of them) is too slow or
# unreadable -- confirm before re-enabling, or plot only the top-N counts.
# train['place_id'].value_counts().plot(kind='bar')

# train['place_id'].value_counts().plot(kind='barh')

In [None]:
# Histogram of the `accuracy` column (50 bins, KDE switched off) with a rug
# marking the individual observations along the axis.
sb.distplot(
    train['accuracy'],
    bins=50,
    kde=False,
    rug=True,
);

In [None]:
# Same column as a density estimate only: histogram switched off, rug kept.
sb.distplot(
    train['accuracy'],
    hist=False,
    rug=True,
);

In [None]:
# Hex-binned joint distribution of the two coordinates, drawn with the
# "white" axes style applied only inside this block.
with sb.axes_style("white"):
    sb.jointplot(
        x=train['x'],
        y=train['y'],
        kind="hex",
        color="k",
    );

* We have p = 0.068, so the null hypothesis is rejected only at the 10% significance level; at the conventional 5% level it cannot be rejected.

In [None]:
# Hex-binned joint distribution of accuracy against time, drawn with the
# "white" axes style applied only inside this block.
with sb.axes_style("white"):
    sb.jointplot(
        x=train['accuracy'],
        y=train['time'],
        kind="hex",
        color="k",
    );

* We have p = 0, hence the null hypothesis does not hold
* We can also observe that as time passes, we mostly observe that accuracy falls in 3 distinct ranges

# 1. Analysis

## Notes

### Essential questions

* Did you specify the type of data analytic question (e.g. exploration, association causality) before touching the data?
    * We are trying to order the places (i.e by their likelihood) based on the following measurements from the dataset: coordinates, accuracy (?), time (?) and place_id.

* Did you define the metric for success before beginning?
    * The metric is Mean Average Precision (What is this?)

* Did you understand the context for the question and the scientific or business application?
    * We are building a system that ranks a list of places given 'coords', 'accuracy' and 'time'. The purpose might be to enable specific ads (e.g., interesting places around the hotel) to be shown to the person (on FB?) depending on this list.

* Did you record the experimental design?
    * Given.

* Did you consider whether the question could be answered with the available data?
    * We need to further explore 'accuracy' and check whether we can identify different clusters of users - we don't know if the data was generated by 1 person or many, so we need to check its structure.

### Checking the data

* Null values?
    * No!

* What do we know of the measurements?
    * First column is ID and is useless.

    * Second and Third are coords., they are in kilometers and are floating point. Min is (0,0) and max is (10,10);

    * Fourth column is accuracy. Range is (1, 1033) and seems to follow a power law distribution. We assume that this is the accuracy of the location given by the GPS. This claim is supported by the fact that the data comes from a mobile device, which is able to give location but this information is sometimes not accurate (i.e in buildings), so we would like to know what is the accuracy of the reading. In order to convert this into real accuracy, we need to normalize the column and assign it values of (1 - current_val).

    * The fifth column is time given as a timestamp. What patterns are there?

    * Last column is the class label (`place_id`), given as an integer.

# 2. Pre-processing

In [None]:
# Min-max scale every column between the first and the last one (i.e.
# everything except the leading id and the trailing label) to [0, 1], then
# invert `accuracy` so that it becomes 1 - scaled value.
#
# NOTE(review): this cell overwrites `train` in place and is not idempotent.
# Running it a second time applies `1 - accuracy` to the already inverted
# column and flips it back. Reload `train` before re-running.
col_headers = list(train.columns.values)
print(col_headers)

feature_cols = col_headers[1:-1]
features = train[feature_cols]
col_min = features.min()
col_range = features.max() - col_min
train[feature_cols] = (features - col_min) / col_range

train['accuracy'] = 1 - train['accuracy']

In [None]:
# Summary statistics again, to check the value ranges after the scaling cell.
train.describe()

In [None]:
# First rows after the scaling cell has rewritten the feature columns.
train.head()

In [None]:
# Last rows after the scaling cell has rewritten the feature columns.
train.tail()

## 2.1 K-Means clustering

In [None]:
# Settings for the mini-batch K-Means run below.
K = uniq                 # one cluster per distinct place_id
clusters = range(K)      # cluster indices 0 .. K-1
batch_size = 10000       # passed to MiniBatchKMeans as batch_size
n_init = 100             # passed to MiniBatchKMeans as n_init

In [None]:
# Seeded generator so the random centre initialisation is reproducible. It
# has to be handed to the estimator explicitly, otherwise it has no effect.
random_state = np.random.RandomState(0)

# Feature matrix for clustering: every column between the leading id and the
# trailing label, as rewritten by the pre-processing cell. `X` was previously
# never defined, so this cell failed with a NameError.
# NOTE(review): assumes the scaled x / y / accuracy / time columns are the
# intended input rather than the unscaled `train_X` -- confirm.
X = train[train.columns[1:-1]].values

mbk = MiniBatchKMeans(init='random', n_clusters=K, batch_size=batch_size,
                      n_init=n_init, max_no_improvement=10, verbose=0,
                      random_state=random_state)

# fit_transform fits the model and returns X in cluster-distance space, one
# column per cluster.
# NOTE(review): with K equal to the number of distinct place_ids and the full
# training set as input, that result is n_samples x K and is unlikely to fit
# in memory -- consider fitting on a sample or using a much smaller K.
X_kmeans = mbk.fit_transform(X)
print("Done!")

In [None]:
# Shape of the cluster-distance matrix returned by fit_transform.
print(X_kmeans.shape)