In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from matplotlib import pyplot as plt
import sklearn
import sklearn.cluster
%matplotlib inline

In [2]:
# Training set: read the full CSV, then keep only the coordinate columns
# (in lon, lat order), which is all the clustering below uses.
train = pd.read_csv('../train.csv')
train = train[['lon', 'lat']]

In [3]:
# Test set: read the full CSV, then keep only the coordinate columns
# (in lon, lat order), mirroring how the training set is loaded.
test = pd.read_csv('../test.csv')
test = test[['lon', 'lat']]

In [4]:
# Array of [lon, lat] float pairs, one row per training point; this is the
# input the clustering model is fitted on.
lonlat = train.values
cluster_learn = np.array(
    [[float(lon), float(lat)] for lon, lat in lonlat]
)

In [5]:
# Fit k-means (420 clusters, k-means++ seeding, fixed seed for repeatability)
# on the training coordinates and keep the cluster index of every training row.
# `n_jobs` is deliberately not passed: it was deprecated in scikit-learn 0.23
# and removed in 1.0, where passing it raises a TypeError.
clst = sklearn.cluster.KMeans(init='k-means++', random_state=1337, n_clusters=420)
ids = clst.fit_predict(cluster_learn)

In [6]:
# Centroids of the fitted model, one row per cluster. Column 0 is lon and
# column 1 is lat, matching the [lon, lat] layout of the training array.
centers = clst.cluster_centers_

In [7]:
# Persist the training cluster assignments as a single `cluster_id` column;
# the default integer index is written as the first (unnamed) CSV column.
pd.DataFrame(
    data={'cluster_id': ids},
).to_csv(path_or_buf="train_cluster_ids.csv")

In [8]:
# Array of [lon, lat] float pairs, one row per *test* point.
# It must be built from the test rows, not from `cluster_learn`: converting
# the training array here would make the later predict() call score the
# training coordinates and write training assignments into the test file.
lonlat = test.values
cluster_get = np.array(
    [[float(lon), float(lat)] for lon, lat in lonlat]
)

In [9]:
# Assign every row of `cluster_get` to a cluster of the already-fitted model.
# NOTE: this rebinds `ids`, which until now held the training assignments.
ids = clst.predict(cluster_get)

In [10]:
# Persist the predicted cluster assignments as a single `cluster_id` column;
# the default integer index is written as the first (unnamed) CSV column.
pd.DataFrame(
    data={'cluster_id': ids},
).to_csv(path_or_buf="test_cluster_ids.csv")

In [11]:
import pygeoplot as gp
from IPython.display import Image

In [12]:
# Table of cluster centers; `centers` holds lon in column 0 and lat in column 1.
hm = pd.DataFrame({'lon' : centers[:, 0], 'lat' : centers[:, 1]})
# Map center = mean [lat, lon] of the first 10 centers only.
# `.loc` is used for the label-based column selection because the `.ix`
# indexer was removed in pandas 1.0; the selected values are the same.
map_center = list(hm[:10].loc[:, ['lat', 'lon']].mean())

In [13]:
# Create a pygeoplot map and add the cluster centers from `hm` to it,
# reading latitude from the 'lat' column and longitude from the 'lon' column.
points_map = gp.Map()
gp.placemarks_from_df(points_map, hm, lat_col='lat', lng_col='lon')
# NOTE(review): the arguments look like (center, zoom level) — confirm
# against the pygeoplot API.
points_map.set_state(map_center, 5)

In [14]:
# Save the map to 'cluster_centers.html' in the working directory.
points_map.save_html('cluster_centers.html')

In [15]:
# Persist the cluster-center coordinates; with to_csv defaults the DataFrame
# index is written as the first (unnamed) column.
hm.to_csv("center_coords.csv")

# Vectorize: one-hot encode the cluster ids

In [16]:
# Reload the saved cluster-id files. From here on `train` and `test` refer to
# these id tables, not to the coordinate frames loaded at the top.
train, test = map(pd.read_csv, ['train_cluster_ids.csv', 'test_cluster_ids.csv'])

In [17]:
# Stack the train ids on top of the test ids so both are encoded with one
# shared vocabulary. Each part keeps its own index, so index labels repeat in
# the result; only the values are consumed below.
train_and_test = pd.concat([train['cluster_id'], test['cluster_id']])

In [18]:
from sklearn.feature_extraction import DictVectorizer

# Dense integer one-hot encoder. The dtype is the builtin `int`: `np.int` was
# only an alias for it and was removed in NumPy 1.24.
vectorizer = DictVectorizer(sparse=False, dtype=int)

# Ids are stringified so the vectorizer treats each one as a category and
# emits one "cluster_id=<id>" indicator column per distinct id, instead of a
# single numeric feature.
data_dict = ({'cluster_id': str(cluster_id)} for cluster_id in train_and_test)

# fit_transform runs first, so feature_names_ is populated by the time the
# column labels are read.
train_and_test_vectorized = pd.DataFrame(
    vectorizer.fit_transform(data_dict),
    columns=vectorizer.feature_names_,
)

In [19]:
# Preview the first five rows of the one-hot encoded frame.
train_and_test_vectorized.head(5)

Unnamed: 0,cluster_id=0,cluster_id=1,cluster_id=10,cluster_id=100,cluster_id=101,cluster_id=102,cluster_id=103,cluster_id=104,cluster_id=105,cluster_id=106,...,cluster_id=90,cluster_id=91,cluster_id=92,cluster_id=93,cluster_id=94,cluster_id=95,cluster_id=96,cluster_id=97,cluster_id=98,cluster_id=99
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# The first len(train) rows of the stacked frame are the training rows.
train_and_test_vectorized.iloc[:len(train)].to_csv("train_clusters_vectorized.csv")

In [21]:
# Everything after the first len(train) rows of the stacked frame is the test part.
train_and_test_vectorized.iloc[len(train):].to_csv("test_clusters_vectorized.csv")