In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from mpl_toolkits.basemap import Basemap
import importlib
from sklearn import neighbors

%matplotlib inline

# Simple KNN model to predict demographics

In [4]:
# import python module that processes data
import data_processing

In [91]:
importlib.reload(data_processing)

<module 'data_processing' from '/Users/kyu0110/Documents/Kaggle/TalkingData/data_processing.py'>

In [92]:
# read in data
demo_data_train = pd.read_csv('gender_age_train.csv')
demo_data_test = pd.read_csv('gender_age_test.csv')
phone_data = pd.read_csv('phone_brand_device_model.csv')

In [93]:
# replace brand names with english
phone_data = data_processing.translate_brand_names(phone_data)

# any brand
phone_data = data_processing.filter_minor_categories(phone_data, 'phone_brand', 'phone_brand_index', 600)

# any model with fewer than 100 users is minor
phone_data = data_processing.filter_minor_categories(phone_data, 'device_model', 'device_model_index', 100)

In [94]:
phone_data.head()

Unnamed: 0,device_id,phone_brand,device_model,phone_brand_index,device_model_index
0,-8890648629457979026,xiaomi,红米,10,246
1,1277779817574759137,xiaomi,MI 2,10,108
2,5137427614288105724,samsung,Galaxy S4,8,91
3,3669464369358936369,SUGAR,时尚手机,0,0
4,-5019277647504317457,samsung,Galaxy Note 2,8,81


In [95]:
all_data = pd.merge(demo_data_train, phone_data, on='device_id')
all_data.head()

Unnamed: 0,device_id,gender,age,group,phone_brand,device_model,phone_brand_index,device_model_index
0,-8076087639492063270,M,35,M32-38,xiaomi,MI 2,10,108
1,-2897161552818060146,M,35,M32-38,xiaomi,MI 2,10,108
2,-8260683887967679142,M,35,M32-38,xiaomi,MI 2,10,108
3,-4938849341048082022,M,30,M29-31,xiaomi,红米note,10,253
4,245133531816851882,M,30,M29-31,xiaomi,MI 3,10,112


In [96]:
# convert group to index
all_data = data_processing.assign_index(all_data, 'group', 'group_index')

In [100]:
X_train = all_data[['phone_brand_index', 'device_model_index']]
y_train = all_data['group_index']

In [128]:
demo_data_test.tail()

Unnamed: 0,device_id
112066,4280900819321920929
112067,818534825520551359
112068,-8956851351560395765
112069,6097318236795836256
112070,622421180514002079


In [149]:
phone_data = phone_data.drop_duplicates(['device_id'], keep='first')
np.sum(np.isnan(phone_data['device_id']))

Unnamed: 0,device_id,phone_brand,device_model,phone_brand_index,device_model_index
112066,4280900819321920929,HTC,802w,1,0
112067,818534825520551359,xiaomi,红米note,10,253
112068,-8956851351560395765,hammer,T1,0,167
112069,6097318236795836256,hammer,T1,0,167
112070,622421180514002079,xiaomi,MI 2S,10,111


In [148]:
merged_data_test = pd.merge(demo_data_test, phone_data, on='device_id', how='left')

In [150]:
X_test = merged_data_test[['phone_brand_index', 'device_model_index']].as_matrix()

In [151]:
# build model and fit
n = neighbors.KNeighborsClassifier(n_neighbors=5, weights='distance')
n.fit(X_train.as_matrix(), y_train.as_matrix())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='distance')

In [156]:
# make prediction
pred = n.predict_proba(X_test)

In [158]:
# reassign category label based on index
predictions = pd.DataFrame()
predictions['device_id'] = merged_data_test['device_id']
groups = demo_data_train['group'].unique()
groups.sort()
for label in groups:
    predictions[label] = 0

In [163]:
# give a probability of 1 to the KNN predicted group
ind = 0
for p in pred:
    predictions.iloc[ind,1:] = p
    ind += 1

In [164]:
# write to file
predictions.to_csv('knn_first_attemp.csv', index=False)

In [99]:
all_data

Unnamed: 0,device_id,gender,age,group,phone_brand,device_model,phone_brand_index,device_model_index,group_index
0,-8076087639492063270,M,35,M32-38,xiaomi,MI 2,10,108,10
1,-2897161552818060146,M,35,M32-38,xiaomi,MI 2,10,108,10
2,-8260683887967679142,M,35,M32-38,xiaomi,MI 2,10,108,10
3,-4938849341048082022,M,30,M29-31,xiaomi,红米note,10,253,9
4,245133531816851882,M,30,M29-31,xiaomi,MI 3,10,112,9
5,-1297074871525174196,F,24,F24-26,OPPO,R1C,4,140,1
6,236877999787307864,M,36,M32-38,酷派,F2,14,46,10
7,-8098239495777311881,M,38,M32-38,xiaomi,红米note,10,253,10
8,176515041953473526,M,33,M32-38,vivo,X3T,9,179,10
9,1596610250680140042,F,36,F33-42,samsung,Galaxy S4,8,91,4


In [161]:
predictions.head()

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.0,0.2,0.2,0.0,0.0,0.4,0.0,0.0,0.2,0.0,0.0,0.0
1,-1547860181818787117,0.2,0.0,0.0,0.2,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.2
2,7374582448058474277,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.0,0.0,0.4,0.0
3,-6220210354783429585,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.4,0.0
4,-5893464122623104785,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.2,0.4,0.0


In [162]:
pred

array([[ 0. ,  0.2,  0.2, ...,  0. ,  0. ,  0. ],
       [ 0.2,  0. ,  0. , ...,  0. ,  0. ,  0.2],
       [ 0. ,  0. ,  0. , ...,  0. ,  0.4,  0. ],
       ..., 
       [ 0. ,  0. ,  0. , ...,  0.2,  0.6,  0. ],
       [ 0. ,  0. ,  0. , ...,  0.2,  0.6,  0. ],
       [ 0. ,  0. ,  0. , ...,  0.2,  0.2,  0.2]])