In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import keras
from IPython.display import SVG
from keras.optimizers import Adam
from keras.utils.vis_utils import model_to_dot
from sklearn.model_selection import train_test_split
import lib
import pickle

Using TensorFlow backend.


In [2]:
def load_data(file_pref = '') -> (pd.DataFrame, pd.DataFrame):
    users = pd.read_csv(file_pref+'Users.csv', sep=';', dtype={'User_id': str, 'age': str})
    users.dropna(axis=0,inplace=True)
    users.age = pd.to_numeric(users.age, errors='coerce').fillna(30).astype(np.int64)
    users.loc[users.age>80, "age"] = 25
    users['User_id'] = users['User_id'].astype(np.int64)
    users.set_index('User_id', inplace=True)

    finder_decisions = pd.read_csv(file_pref+'Finder_decisions_.csv', sep=';')#, nrows=10000)
    finder_decisions.drop(finder_decisions[(~finder_decisions['Receiver_id'].isin(users.index))].index.values, inplace=True)
    finder_decisions.drop(finder_decisions[(~finder_decisions['Sender_id'].isin(users.index))].index.values, inplace=True)
    return users, finder_decisions

users, finder_decisions = load_data('')

In [3]:
print(finder_decisions.shape)
finder_decisions.head()

(16369297, 4)


Unnamed: 0,Decision,Number of Records,Receiver_id,Sender_id
0,like,1,3020120157,3019224295
1,like,1,3020143161,3021346119
2,like,1,3004593177,3021346119
3,like,1,3020177581,3021346119
4,like,1,3020193527,3019224295


In [4]:
print(users.shape)
users.head()

(82391, 3)


Unnamed: 0_level_0,Photo,age,gender
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3025414901,https://r.phts.io/hilybu9d0lpf7mngw8gg8s8gwg8w...,30,1
3024640165,https://r.phts.io/hilybu44zkz4sc01ogsskc0480w8...,62,1
3022146285,https://r.phts.io/hilybu7gufdsc9wf8k444w84okkw...,23,2
3020392717,https://r.phts.io/hilybu760vygze57cwow0cswggkg...,20,2
3023036685,https://r.phts.io/hilybu5shyp459tl0ksgcws48sw0...,28,1


In [14]:
sender_value_counts = finder_decisions['Sender_id'].value_counts()
decisions =     finder_decisions.drop(
        finder_decisions.index[finder_decisions['Sender_id'].isin(sender_value_counts.index[sender_value_counts < 3])])
del sender_value_counts

In [15]:
users['key'] = np.arange(0,users.shape[0])
#user_by_index=pd.DataFrame({"User_id":users.index.values}, users["key"])
#index_by_user=pd.DataFrame({"key":user_by_index.index}, user_by_index.User_id)

users_data = users[['age', 'gender', 'key']]
decisions = decisions.merge(users_data, how='left', left_on='Sender_id', right_index=True)
decisions.rename(columns={'age': 'Sender_age', 'gender': 'Sender_gender', 'key': 'Sender_key'}, inplace=True)
decisions = decisions.merge(users_data, how='left', left_on='Receiver_id', right_index=True)
decisions.rename(columns={'age': 'Receiver_age', 'gender': 'Receiver_gender', 'key': 'Receiver_key'},inplace=True)
decisions['Decision'] = (decisions['Decision'] == 'like')+1

In [16]:
decisions.head()

Unnamed: 0,Decision,Number of Records,Receiver_id,Sender_id,Sender_age,Sender_gender,Sender_key,Receiver_age,Receiver_gender,Receiver_key
0,2,1,3020120157,3019224295,33,1,19310,33,2,71522
1,2,1,3020143161,3021346119,19,2,6526,23,1,22061
2,2,1,3004593177,3021346119,19,2,6526,22,1,75255
3,2,1,3020177581,3021346119,19,2,6526,20,1,15735
4,2,1,3020193527,3019224295,33,1,19310,20,2,66367


In [17]:
from sklearn.cluster import KMeans
from scipy.sparse import coo_matrix, lil_matrix

def extract_sparce_matrix(decisions):
    for (rindex, d) in decisions.iterrows():
         yield (d['Sender_key'], d['Receiver_key'], d['Decision'])

dim=users.shape[0]
S = lil_matrix((dim, dim))
for (i,j,d) in extract_sparce_matrix(decisions):
    S[i,j] = d

labeler = KMeans(n_clusters=100, max_iter=200, verbose=True)
labeler.fit(S.tocsr())

Initialization complete
Iteration  0, inertia 23013804.000
Iteration  1, inertia 18633460.464
Iteration  2, inertia 18145492.941
Iteration  3, inertia 17914958.183
Iteration  4, inertia 17797598.549
Iteration  5, inertia 17734050.151
Iteration  6, inertia 17698168.786
Iteration  7, inertia 17678646.303
Iteration  8, inertia 17664203.740
Iteration  9, inertia 17649676.839
Iteration 10, inertia 17634964.057
Iteration 11, inertia 17625432.835
Iteration 12, inertia 17618402.290
Iteration 13, inertia 17610397.386
Iteration 14, inertia 17600819.174
Iteration 15, inertia 17592706.967
Iteration 16, inertia 17587101.841
Iteration 17, inertia 17582086.119
Iteration 18, inertia 17579002.718
Iteration 19, inertia 17577419.930
Iteration 20, inertia 17575326.436
Iteration 21, inertia 17573777.049
Iteration 22, inertia 17572363.879
Iteration 23, inertia 17571537.162
Iteration 24, inertia 17570678.021
Iteration 25, inertia 17570185.005
Iteration 26, inertia 17569795.111
Iteration 27, inertia 17569428.

Iteration 40, inertia 17519466.839
Iteration 41, inertia 17519462.497
Iteration 42, inertia 17519457.173
Iteration 43, inertia 17519456.448
Iteration 44, inertia 17519456.282
Converged at iteration 44: center shift 0.000000e+00 within tolerance 4.273602e-07
Initialization complete
Iteration  0, inertia 23053016.000
Iteration  1, inertia 18514508.340
Iteration  2, inertia 18074733.509
Iteration  3, inertia 17880962.974
Iteration  4, inertia 17797675.380
Iteration  5, inertia 17747484.869
Iteration  6, inertia 17710030.919
Iteration  7, inertia 17685023.094
Iteration  8, inertia 17670404.809
Iteration  9, inertia 17660203.291
Iteration 10, inertia 17652273.630
Iteration 11, inertia 17646748.313
Iteration 12, inertia 17639750.499
Iteration 13, inertia 17636433.916
Iteration 14, inertia 17634392.175
Iteration 15, inertia 17632297.933
Iteration 16, inertia 17630781.377
Iteration 17, inertia 17629106.829
Iteration 18, inertia 17627194.101
Iteration 19, inertia 17625040.333
Iteration 20, iner

Iteration  3, inertia 17751464.103
Iteration  4, inertia 17663262.820
Iteration  5, inertia 17603586.427
Iteration  6, inertia 17564145.078
Iteration  7, inertia 17537172.597
Iteration  8, inertia 17515297.128
Iteration  9, inertia 17501644.459
Iteration 10, inertia 17494027.828
Iteration 11, inertia 17488874.584
Iteration 12, inertia 17484549.763
Iteration 13, inertia 17477859.527
Iteration 14, inertia 17467548.273
Iteration 15, inertia 17459113.929
Iteration 16, inertia 17454854.573
Iteration 17, inertia 17451613.609
Iteration 18, inertia 17448833.565
Iteration 19, inertia 17446649.140
Iteration 20, inertia 17445201.205
Iteration 21, inertia 17443955.229
Iteration 22, inertia 17442912.150
Iteration 23, inertia 17442006.242
Iteration 24, inertia 17441108.768
Iteration 25, inertia 17440209.465
Iteration 26, inertia 17438921.468
Iteration 27, inertia 17438290.987
Iteration 28, inertia 17437420.401
Iteration 29, inertia 17434699.302
Iteration 30, inertia 17433184.727
Iteration 31, inerti

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=200,
    n_clusters=100, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [18]:
with open('labeler100clusters.pickle', 'wb') as f:
    pickle.dump(labeler, f)
    f.close()

In [19]:
clusters=pd.DataFrame({"cluster":labeler.labels_}, np.arange(0,len(labeler.labels_)))
decisions = decisions.merge(clusters, how='left', left_on='Sender_key', right_index=True)
users = users.merge(clusters, how='left', left_on='key', right_index=True)

In [41]:
cluster_reciever=decisions.groupby(["cluster", "Receiver_id"], as_index=False)["Decision"].mean()

answer = finder_decisions[["Decision","Receiver_id","Sender_id"]]
answer["Decision"] = (answer["Decision"] == "like") + 1
answer = answer.merge(users[["cluster"]], how="left", left_on="Sender_id", right_index=True )
answer.head()

Unnamed: 0,Decision,Receiver_id,Sender_id,cluster
0,2,3020120157,3019224295,36
1,2,3020143161,3021346119,19
2,2,3004593177,3021346119,19
3,2,3020177581,3021346119,19
4,2,3020193527,3019224295,36


In [42]:

answer2 = pd.merge(answer, cluster_reciever,  how='left', left_on=['cluster','Receiver_id'], right_on = ['cluster','Receiver_id'])

In [44]:
answer2[answer2.Decision_x == 1+(answer2.Decision_y>1.5)].shape[0]/answer2.shape[0]

0.7918021769658159

In [23]:
import datetime
now = datetime.datetime.now()

print (now)

2018-02-25 04:05:08.519460
