In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
# Locations of the CSV splits, relative to the notebook's working directory.
# Only TRAIN_PATH is read in the cells visible here.
TRAIN_PATH = 'small_data/s_train.csv'
VAL_PATH = 'small_data/s_val.csv'
TEST_PATH = 'small_data/s_test.csv'

# Seed for reproducible randomness.
# NOTE(review): presumably intended for train_test_split — not used in the visible cells; confirm.
RANDOM_STATE = 42

In [3]:
# Apply seaborn's default theme to subsequent matplotlib figures.
sns.set()
# Register tqdm's progress-bar variants (e.g. progress_apply) on pandas objects.
tqdm.pandas()

In [30]:
def fill_u_city(df, ego_id):
    """Impute a missing ``city_id_u`` on one ego network's ``u == 0`` rows.

    The replacement is the most frequent known (``> -1``) ``city_id_v`` among
    the rows where ``u == 0`` and ``ego_id`` matches. Only rows whose
    ``city_id_u`` equals the ``-1`` "unknown" sentinel are overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns ``ego_id``, ``u``, ``city_id_u`` and ``city_id_v``.
        It is modified in place.
    ego_id : scalar
        Value of ``ego_id`` selecting the network to process.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call chaining. It is left
        untouched when the network has no ``u == 0`` rows or none of them
        carries a known ``city_id_v``.
    """
    # NOTE(review): u == 0 presumably marks the ego user itself — confirm
    # against the dataset description.
    ego_rows = (df['u'] == 0) & (df['ego_id'] == ego_id)

    city_counts = df.loc[ego_rows, 'city_id_v'].value_counts()
    known_counts = city_counts[city_counts.index > -1]
    if known_counts.empty:
        # Nothing to infer from; idxmax() would raise ValueError on an empty Series.
        return df

    city = known_counts.idxmax()
    # A boolean mask (rather than index labels) keeps the write limited to the
    # selected rows even if the frame's index contains duplicate labels.
    df.loc[ego_rows & (df['city_id_u'] == -1), 'city_id_u'] = city

    return df


def fill_v_city(df, ego_id):
    """Impute missing ``city_id_v`` on one ego network's ``u == 0`` rows.

    Rows where ``u == 0``, ``ego_id`` matches and ``city_id_v`` equals the
    ``-1`` "unknown" sentinel receive the first known (``> -1``)
    ``city_id_u`` found on that network's ``u == 0`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns ``ego_id``, ``u``, ``city_id_u`` and ``city_id_v``.
        It is modified in place.
    ego_id : scalar
        Value of ``ego_id`` selecting the network to process.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call chaining. It is left
        untouched when the network has no ``u == 0`` rows or none of them
        carries a known ``city_id_u``.
    """
    # NOTE(review): u == 0 presumably marks the ego user itself — confirm
    # against the dataset description.
    ego_rows = (df['u'] == 0) & (df['ego_id'] == ego_id)

    ego_cities = df.loc[ego_rows, 'city_id_u']
    # Skip the -1 sentinel and NaN: using either as the fill value would leave
    # the gaps unfilled or turn -1 into NaN.
    known_cities = ego_cities[ego_cities > -1]
    if known_cities.empty:
        return df

    city = known_cities.iloc[0]
    # A boolean mask (rather than index labels) keeps the write limited to the
    # selected rows even if the frame's index contains duplicate labels.
    df.loc[ego_rows & (df['city_id_v'] == -1), 'city_id_v'] = city

    return df

In [31]:
# Read the training split and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=TRAIN_PATH)
df.head(5)

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age_u,city_id_u,sex_u,school_u,university_u,age_v,city_id_v,sex_v,school_v,university_v
0,10,15,0,594.5,0.001801046,0.0,1.0,122.0,812659840.0,2.0,429963652.0,305345287.0,40.0,812659840.0,1.0,-1.0,29142664.0
1,10,0,52,20.0,0.03482469,0.0,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,38.0,592242923.0,2.0,-1.0,880545209.0
2,10,10,31,363.8,8.209834e-15,0.0,0.0,35.0,812659840.0,2.0,-1.0,-1.0,122.0,812659840.0,2.0,765951790.0,305345287.0
3,10,12,61,17.4,0.09022486,0.0,1.0,37.0,812659840.0,1.0,664022210.0,880545209.0,34.0,812659840.0,1.0,-1.0,37436168.0
4,10,67,126,87.9,1.499315e-07,0.0,0.0,43.0,-1.0,2.0,-1.0,-1.0,45.0,812659840.0,2.0,-1.0,-1.0


In [32]:
# Impute missing cities one ego network at a time.
ego_ids = df['ego_id'].unique()

for ego_id in tqdm(ego_ids):
    # fill_v_city runs first, so it only sees city_id_u values that were
    # known before fill_u_city imputes them.
    # NOTE(review): confirm this order is intended.
    df = fill_u_city(fill_v_city(df, ego_id), ego_id)

100%|██████████| 255/255 [00:03<00:00, 65.13it/s]


In [33]:
# Rows with u == 0 whose city_id_v is negative (unknown).
df.loc[df['city_id_v'].lt(0) & df['u'].eq(0)]

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age_u,city_id_u,sex_u,school_u,university_u,age_v,city_id_v,sex_v,school_v,university_v
3702,27,0,115,,0.000000,0.0,1.0,21.0,794218497.0,1.0,-1.0,-1.0,33.0,-1.0,1.0,463457464.0,409736909.0
3770,27,0,25,43.0,0.297734,0.0,1.0,21.0,794218497.0,1.0,-1.0,-1.0,32.0,-1.0,2.0,-1.0,-1.0
3831,27,0,164,,0.000000,0.0,1.0,21.0,794218497.0,1.0,-1.0,-1.0,39.0,-1.0,2.0,-1.0,-1.0
3834,27,0,103,30.1,2.418744,0.0,0.0,21.0,794218497.0,1.0,-1.0,-1.0,32.0,-1.0,2.0,-1.0,-1.0
3843,27,0,105,28.7,0.040808,0.0,0.0,21.0,794218497.0,1.0,-1.0,-1.0,29.0,-1.0,1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541065,8589934981,0,36,53.9,0.696285,0.0,0.0,20.0,427211423.0,1.0,-1.0,-1.0,24.0,-1.0,2.0,-1.0,-1.0
541121,8589934981,0,136,50.6,0.020320,0.0,0.0,20.0,427211423.0,1.0,-1.0,-1.0,19.0,-1.0,1.0,-1.0,-1.0
541122,8589934981,0,173,44.7,0.979250,0.0,0.0,20.0,427211423.0,1.0,-1.0,-1.0,19.0,-1.0,1.0,-1.0,121483716.0
541145,8589934981,0,228,19.1,2.089374,0.0,0.0,20.0,427211423.0,1.0,-1.0,-1.0,19.0,-1.0,1.0,-1.0,-1.0


In [40]:
# Count non-null city_id_u values per (ego_id, u, sex_u, sex_v) combination,
# then show the breakdown for ego_id 27, u 0.
sex = df.groupby(['ego_id', 'u', 'sex_u', 'sex_v'])['city_id_u'].count()
sex.loc[(27, 0)]

sex_u  sex_v
1.0    -1.0      1
        1.0     39
        2.0     33
Name: city_id_u, dtype: int64

In [42]:
# Rows with u == 0 whose sex_v is negative (unknown).
df.loc[df['sex_v'].lt(0) & df['u'].eq(0)]

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age_u,city_id_u,sex_u,school_u,university_u,age_v,city_id_v,sex_v,school_v,university_v
4102,27,0,112,,0.00092,0.0,1.0,21.0,794218497.0,1.0,-1.0,-1.0,33.0,-1.0,-1.0,-1.0,-1.0
43387,157,0,249,,0.0,0.0,1.0,46.0,230691174.0,1.0,983237916.0,448765999.0,-1.0,-1.0,-1.0,-1.0,-1.0
66240,196,0,286,39.5,0.451584,0.0,0.0,30.0,583263187.0,1.0,760988307.0,237132047.0,-1.0,583263187.0,-1.0,-1.0,-1.0
167762,350,0,271,,0.0,0.0,1.0,28.0,19973887.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
202087,444,0,79,177.6,4e-06,0.0,0.0,24.0,160062101.0,1.0,-1.0,138518111.0,27.0,105331516.0,-1.0,466745578.0,-1.0
240608,556,0,247,,0.0,0.0,1.0,61.0,876466243.0,1.0,-1.0,-1.0,84.0,387028443.0,-1.0,-1.0,662064925.0
351664,830,0,63,,0.0,0.0,1.0,14.0,861323792.0,1.0,-1.0,-1.0,9.0,-1.0,-1.0,-1.0,-1.0
360149,8589934616,0,60,,0.661995,0.0,0.0,40.0,258643337.0,2.0,787693834.0,731654302.0,-1.0,258643337.0,-1.0,-1.0,968151405.0
422496,8589934736,0,77,,0.0,0.0,1.0,65.0,425861037.0,2.0,14309101.0,991921978.0,-1.0,425861037.0,-1.0,296669579.0,-1.0
497109,8589934876,0,43,22.4,0.022988,0.0,0.0,35.0,690002492.0,2.0,713788089.0,489333284.0,-1.0,690002492.0,-1.0,-1.0,-1.0


In [45]:
# Flag pairs whose two users share a *known* city. The -1 sentinel marks an
# unknown city, so two unknowns (-1 == -1) must not count as a match.
df['same_city'] = (df['city_id_u'] == df['city_id_v']) & (df['city_id_u'] > -1)

In [24]:
# city_id_u on the u == 0 rows of ego network 10.
df.loc[df['ego_id'].eq(10) & df['u'].eq(0), 'city_id_u']

1      812659840.0
5      812659840.0
17     812659840.0
19     812659840.0
21     812659840.0
          ...     
590    812659840.0
599    812659840.0
630    812659840.0
633    812659840.0
651    812659840.0
Name: city_id_u, Length: 78, dtype: float64

In [16]:
# Number of distinct values in every other column for each
# (ego_id, u, school_u) group.
unique_vals = df.groupby(by=['ego_id', 'u', 'school_u']).nunique()
unique_vals

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,v,t,x1,x2,x3,age_u,city_id_u,sex_u,university_u,age_v,city_id_v,sex_v,school_v,university_v
ego_id,u,school_u,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10,0,-1.0,78,61,67,3,1,1,1,1,1,29,11,2,36,25
10,1,875930901.0,4,4,4,1,1,1,1,1,1,4,3,2,4,3
10,2,232781636.0,18,16,13,1,1,1,1,1,1,13,7,2,12,8
10,3,41581261.0,5,2,3,1,2,1,1,1,1,5,3,1,4,3
10,4,-1.0,4,4,4,1,1,1,1,1,1,3,2,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8589934981,273,-1.0,1,0,1,1,1,1,1,1,1,1,1,1,1,1
8589934981,277,-1.0,2,0,2,1,1,1,1,1,1,1,2,1,1,1
8589934981,279,86062281.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8589934981,284,-1.0,1,0,1,1,1,1,1,1,1,1,1,1,1,1


In [18]:
# Number of distinct values in every other column for each
# (ego_id, u, sex_v) group.
unique_vals = df.groupby(by=['ego_id', 'u', 'sex_v']).nunique()
unique_vals

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,v,t,x1,x2,x3,age_u,city_id_u,sex_u,school_u,university_u,age_v,city_id_v,school_v,university_v
ego_id,u,sex_v,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10,0,1.0,34,26,33,3,1,1,1,1,1,1,19,5,18,17
10,0,2.0,30,25,26,1,1,1,1,1,1,1,18,8,20,14
10,1,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1
10,1,2.0,3,3,3,1,1,1,1,1,1,1,3,2,3,2
10,2,1.0,7,7,6,1,1,1,1,1,1,1,6,3,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8589934981,279,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8589934981,282,1.0,1,1,1,1,1,0,0,0,0,0,1,1,1,1
8589934981,284,1.0,1,0,1,1,1,1,1,1,1,1,1,1,1,1
8589934981,288,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [14]:
# All u == 0 rows of ego network 10.
df.loc[df['ego_id'].eq(10) & df['u'].eq(0)]

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age_u,city_id_u,sex_u,school_u,university_u,age_v,city_id_v,sex_v,school_v,university_v
1,10,0,52,20.0,3.482469e-02,0.0,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,38.0,592242923.0,2.0,-1.0,880545209.0
5,10,0,2,594.5,2.201947e-18,0.0,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,38.0,812659840.0,2.0,232781636.0,880545209.0
17,10,0,27,594.5,1.296749e-22,0.0,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,,,,,
19,10,0,38,30.6,4.175174e-03,0.0,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,33.0,812659840.0,2.0,517078491.0,880545209.0
21,10,0,92,48.5,2.825643e-04,0.0,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,38.0,812659840.0,2.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590,10,0,95,89.1,8.918856e-04,0.0,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,28.0,-1.0,2.0,681795736.0,880545209.0
599,10,0,49,27.7,1.577549e+00,0.0,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,34.0,59666061.0,1.0,-1.0,473453255.0
630,10,0,88,332.1,1.296749e-22,0.0,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,39.0,812659840.0,2.0,-1.0,-1.0
633,10,0,116,402.4,5.869718e-03,0.0,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,40.0,955378706.0,2.0,388758120.0,301954457.0


In [15]:
# Rows where city_id_v is the -1 sentinel but university_u is positive.
df.loc[df['city_id_v'].eq(-1) & df['university_u'].gt(0)]

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age_u,city_id_u,sex_u,school_u,university_u,age_v,city_id_v,sex_v,school_v,university_v
8,10,2,66,278.4,3.636031e-02,0.000000,0.0,38.0,812659840.0,2.0,232781636.0,880545209.0,-1.0,-1.0,1.0,-1.0,-1.0
31,10,0,70,594.5,3.764216e-02,0.000000,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,-1.0,-1.0,1.0,-1.0,-1.0
43,10,5,95,78.5,4.010742e-02,0.000000,0.0,29.0,812659840.0,2.0,43061855.0,880545209.0,28.0,-1.0,2.0,681795736.0,880545209.0
53,10,0,110,164.4,1.268873e-11,0.000000,0.0,40.0,812659840.0,1.0,-1.0,29142664.0,39.0,-1.0,1.0,-1.0,-1.0
79,10,68,22,,1.735019e-04,0.000000,0.0,33.0,530650612.0,1.0,331780419.0,484443386.0,18.0,-1.0,1.0,207949403.0,856305020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541466,8589934981,128,57,86.9,3.518087e+00,3.401197,1.0,20.0,427211423.0,1.0,-1.0,121483716.0,20.0,-1.0,1.0,-1.0,121483716.0
541468,8589934981,46,38,50.3,1.559557e-03,0.000000,0.0,19.0,427211423.0,2.0,342908623.0,121483716.0,37.0,-1.0,1.0,-1.0,647159329.0
541471,8589934981,122,233,,6.436268e-02,0.000000,0.0,19.0,330463101.0,1.0,-1.0,40773103.0,15.0,-1.0,2.0,-1.0,-1.0
541475,8589934981,93,181,113.1,8.699742e+00,4.543295,1.0,19.0,-1.0,1.0,-1.0,121483716.0,20.0,-1.0,1.0,-1.0,-1.0
