In [1]:
import pandas as pd
import os.path

# Root folder of the raw datasets. It can be overridden with the DATASET_FOLDER
# environment variable so the notebook also runs on machines other than the
# author's; the default keeps the original location.
DATASET_FOLDER = os.environ.get('DATASET_FOLDER', '/Users/jilljenn/Desktop/datasets/')

In [2]:
# Ground truth: one tab-separated (item_id, truth) pair per line, no header row.
truth = pd.read_csv(
    os.path.join(DATASET_FOLDER, 'duck/DUCKgt.txt'),
    sep='\t',
    names=('item_id', 'truth'),
)

In [3]:
# Worker answers: one space-separated (item_id, worker, label) triple per line,
# no header row.
people = pd.read_csv(
    os.path.join(DATASET_FOLDER, 'duck/DUCKlabel.txt'),
    sep=' ',
    names=('item_id', 'worker', 'label'),
)

In [4]:
# Preview the first five worker answers.
people.head(5)

Unnamed: 0,item_id,worker,label
0,36618,896,0
1,11619,896,1
2,36620,896,1
3,36621,896,1
4,36622,896,1


In [5]:
# Inner-join every worker answer with the ground truth of its item.
df = pd.merge(people, truth, on='item_id')

In [6]:
# Preview the first five joined rows.
df.head(5)

Unnamed: 0,item_id,worker,label,truth
0,36618,896,0,0
1,36618,866,1,0
2,36618,39,0,0
3,36618,175,1,0
4,36618,1721,1,0


In [7]:
import numpy as np

# 1 when the worker's label matches the ground truth of the item, 0 otherwise.
df['correct'] = df['label'].eq(df['truth']).astype(np.int32)

In [8]:
# Re-index raw item and worker identifiers as consecutive integers starting at
# 0, numbered in order of first appearance in `df`.
encode = {}
for feature in ('item_id', 'worker'):
    # enumerate() numbers every distinct value; the previous zip against
    # range(10000) silently dropped any value past the 10000th, which then
    # mapped to NaN below.
    encode[feature] = {raw: code for code, raw in enumerate(df[feature].unique())}
df['item'] = df['item_id'].map(encode['item_id'])
df['user'] = df['worker'].map(encode['worker'])

In [9]:
# Preview the frame with the new correct / item / user columns.
df.head(5)

Unnamed: 0,item_id,worker,label,truth,correct,item,user
0,36618,896,0,0,1,0,0
1,36618,866,1,0,0,0,1
2,36618,39,0,0,1,0,2
3,36618,175,1,0,0,0,3
4,36618,1721,1,0,0,0,4


In [10]:
# Save dataset into the KTM folder: only the encoded (user, item, correct)
# triples, without the row index. `index` is documented as a boolean, so pass
# False explicitly instead of relying on None being treated as falsy.
df[['user', 'item', 'correct']].to_csv('/Users/jilljenn/code/ktm/data/duck/data.csv', index=False)

In [11]:
# Save config into the KTM folder
import yaml

# The encoded ids start at 0, so the largest id plus one is the number of
# distinct users / items written to the config.
with open('/Users/jilljenn/code/ktm/data/duck/config.yml', 'w') as config_file:
    yaml.safe_dump(
        dict(
            nb_users=int(df['user'].max()) + 1,
            nb_items=int(df['item'].max()) + 1,
        ),
        config_file,
    )

## Subsets

In [12]:
# Build one evaluation frame per sub-golden split. Items listed in the split
# file are the known (training) items; every other row of `df` is held out and
# annotated with the predictions exported by KTM.
test = {}
for index in range(1):
    subset = pd.read_csv(
        os.path.join(DATASET_FOLDER, 'duck/subgolden/DUCKsubgoldengt.{}.txt'.format(index)),
        sep='\t', names=('item', 'truth'))
    known_items = set(subset['item'])
    i_train = df.query('item_id in @known_items').index
    # Take an explicit copy: assigning new columns to the frame returned by
    # query() is what raised SettingWithCopyWarning.
    held_out = df.query('item_id not in @known_items').copy()
    i_test = held_out.index
    # Save indices for train and test into the KTM folder
    # np.save('/Users/jilljenn/code/ktm/data/duck/i_train{}.npy'.format(index), i_train)
    # np.save('/Users/jilljenn/code/ktm/data/duck/i_test{}.npy'.format(index), i_test)
    # Load predictions from KTM
    # NOTE(review): presumably one value per held-out row, in i_test order, and
    # presumably the predicted probability of `correct` -- confirm against KTM.
    held_out['pred'] = np.load('/Users/jilljenn/code/ktm/data/duck/y_pred{}.npy'.format(index))
    # Distance of the prediction from the undecided value 0.5.
    held_out['confidence'] = (held_out['pred'] - 0.5).abs()
    held_out['round_pred'] = np.round(held_out['pred'])
    # Constant weight, so that summing it counts rows.
    held_out['one'] = 1
    # For 0/1 values: 1.0 when the rounded prediction equals the worker's label,
    # 0.0 when they differ.
    held_out['recommend'] = (
        (1 - held_out['round_pred']) * (1 - held_out['label'])
        + held_out['round_pred'] * held_out['label']
    )
    test[index] = held_out
test[0].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.ht

Unnamed: 0,item_id,worker,label,truth,correct,item,user,pred,confidence,round_pred,one,recommend
0,36618,896,0,0,1,0,0,0.432809,0.067191,0.0,1,1.0
1,36618,866,1,0,0,0,1,0.535854,0.035854,1.0,1,1.0
2,36618,39,0,0,1,0,2,0.825434,0.325434,1.0,1,0.0
3,36618,175,1,0,0,0,3,0.513286,0.013286,1.0,1,1.0
4,36618,1721,1,0,0,0,4,0.279498,0.220502,0.0,1,0.0


In [13]:
# Sum every column of the first split per (item, recommended answer) pair.
scores = (
    test[0]
    .groupby(['item_id', 'recommend'])
    .sum()
)

In [14]:
# Per item, the largest summed value over its candidate answers; the three
# columns used for the decision get a 'max_sum_' prefix.
max_sum = (
    scores
    .groupby('item_id')
    .max()
    .rename(columns={
        'pred': 'max_sum_pred',
        'confidence': 'max_sum_confidence',
        'one': 'max_sum_one',
    })
)

In [15]:
# Preview the per-item maxima of the three decision columns.
max_sum[['max_sum_pred', 'max_sum_confidence', 'max_sum_one']].head(5)

Unnamed: 0_level_0,max_sum_pred,max_sum_confidence,max_sum_one
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11573,16.397921,5.338517,23
11575,16.411759,4.908451,25
11577,14.824428,4.379044,22
11578,12.499755,4.124737,20
11579,20.92967,5.65015,33


In [16]:
# Put the per-item maxima next to each (item, answer) row. Columns left by a
# previous run of this cell are dropped first: joining again would otherwise
# raise on the overlapping column names, making the cell non-re-runnable.
max_sum_columns = ['max_sum_pred', 'max_sum_confidence', 'max_sum_one']
scores = (
    scores
    .drop(columns=max_sum_columns, errors='ignore')
    .join(max_sum[max_sum_columns], on='item_id')
)

In [17]:
# Flag, for each aggregation rule, the answer whose summed value reaches the
# item's maximum (on a tie, every tied answer of the item is flagged).
# A tuple is iterated instead of a set so that the new columns are created in
# the same order on every kernel restart.
for feature in ('pred', 'confidence', 'one'):
    scores['{}_choice'.format(feature)] = scores[feature] == scores['max_sum_{}'.format(feature)]

In [18]:
# Preview the summed scores together with the per-rule choice flags.
scores.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,worker,label,truth,correct,item,user,pred,confidence,round_pred,one,max_sum_pred,max_sum_confidence,max_sum_one,pred_choice,confidence_choice,one_choice
item_id,recommend,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
11573,0.0,23049,6,16,6,400,272,9.115336,1.901802,10.0,16,16.397921,5.338517,23,False,False,False
11573,1.0,34585,21,23,21,575,469,16.397921,5.338517,21.0,23,16.397921,5.338517,23,True,True,True
11575,0.0,21319,5,14,5,378,299,7.749337,1.515753,9.0,14,16.411759,4.908451,25,False,False,False
11575,1.0,36315,21,25,21,675,442,16.411759,4.908451,21.0,25,16.411759,4.908451,25,True,True,True
11577,0.0,32756,3,22,3,1210,445,14.824428,4.379044,19.0,22,14.824428,4.379044,22,True,True,True


In [19]:
# Attach to `truth`, for each aggregation rule, the answer that rule selects
# per item. NOTE: this rebinds `truth`; because the merge is an inner join,
# only items that have a selected answer remain.
# A tuple (not a set) fixes the order of the added columns across kernel
# restarts.
for feature in ('pred', 'confidence', 'one'):
    choice = (
        scores.query('{}_choice == True'.format(feature))
        .reset_index()[['item_id', 'recommend']]
        # On a tie both answers of an item are flagged; keep one row per item
        # (the first in `scores` order) so the merge cannot duplicate the item
        # and count it twice in the metrics.
        .drop_duplicates(subset='item_id')
        .rename(columns={'recommend': feature})
    )
    # Dropping a column left by a previous run keeps the cell re-runnable
    # (otherwise the merge would produce '<feature>_x' / '<feature>_y').
    truth = truth.drop(columns=feature, errors='ignore').merge(choice, on='item_id')

In [20]:
# Preview the ground truth next to the answer chosen by each rule.
truth.head(5)

Unnamed: 0,item_id,truth,pred,confidence,one
0,36618,0,0.0,0.0,0.0
1,11619,1,1.0,1.0,1.0
2,36622,0,0.0,0.0,0.0
3,36623,0,0.0,0.0,0.0
4,36624,0,0.0,0.0,0.0


In [21]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Compare the answer chosen by each aggregation rule with the ground truth.
# A tuple (not a set) makes the report order identical on every run.
# Both metrics receive the chosen 0/1 answers, so the AUC is computed on hard
# decisions rather than on continuous scores.
for feature in ('pred', 'confidence', 'one'):
    print('acc', feature, '=', accuracy_score(truth['truth'], truth[feature]))
    print('auc', feature, '=', roc_auc_score(truth['truth'], truth[feature]))

acc pred = 0.7901234567901234
auc pred = 0.7778457772337821
acc confidence = 0.8024691358024691
auc confidence = 0.7925336597307221
acc one = 0.7654320987654321
auc one = 0.7515299877600978
