In [None]:
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from users.users import Users
import csv
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Input/output locations. All are empty placeholders and must be filled in
# before the cells below can run.
# CSV read into `df`; the 'id' and 'title' columns are selected from it.
crawled_path = ''
# CSV read into `df_test`.
test_path = ''
# CSV read into `df_train`.
train_path = ''
# Not referenced by any of the cells visible in this notebook.
root_path = ''
# File passed to `np.load` to obtain the author ids that get evaluated.
test_authors_path = ''
# CSV that receives the header row and the per-author, per-k result rows.
path_out = ''

In [None]:
# Only the 'id' and 'title' columns are kept from this file, so restrict parsing
# to them instead of loading every column into memory first.
df = pd.read_csv(crawled_path, usecols=['id', 'title'])

In [None]:
# Keep just the article id and its title, in that column order.
df = df.loc[:, ['id', 'title']]

In [None]:
# Test interactions and the distinct authors that occur in them.
df_test = pd.read_csv(test_path)
df_test_author_ids = df_test['author_id'].unique()

# Training interactions are read from `train_path`. The line below is the earlier
# hard-coded alternative and stays disabled.
# df_train = pd.read_csv('/mnt/data/vikuen/data/guardian/train-set_all.csv')
df_train = pd.read_csv(train_path)

# Distinct training authors that also occur in the test set.
is_test_author = df_train['author_id'].isin(df_test_author_ids)
author_ids = df_train.loc[is_test_author, 'author_id'].unique()

In [None]:
# Distinct articles touched by test authors in the training data ...
train_article_ids = df_train.loc[
    df_train['author_id'].isin(df_test_author_ids), 'article_id'
].unique()
# ... and every distinct article that occurs in the test data.
test_article_ids = df_test.loc[:, 'article_id'].unique()

In [None]:
# Filter the crawled articles once and read both columns from the same rows, so
# the title list and the id list share one row order by construction.
train_articles = df[df['id'].isin(train_article_ids)]
article_titles_train = train_articles['title'].tolist()
article_titles_train_ids = train_articles['id'].tolist()

In [None]:
# Filter the crawled articles once and read both columns from the same rows, so
# the title list and the id list share one row order by construction.
test_articles = df[df['id'].isin(test_article_ids)]
article_titles_test = test_articles['title'].tolist()
article_titles_test_ids = test_articles['id'].tolist()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model over article titles; vectorizer settings go here.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Learn the vocabulary and IDF weights from the training titles and vectorize
# them in the same pass (fit_transform is equivalent to fit followed by
# transform on the same documents, without tokenizing the corpus twice).
tfidf_train_vectors = tfidf_vectorizer.fit_transform(article_titles_train)
# Test titles are projected onto the training vocabulary only.
tfidf_test_vectors = tfidf_vectorizer.transform(article_titles_test)

In [None]:
import numpy as np


In [None]:
# Lookup table comment_id -> article_id for the test interactions. The
# comment_id column is kept alongside the index (drop=False).
df_test_comment_to_article = (
    df_test[['article_id', 'comment_id']]
    .set_index('comment_id', drop=False)
)
test_comment_to_article_dict = df_test_comment_to_article['article_id'].to_dict()

In [None]:
# Training interactions restricted to authors that occur in the test set.
df_representation = df_train.loc[df_train['author_id'].isin(df_test_author_ids)]

# One row per author holding the sorted array of distinct article ids they touched.
df_train_user_articles = (
    df_representation
    .groupby('author_id')['article_id']
    .apply(np.unique)
    .reset_index()
)

In [None]:
users = Users()

# Authors to evaluate, loaded from disk. This rebinds `author_ids`, replacing
# the value derived from the training data earlier in the notebook.
author_ids = np.load(test_authors_path)
print('All Authors: ', len(author_ids))
# NOTE(review): this prints the same count as the line above -- no filtering for
# negative samples happens in this cell. Confirm whether a filter is missing.
print('Authors with negative samples: ', len(author_ids))

# author_ids = np.array_split(author_ids, partitions)
# print(f'Number of parititions: {len(author_ids)}')
# author_ids = author_ids[n_partition]

# Cap on positive test samples per author, and negatives requested per positive.
MAX_POSITIVE_SAMPLES_USER = 10
NUM_NEGATIVE_PER_POSITIVE = 50

# (Re)create the results file with its header. The header lists eight columns,
# matching the eight values written per result row; 'AP' and 'documents' were
# previously fused into one 'APdocuments' column by a missing comma.
# newline='' is what the csv module expects for files it writes to.
with open(path_out, mode='w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['author_id', 'k', 'hits_at_k', 'interacted_count', 'precision', 'recall', 'AP', 'documents'])

# Ranking cutoffs at which the metrics are reported.
K = [1, 3, 5, 10]


In [None]:
# Map each article id to its row position in the matching TF-IDF matrix
# (for a repeated id the last position wins).
train_article_pos_dict = dict(
    zip(article_titles_train_ids, range(len(article_titles_train_ids)))
)
test_article_pos_dict = dict(
    zip(article_titles_test_ids, range(len(article_titles_test_ids)))
)

In [None]:
import numpy.ma as ma

def get_user_representation(user_id):
    """Return the mean TF-IDF title vector of the articles an author touched in training.

    Parameters
    ----------
    user_id
        Author id, matched against the 'author_id' column of
        `df_train_user_articles`.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, n_features), where n_features is the width of
        `tfidf_train_vectors`. An all-zero row is returned when the author has
        no training row or none of their articles has a TF-IDF row, mirroring
        the zero vector `get_comment_section_representation` uses for empty input.
    """
    n_features = tfidf_train_vectors.shape[1]

    matches = df_train_user_articles.loc[
        df_train_user_articles['author_id'] == user_id, 'article_id'
    ]
    if matches.empty:
        # Unknown author: previously an IndexError from `.iloc[0]`.
        return np.zeros((1, n_features))

    # Articles without a crawled title have no TF-IDF row and are skipped.
    positions = [
        train_article_pos_dict[article]
        for article in matches.iloc[0]
        if article in train_article_pos_dict
    ]
    if not positions:
        # Averaging zero rows of a sparse matrix divides by zero.
        return np.zeros((1, n_features))

    # The sparse mean yields an np.matrix; hand back a plain 2-D ndarray instead,
    # since recent scikit-learn versions refuse np.matrix inputs.
    return np.asarray(tfidf_train_vectors[positions].mean(axis=0)).reshape(1, -1)

In [None]:
from numpy import array, ravel

In [None]:
def get_comment_section_representation(comment_section_ids):
    """Return the TF-IDF title vector of the article a comment section belongs to.

    Parameters
    ----------
    comment_section_ids
        Sequence of comment ids. Only the first id is used: it is looked up in
        `test_comment_to_article_dict` to find the article.

    Returns
    -------
    numpy.ndarray
        Dense array of shape (1, n_features), where n_features is the width of
        `tfidf_test_vectors`. An all-zero row is returned for an empty sequence.

    Raises
    ------
    KeyError
        If the first comment id, or the article it maps to, is not present in
        the lookup dictionaries.
    """
    if len(comment_section_ids) == 0:
        # Take the width from the fitted matrix rather than a hard-coded
        # vocabulary size, so the zero row always matches the real vectors.
        return np.zeros((1, tfidf_test_vectors.shape[1]))

    article_id = test_comment_to_article_dict[comment_section_ids[0]]
    # toarray() gives a plain ndarray, the same type as the zero-vector branch.
    return tfidf_test_vectors[test_article_pos_dict[article_id]].toarray()

In [None]:
def evaluate(interacted_items_count_testset, hits, k):
    """Compute precision@k and recall@k from a hit count.

    Parameters
    ----------
    interacted_items_count_testset
        Number of relevant (positive) items available for the author.
    hits
        Number of relevant items found within the top k.
    k
        Ranking cutoff.

    Returns
    -------
    tuple
        `(interacted_items_count_testset, precision, recall)`. The count is
        passed through unchanged. A zero denominator yields 0.0 for the
        affected metric instead of raising ZeroDivisionError.
    """
    precision = hits / k if k else 0.0
    recall = hits / interacted_items_count_testset if interacted_items_count_testset else 0.0
    return interacted_items_count_testset, precision, recall

In [None]:
def print_progress(index):
    """Print the processed fraction of `author_ids` on every 200th index."""
    if index % 200 != 0:
        return
    fraction_done = index / len(author_ids)
    print(fraction_done)

def _similarities_to(author_vector, sections):
    """Cosine similarity between one author vector and each comment section.

    `author_vector` is a (1, n_features) array. Returns a list with one
    similarity per entry of `sections`, or an empty list when there are none.
    """
    rows = [np.asarray(get_comment_section_representation(section)) for section in sections]
    if not rows:
        return []
    # Each representation is a (1, n_features) row; stack them into (n, n_features).
    return list(cosine_similarity(author_vector, np.vstack(rows))[0])


for index, author in enumerate(author_ids):
    print_progress(index)

    truth_sections = users.get_positive_test_samples(author)[:MAX_POSITIVE_SAMPLES_USER]
    interacted_items_count_testset = len(truth_sections)
    if interacted_items_count_testset == 0:
        # No positives: there is nothing to rank and recall is undefined, so
        # skip the author instead of failing on an empty representation array.
        continue
    false_sections = users.get_negative_test_samples(author, len(truth_sections), NUM_NEGATIVE_PER_POSITIVE)

    # Computed once per author; np.asarray guards against an np.matrix result,
    # which recent scikit-learn versions reject.
    author_vector = np.asarray(get_user_representation(author)).reshape(1, -1)
    truth_sim = _similarities_to(author_vector, truth_sections)
    false_sim = _similarities_to(author_vector, false_sections)

    # Rank positives (flag=1) and negatives (flag=0) together by similarity and
    # keep only as many candidates as the largest cutoff needs.
    ranked = pd.concat([
        pd.DataFrame({'v': truth_sim, 'flag': 1}),
        pd.DataFrame({'v': false_sim, 'flag': 0}),
    ]).sort_values(by='v', ascending=False).head(max(K))
    y_true = ranked['flag'].tolist()

    # Open the results file once per author rather than once per cutoff.
    with open(path_out, 'a', encoding='utf-8', newline='') as results_file:
        writer = csv.writer(results_file)
        for k in K:
            top_k = y_true[:k]
            hits_at_k = sum(top_k)
            interacted_items_count_testset, precision, recall = evaluate(interacted_items_count_testset, hits_at_k, k)
            # NOTE(review): this averages precision@i over every rank i <= k,
            # not only over ranks holding a positive as in the textbook AP
            # definition. Left unchanged so existing results stay comparable;
            # confirm this is the intended metric.
            precisions_at = [sum(top_k[:i + 1]) / (i + 1) for i in range(len(top_k))]
            AP_at_k = sum(precisions_at) / k
            # Columns: author_id, k, hits_at_k, interacted_count, precision,
            # recall, AP, number of ranked documents kept.
            writer.writerow([author, k, hits_at_k, interacted_items_count_testset, precision, recall, round(AP_at_k, 4), len(ranked)])

In [None]:
# Inspection cell: shows the positive-sample similarities left over from the
# last author the evaluation loop computed them for.
truth_sim

In [None]:
# Inspection cell: (number of test titles, vocabulary size) of the test TF-IDF matrix.
tfidf_test_vectors.shape