# Analyze search and rating logs to produce unrated queries, and results of rated queries, for labeling



In [None]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import os

import pandas as pd

from models.rank_eval_utils import load_logs, get_log_messages

In [None]:
# Configuration: output locations, log source, and the date used to name output files.
query_dir = "../data/rank_eval/queries"              # unrated queries are written here
results_dir = "../data/rank_eval/unlabeled_results"  # results awaiting labels are written here
log_bucket = "iloveconference.logs"                  # passed to load_logs
today = f"{datetime.today():%Y-%m-%d}"               # local date, e.g. 2024-01-31

In [None]:
# Load the logs identified by `log_bucket` (presumably a storage bucket name —
# load_logs lives in models.rank_eval_utils, so its return shape is not visible here).
logs = load_logs(log_bucket)

## Analyze searches

In [None]:
# Get the "search" messages from the loaded logs and display how many there are.
# The cells below read msg["response"]["session"], ["q"] and ["results"] from each one.
searches = get_log_messages(logs, "search")
len(searches)

In [None]:
# Map each session id to its lower-cased query text.
# If a session appears in several search messages, the last one wins.
session_queries = {
    response["session"]: response["q"].lower()
    for response in (msg["response"] for msg in searches)
}
len(session_queries)

In [None]:
# Session -> query mapping as a two-column dataframe (one row per session).
# Columns are named explicitly so the frame still has "session" and "query"
# when there are no searches; a frame built from an empty list has no columns
# and would break the column selections and merges further down.
session_queries_df = pd.DataFrame(
    list(session_queries.items()),
    columns=["session", "query"],
)

In [None]:
# Get the set of distinct query strings across all sessions and display its size.
queries = set(session_queries.values())
len(queries)

In [None]:
# Collect every result returned by any search, keyed by result id.
# If the same id appears more than once, the last occurrence's text is kept.
results = {
    result["id"]: result["text"]
    for msg in searches
    for result in msg["response"]["results"]
}
# One row per result; newlines in the text are flattened to spaces.
# Columns are named explicitly so the frame keeps "result" and "text" even when
# no results were logged (otherwise the later merge on "result" would fail).
results_df = pd.DataFrame(
    [(result_id, text.replace("\n", " ")) for result_id, text in results.items()],
    columns=["result", "text"],
)
print(results_df.shape)
results_df.head(3)

## Analyze ratings

In [None]:
# Get the "rate" messages from the loaded logs and display how many there are.
# The cells below read the "session", "user", "result" and "score" fields of each one.
ratings = get_log_messages(logs, "rate")
len(ratings)

In [None]:
# Peek at the last rating message to inspect its fields
# (raises IndexError if there are no rating messages).
ratings[-1]

In [None]:
# Convert the rating messages to a dataframe so we can do some math.
# Passing `columns=` keeps only the four fields used below and, unlike selecting
# columns after construction, still yields them when `ratings` is empty.
# NOTE(review): a field absent from every message now becomes an all-NaN column
# rather than raising KeyError.
ratings_df = pd.DataFrame(ratings, columns=["session", "user", "result", "score"])
print(ratings_df.shape)
ratings_df.head(3)

In [None]:
# Total the score for each (session, user, result) combination, then turn the
# group keys back into ordinary columns.
ratings_df = (
    ratings_df
    .groupby(["session", "user", "result"])
    .agg({"score": "sum"})
    .reset_index()
)
print(ratings_df.shape)
ratings_df.head(3)

In [None]:
# Keep only the rows whose score is positive; the per-session count of these
# rows is computed in the next cell.
pos_results_df = ratings_df.loc[ratings_df["score"] > 0]
print(pos_results_df.shape)
pos_results_df.head(3)

In [None]:
# Count the positively-rated results for each (session, user) pair.
rated_sessions_df = (
    pos_results_df
    .groupby(["session", "user"])
    .size()
    .reset_index(name="count")
)
print(rated_sessions_df.shape)
rated_sessions_df.head(3)

## Find sessions with multiple ratings

In [None]:
# Keep only the (session, user) pairs with three or more positive ratings.
rated_sessions_df = rated_sessions_df.loc[rated_sessions_df["count"].ge(3)]
print(rated_sessions_df.shape)
rated_sessions_df

## Find users who have rated multiple sessions

In [None]:
# Count how many qualifying sessions each user has rated.
user_sessions_df = (
    rated_sessions_df
    .groupby(["user"])
    .size()
    .reset_index(name="count")
)
print(user_sessions_df.shape)
user_sessions_df

In [None]:
# Attach the query text to each qualifying (session, user) pair by joining
# rated_sessions_df with session_queries_df on "session", then display the
# result ordered by user to see which users have rated which queries.
user_queries_df = rated_sessions_df[["session", "user"]].merge(
    session_queries_df[["session", "query"]],
    on="session",
)
user_queries_df.sort_values("user")

In [None]:
# User ids (values of the "user" column) split into raters we expected to see
# and raters we did not; the next cell inspects sessions from the latter group.
expected_raters = {
    -22456524306,
    -15390087832,
    -7946136890,
    -7192848417,
    -4647803836,
    9431498551,
    10333592859,
    22258490670,
}
unexpected_raters = {-8626982686, 32997089357}

In [None]:
# Show the qualifying rated sessions whose user is one of the unexpected raters.
rated_sessions_df[rated_sessions_df["user"].isin(unexpected_raters)]

## Find rated queries

In [None]:
# Queries belonging to the sessions that passed the rating threshold.
# `rated_sessions` stays a list as before; membership is tested against a set
# built once, so each lookup is O(1) instead of rescanning the list per session.
rated_sessions = rated_sessions_df["session"].tolist()
rated_session_ids = set(rated_sessions)
rated_queries = {
    query
    for session, query in session_queries.items()
    if session in rated_session_ids
}
len(rated_queries)

## Write unrated queries

In [None]:
# Queries that were searched for but do not appear in any qualifying rated
# session (set difference), followed by how many there are.
unrated_queries = queries - rated_queries
len(unrated_queries)

In [None]:
# Write the unrated queries to <query_dir>/<today>.csv as a single "Query" column.
# Create the output directory first: to_csv fails if it does not exist yet.
os.makedirs(query_dir, exist_ok=True)
filename = os.path.join(query_dir, f"{today}.csv")
# Sort before writing: a set has no stable iteration order, so without this the
# row order of the file would change from run to run.
unrated_queries_df = pd.DataFrame(sorted(unrated_queries), columns=["Query"])
unrated_queries_df.to_csv(filename, index=False)

## Write rated queries with results for labeling

In [None]:
# Build the table of results to label:
#   1. start from the qualifying (session, user) pairs in rated_sessions_df,
#   2. attach the positively-rated results of each session (pos_results_df),
#   3. attach each result's text (results_df: result, text),
#   4. attach each session's query (session_queries_df: session, query),
#   5. keep (query, text, result), drop duplicate rows, sort by query then result.
# NOTE(review): step 2 joins on "session" only, so positive results from every
# user in a qualifying session are included — confirm this is intended.
unlabeled_results_df = (
    rated_sessions_df[["session", "user"]]
    .merge(pos_results_df[["session", "result"]], on="session")
    .merge(results_df, on="result")
    .merge(session_queries_df, on="session")
    .loc[:, ["query", "text", "result"]]
    .drop_duplicates()
    .sort_values(["query", "result"])
)
print(unlabeled_results_df.shape)
unlabeled_results_df.head(3)

In [None]:
# Write the unlabeled results to <results_dir>/<today>.csv.
# Create the output directory first: to_csv fails if it does not exist yet.
os.makedirs(results_dir, exist_ok=True)
filename = os.path.join(results_dir, f"{today}.csv")
unlabeled_results_df.to_csv(filename, index=False)