# Create Unlabeled Results
This notebook takes one or more CSV files of query results created while evaluating different embedding models and subtracts the results that have already been labeled, producing a new CSV file of unlabeled results to label.

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import json
import os

import pandas as pd

In [None]:
# Directory that holds the per-model query-result CSV files.
input_dir = "../data/rank_eval/unlabeled_results"

# Result files whose rows are candidates for labeling.
unlabeled_filenames = [
    "all-mpnet-base-v2-2023-04-29.csv",
    "embed-english-v2.0-2023-04-29.csv",
]

# Result files whose rows are treated as already labeled.
labeled_filenames = ["text-embedding-ada-002-2023-04-29.csv"]

# Current local date formatted as YYYY-MM-DD.
today = f"{datetime.today():%Y-%m-%d}"

## Read raw unlabeled and labeled results

In [None]:
# Load every unlabeled result file and stack the frames into one DataFrame.
unlabeled_dfs = [
    pd.read_csv(os.path.join(input_dir, name)) for name in unlabeled_filenames
]
unlabeled_df = pd.concat(unlabeled_dfs)

# Load the labeled result files the same way (read from the same directory).
labeled_dfs = [
    pd.read_csv(os.path.join(input_dir, name)) for name in labeled_filenames
]
labeled_df = pd.concat(labeled_dfs)

In [None]:
# Show (rows, columns) of the combined unlabeled results, then preview three rows.
print(unlabeled_df.shape)
unlabeled_df.head(n=3)

In [None]:
# Show (rows, columns) of the combined labeled results, then preview three rows.
print(labeled_df.shape)
labeled_df.head(n=3)

## Remove duplicates

In [None]:
# A row is identified by its (query, result) pair; keep the first occurrence
# of each pair and report the remaining (rows, columns).
unlabeled_df = unlabeled_df.drop_duplicates(subset=['query', 'result'], keep='first')
print(unlabeled_df.shape)

In [None]:
# A row is identified by its (query, result) pair; keep the first occurrence
# of each pair and report the remaining (rows, columns).
labeled_df = labeled_df.drop_duplicates(subset=['query', 'result'], keep='first')
print(labeled_df.shape)

## Subtract labeled results from unlabeled results

In [None]:
# Anti-join: keep the unlabeled rows whose (query, result) pair does not occur
# in the labeled results.
#
# The join key is given explicitly so that it matches the key used for
# de-duplication. Without `on=`, pandas joins on every column the two frames
# share, so an already-labeled pair that differs in any other shared column
# would be reported as unlabeled.
key_columns = ['query', 'result']

# Only the de-duplicated key columns of the labeled frame take part in the
# merge: this prevents suffixed duplicate columns and row multiplication.
labeled_keys = labeled_df[key_columns].drop_duplicates()

df = pd.merge(
    unlabeled_df,
    labeled_keys,
    on=key_columns,
    how='left',
    indicator=True,
)
# `_merge` is 'left_only' for rows that found no match in the labeled keys.
df = df[df['_merge'] == 'left_only']
df = df[['query', 'text', 'result']]
print(df.shape)

## Save unlabeled results

In [None]:
# Order the remaining rows by query, then by result, and preview five rows.
df = df.sort_values(by=['query', 'result'])
df.head(n=5)

In [None]:
# Write the unlabeled results next to the input files, in a CSV named after
# today's date; the DataFrame index is not written.
output_path = os.path.join(input_dir, f"{today}.csv")
df.to_csv(output_path, index=False)