## Imports

In [1]:
import os
import pandas as pd
import matplotlib as mp

## Load Data

In [2]:
data_dir = "/Users/hannahzhang/Desktop/Github Repos/ERSP-TeamYang/data/"

models = []
qrels = []
queries = []
collection = data_dir + 'collection.tsv' # collection file

for filename in os.listdir(data_dir):
    if filename.endswith('.trec'): # trec files
        models.append(data_dir + filename)
    elif filename.split('.')[0] == "qrels": # qrels files
        qrels.append(data_dir + filename) # queries folder
    elif filename == 'queries': 
        queries_dir = data_dir + filename + "/"
        for filename in os.listdir(queries_dir):
            queries.append(queries_dir + filename)

print(models)
print(qrels)
print(queries)
print(collection)

['/Users/hannahzhang/Desktop/Github Repos/ERSP-TeamYang/data/bm25-t5-dev.trec', '/Users/hannahzhang/Desktop/Github Repos/ERSP-TeamYang/data/splade-dev.trec']
['/Users/hannahzhang/Desktop/Github Repos/ERSP-TeamYang/data/qrels.train.tsv', '/Users/hannahzhang/Desktop/Github Repos/ERSP-TeamYang/data/qrels.dev.tsv']
['/Users/hannahzhang/Desktop/Github Repos/ERSP-TeamYang/data/queries/queries.eval.tsv', '/Users/hannahzhang/Desktop/Github Repos/ERSP-TeamYang/data/queries/queries.train.tsv', '/Users/hannahzhang/Desktop/Github Repos/ERSP-TeamYang/data/queries/queries.dev.tsv']
/Users/hannahzhang/Desktop/Github Repos/ERSP-TeamYang/data/collection.tsv


## Qrels

In [3]:
columns = []

qrels_train_df = pd.read_csv(qrels[0], sep="\t")
print(qrels_train_df)

qrels_dev_df = pd.read_csv(qrels[1], sep="\t")
print(qrels_dev_df)


        1185869  0      0.1  1
0       1185868  0       16  1
1        597651  0       49  1
2        403613  0       60  1
3       1183785  0      389  1
4        312651  0      616  1
...         ... ..      ... ..
532755    19285  0  8841362  1
532756   558837  0  4989159  1
532757   559149  0  8841547  1
532758   706678  0  8841643  1
532759   405466  0  8841735  1

[532760 rows x 4 columns]
       1102432  0  2026790  1
0      1102431  0  7066866  1
1      1102431  0  7066867  1
2      1090282  0  7066900  1
3        39449  0  7066905  1
4        76162  0  7066915  1
...        ... ..      ... ..
59267   150337  0  8009410  1
59268    22241  0  8009429  1
59269   129177  0  8009442  1
59270   190655  0  3576091  1
59271   371455  0  8009476  1

[59272 rows x 4 columns]


## Models

#### Column Descriptions
- Query_id: Id of query that system is being evaluated on
- Iteration: Iteration number (or run) of the retrieval system
  - Q0 means its the first (or only) run (e.g. batch of results) for the query
- Document_id: Id of document (or passage) returned by the retrieval system for the query
- Rank: Rank of the document for the query
- Score: Relevance score assigned by the retrieval system to the document for the query
- Run_id: Unique identifier for the run that produced the result (often used to distinguish between different models or different configurations of the same model in the testing)

### BM25

In [78]:
columns = ['query_id', 'iteration', 'document_id', 'rank', 'score', 'run_id']

bm25_df = pd.read_csv(models[0], sep='\t', names=columns)

print(bm25_df)

         query_id iteration  document_id  rank    score run_id
0         1048585        Q0      7187157     0  46.6189     R0
1         1048585        Q0      7187156     1  45.7317     R0
2         1048585        Q0      7187158     2  44.6009     R0
3         1048585        Q0      7617404     3  44.5052     R0
4         1048585        Q0      7187155     4  43.8783     R0
...           ...       ...          ...   ...      ...    ...
6979995   1048565        Q0      3922376   995  17.2643     R0
6979996   1048565        Q0       765915   996  17.2637     R0
6979997   1048565        Q0      4292320   997  17.2631     R0
6979998   1048565        Q0      1695524   998  17.2628     R0
6979999   1048565        Q0      2985688   999  17.2608     R0

[6980000 rows x 6 columns]


In [90]:
unique_queries = bm25_df['query_id'].unique()
unique_query_count = bm25_df['query_id'].nunique()

print("Unique queries: ", unique_queries)
print("Unique query count: ", unique_query_count)

Unique queries:  [1048585       2  524332 ...  968921  786375 1048565]
Unique query count:  6980


In [108]:
for i, n in enumerate(unique_queries[:5]):
    uq_df = bm25_df[bm25_df['query_id'] == unique_queries[i]] # Splice df on unique query
    unique_documents = uq_df['document_id'].unique()
    unique_document_count = uq_df['document_id'].nunique()
    print(
        f"Unique document count for Query ID {n}: ", unique_document_count,
        f"\n5 Unique documents for Query ID {n}: ", unique_documents[:5]
        )
    

Unique document count for Query ID 1048585:  1000 
5 Unique documents for Query ID 1048585:  [7187157 7187156 7187158 7617404 7187155]
Unique document count for Query ID 2:  1000 
5 Unique documents for Query ID 2:  [5262269 1304571 5881917 3857958 6947077]
Unique document count for Query ID 524332:  1000 
5 Unique documents for Query ID 524332:  [1518543 1512632 1966060 1194314 1518541]
Unique document count for Query ID 1048642:  1000 
5 Unique documents for Query ID 1048642:  [ 671694 8621225  906008 8041183 4734246]
Unique document count for Query ID 524447:  1000 
5 Unique documents for Query ID 524447:  [3541560 8454456 3541558 3836580 6856971]


### SPLADE

In [81]:
columns = ['query_id', 'iteration', 'document_id', 'rank', 'score', 'run_id']

splade_df = pd.read_csv(models[1], sep='\t', names=columns)

print(splade_df)

         query_id iteration  document_id  rank   score run_id
0         1048585        Q0      7187155     0  104472     R0
1         1048585        Q0      7187160     1  100811     R0
2         1048585        Q0      7187157     2   99206     R0
3         1048585        Q0      7187158     3   98698     R0
4         1048585        Q0      3100835     4   86255     R0
...           ...       ...          ...   ...     ...    ...
6979995   1048565        Q0      4838288   995   66246     R0
6979996   1048565        Q0      2133477   996   66245     R0
6979997   1048565        Q0      5753707   997   66239     R0
6979998   1048565        Q0      1472257   998   66238     R0
6979999   1048565        Q0      5637117   999   66238     R0

[6980000 rows x 6 columns]


In [84]:
print("Unique query count: ", bm25_df['query_id'].nunique())

Unique query count:  6980


## Named Entity Recognition (NER)

In [138]:
collection_df = pd.read_csv(collection, sep='\t', names=['query'])

In [161]:
collection_df.head()

Unnamed: 0,query
0,The presence of communication amid scientific ...
1,The Manhattan Project and its atomic bomb help...
2,Essay on The Manhattan Project - The Manhattan...
3,The Manhattan Project was the name for a proje...
4,versions of each volume as well as complementa...


In [160]:
collection_df.shape

(8841823, 1)

In [189]:
import spacy
import pandas as pd

# Load the spaCy pretrained model
nlp = spacy.load("en_core_web_sm")

named_entities = []

# Loop over the first 10 rows
for i in range(500):
    # Get the text from the 'query' column of the dataset
    text = collection_df['query'][i]

    # Process the text using spaCy
    doc = nlp(text)

    # Extract named entities from the text
    ents = [(ent.text, ent.label_) for ent in doc.ents]  # List of (entity, label) tuples
    
    # Append the query and its corresponding entities as a tuple
    named_entities.append({'query': text, 'entities': ents})

# Convert the list into a DataFrame
entities_df = pd.DataFrame(named_entities)

# Show the DataFrame
print(entities_df.iloc[0, 0])
print(entities_df.iloc[0, 1])

## If contains 1 entity cateogory, may be useful for sparse learning as hold more distinguishable information. 
## If contains multiple, might be better to use sparse learning.

## binary easier, use weights for normalize frequency of query category, PCA feature extraction first

## mrr to go through query categories (to determine whether the classification is accurate)


The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.
[('the Manhattan Project', 'ORG'), ('hundreds of thousands', 'CARDINAL')]


In [183]:
from collections import Counter

# Flatten the list of entities and count their occurrences
all_entities = [ent[0] for sublist in entities_df['entities'] for ent in sublist]
entity_counts = Counter(all_entities)

# Print the most common entities
print(entity_counts.most_common(10))


[('1', 44), ('first', 40), ('two', 32), ('2', 27), ('one', 25), ('China', 22), ('the United States', 21), ('Latrobe', 20), ('annual', 18), ('Scottish', 17)]


In [186]:
# Filter queries that mention person
person_queries = entities_df[entities_df['entities'].apply(lambda x: any(ent[1] == 'PERSON' for ent in x))]
print(len(person_queries))

165
