## Transform `pd.DataFrame`-runs to `run.txt`

In [2]:
import os
import gzip
from pathlib import Path
import pandas as pd

In [3]:
# Monthly submission CSVs to convert: February (02) through August (08) 2023.
runs = [f"submit_run_2023-{month:02d}.csv" for month in range(2, 9)]

In [4]:
# Fixed prefix and suffix that surround the sub-collection tag in each run file name.
l_strip, r_strip = "submit_run_", ".csv"
# Prefix and suffix concatenated into a single string.
all_strip = "".join((l_strip, r_strip))

# Directory under which one sub-directory per run is created.
BASE_PATH = "submissions/"

In [None]:
# Column order of one line in the TREC-EVAL run file; used below to lay out
# the output frame.
trec_cols = ["qid", "placeholder", "docno", "rank", "score", "name"]

for run in runs:
    # Derive the sub-collection tag (e.g. "2023-02") from the file name.
    # NOTE: str.strip(chars) removes any of the given *characters* from both
    # ends, not a prefix/suffix, so it can eat into the tag itself. Remove the
    # exact prefix and suffix instead.
    sub_collection = run
    if l_strip and sub_collection.startswith(l_strip):
        sub_collection = sub_collection[len(l_strip):]
    if r_strip and sub_collection.endswith(r_strip):
        sub_collection = sub_collection[:-len(r_strip)]

    # Create the output directory <BASE_PATH><tag>/ (no error if it exists).
    sub_path = BASE_PATH + sub_collection
    sub_run_file = sub_path + "/run.txt"
    Path(sub_path).mkdir(parents=True, exist_ok=True)

    # Transform DataFrame to TREC-EVAL format.
    # A missing input column raises KeyError here, before anything is written.
    df = pd.read_csv(run)
    df_trec = pd.DataFrame(
        {
            "qid": df["qid"],
            "placeholder": "Q0",  # constant second column of the run file
            "docno": df["docno"],
            "rank": df["rank"],
            "score": df["score"],
            "name": sub_collection,  # run tag: the sub-collection name
        },
        columns=trec_cols,
    )

    # Quick visual check; pandas truncates the printed repr of large frames.
    print(df_trec)

    # Space-separated, no header and no index column.
    df_trec.to_csv(sub_run_file, index=False, header=False, sep=" ")
    

           qid placeholder       docno  rank      score     name
0          928          Q0  doc3185247     0  26.156443  2023-02
1          928          Q0  doc3374387     1  23.111432  2023-02
2          928          Q0  doc1337050     2  23.111432  2023-02
3          928          Q0  doc3290627     3  23.004051  2023-02
4          928          Q0  doc2650408     4  22.809234  2023-02
...        ...         ...         ...   ...        ...      ...
14084597  2550          Q0  doc2506299   995   0.298159  2023-02
14084598  2550          Q0  doc3142009   996   0.182743  2023-02
14084599  2550          Q0  doc3364138   997   0.169984  2023-02
14084600  2550          Q0    doc59840   998   0.120606  2023-02
14084601  2550          Q0  doc3030902   999   0.108006  2023-02

[14084602 rows x 6 columns]
          qid placeholder    docno  rank      score     name
0          99          Q0   245518     0  25.195218  2023-03
1          99          Q0  1614409     1  24.540958  2023-03
2       

## Transform `run.txt` to `run.txt.gz`

Writing the gzip file directly from `df.to_csv()` (the keyword is `compression="gzip"`; `compress=` is not a valid argument) seems to yield errors with the current format.<br>
As a workaround, the gzip compression is applied after the raw text files have been saved.<br>
On macOS and Linux distributions this can be done with a single shell command.<br>

*Code*

The command searches the current directory and all of its subdirectories for files named `run.txt`.<br>
Every file found is then compressed to its respective `run.txt.gz` file (gzip replaces the original `run.txt`).

`find . -name "run.txt" -type f -exec gzip {} +`