In [3]:
import pandas as pd
from biblib import Entry
import pybtex as pbt
import math
import numpy as np
import re
import sys
import importlib

from print_table import print_single_dataset_score_table, get_max_score, print_dataset_table

## Load data

In [19]:
path = 'bot_detection_papers.tsv'

# Read the tab-separated paper list and replace missing cells with "" so
# that later truthiness checks on row values treat blanks as "absent".
df = pd.read_csv(path, sep='\t').fillna("")

In [20]:
dataset_df_path = 'datasets.tsv'

# Read the tab-separated dataset list; missing cells become "" rather than NaN.
dataset_df = pd.read_csv(dataset_df_path, sep='\t').fillna("")


In [21]:
# Scores table that is later passed to print_single_dataset_score_table().
# NOTE(review): unlike the two relative .tsv paths above, this path points
# into the user's home directory (~/work/repo/...), so the notebook only
# runs where that checkout exists — consider making it configurable.
scores_path = '~/work/repo/bot-detection/scores.csv'
sdt_df = pd.read_csv(scores_path)


## Generate the bibliography file `methods.bib`

In [7]:
# Bibliography that add_bib_entries() fills in and that is written to
# methods.bib at the end of this cell.
# NOTE(review): the only pybtex import visible at the top is
# `import pybtex as pbt`; confirm that the `pybtex.database` submodule is
# actually loaded by the time this line runs.
bib = pbt.database.BibliographyData()

def add_bib_entries(df):
    """Add one BibTeX entry per eligible row of ``df`` to the notebook-level ``bib``.

    A row is skipped when its ``bibtex_id`` is empty or already present in
    ``bib``, or when the row has an ``analyzed?`` value that is falsy.
    Rows with a truthy ``conference?`` value become ``inproceedings``
    entries; every other row becomes an ``article`` entry.

    ``author``, ``title`` and ``year`` are always written. The optional
    fields (``booktitle`` for conference rows; ``journal``, ``volume`` and
    ``number`` for articles; ``publisher``, ``doi`` and ``pages`` for both)
    are written only when the corresponding cell is non-empty, so that no
    empty ``field = ""`` lines end up in the generated .bib file.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are read through the columns ``bibtex_id``,
        ``authors``, ``title``, ``year``, ``conference?``, ``publisher``,
        ``doi``, ``pages`` and, depending on the entry type, ``booktitle``
        or ``journal``/``volume``/``number``. ``analyzed?`` is optional.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    ValueError
        If ``year``, or a non-empty ``volume``/``number``, cannot be
        converted with ``int()``.
    """
    for row in df.to_dict(orient="records"):
        key = row['bibtex_id']

        # Skip duplicates, rows explicitly marked as not analyzed, and rows
        # that have no citation key at all.
        if key in bib.entries.keys():
            continue
        if 'analyzed?' in row and not row['analyzed?']:
            continue
        if not key:
            continue

        fields = {
            'author': row['authors'],
            'title': row['title'],
            # int() first so that a float such as 2020.0 is written as "2020".
            'year': str(int(row['year'])),
        }

        if row['conference?']:
            type_ = 'inproceedings'
            if row['booktitle']:
                fields['booktitle'] = row['booktitle']
        else:
            type_ = 'article'
            if row['journal']:
                fields['journal'] = row['journal']
            for column in ('volume', 'number'):
                if row[column]:
                    fields[column] = str(int(row[column]))

        # Fields shared by both entry types, again only when present.
        for column in ('publisher', 'doi', 'pages'):
            if row[column]:
                fields[column] = row[column]

        entry = pbt.database.Entry(type_=type_, fields=fields)
        bib.add_entry(entry=entry, key=key)

# Papers are added before datasets: add_bib_entries() skips any bibtex_id
# that is already in `bib`, so when both tables share an id the entry built
# from the papers table is the one that is kept.
add_bib_entries(df)
add_bib_entries(dataset_df)

# Write all collected entries to methods.bib.
bib.to_file("methods.bib")

## Generate table: dataset -> papers that use it

In [8]:
# Map each dataset name to the list of bibtex ids of the papers using it.
# The 'dataset(s) used' cell holds names separated by "; ". An empty cell
# splits to [''], so such papers are collected under the key ''.
dataset_dict = {}

for paper in df.to_dict(orient="records"):
    for dataset in paper['dataset(s) used'].split("; "):
        dataset_dict.setdefault(dataset, []).append(paper['bibtex_id'])

In [9]:
# Print one LaTeX table row per dataset: "<dataset> & \cite{<ids>} \\".
for dataset_name, paper_ids in dataset_dict.items():
    impl_papers = ", ".join(paper_ids)
    cite_as = '\\cite{' + impl_papers + '} \\\\'
    print(" & ".join([dataset_name, cite_as]))

midterm-2018 & \cite{ng2023botbuster, guo2022social, antenore2022a, ilias2021deep, dimitriadis2021social, giorgi2021characterizing, yang2020scalable, sayyadiharikandeh2020detection, muo2020malicious, barhate2020twitter} \\
cresci-2015 & \cite{pham2022bot2vec:, gonzalez2022the, dimitriadis2021social, muo2020malicious, stella2019influence, echeverria2018lobo, cresci2015fame} \\
twibot-2020 & \cite{feng2022heterogeneity-aware, alothali2022bot-mgat:, rovito2022an, feng2021botrgcn, geng2021satar, dehghan2018detecting} \\
rtbust-2019 & \cite{guo2022social, ilias2021deep, yang2020scalable, sayyadiharikandeh2020detection, mendoza2020malicious, muo2020malicious, nguyen2020bot, mazza2019rtbust} \\
feedback-2019 & \cite{guo2022social, ilias2021deep, yang2020scalable, sayyadiharikandeh2020detection} \\
gilani-2017 & \cite{guo2022social, ilias2021deep, dimitriadis2021social, yang2020scalable, gilani2020classification, sayyadiharikandeh2020detection, muo2020malicious, gilani2019a, echeverria2018lobo

## Generate table for dataset, #people/bots, description

In [10]:
# Take the year from the last four characters of the dataset name
# (e.g. 'twibot-2020' -> '2020'). The values are strings, not integers.
# NOTE(review): this assigns to a column named 'year', which
# add_bib_entries() also reads from the rows of dataset_df. If the table
# already had a 'year' column, it is overwritten here, and re-running the
# bibliography cell afterwards would presumably use this value — confirm.
dataset_df['year'] = dataset_df['dataset name'].str[-4:]

In [11]:
# Newest datasets first; within a year, alphabetical by dataset name.
# 'year' holds strings here, so the comparison is lexicographic.
dataset_df = dataset_df.sort_values(by=['year', 'dataset name'], ascending=[False,True])

In [12]:
# Print the LaTeX rows of the dataset table (helper imported from print_table).
# NOTE(review): benchmark_only=True presumably limits the rows to the
# benchmark datasets — confirm against print_table.
print_dataset_table(dataset_df, benchmark_only=True)


\data{twibot-2020} & \cite{feng2021twibot} & 3632 & 4646 \\
\data{feedback-2019} & \cite{yang2019arming} & 380 & 139 \\
\data{pan-2019} & \cite{rangel2015overview} & 2060 & 2060 \\
\data{rtbust-2019} & \cite{mazza2019rtbust} & 340 & 353 \\
\data{midterm-2018} & \cite{yang2020scalable} & 8092 & 42446 \\
\data{stock-2018} & \cite{cresci2019fake} & 6174 & 7102 \\
\data{cresci-2017} & \cite{cresci2017the} & 3474 & 10894 \\
\data{gilani-2017} & \cite{gilani2017classification} & 1939 & 1492 \\
\data{cresci-2015} & \cite{cresci2015fame} & 1957 & 3351 \\
\data{yang-2013} & \cite{yang2013empirical} & 10000 & 1000 \\
\data{caverlee-2011} & \cite{lee2011a} & 19276 & 22223 \\


## Generate table for dataset -> sdt/sota scores

In [14]:
# Datasets for which scores are looked up in the papers table.
dataset_names = [
    'twibot-2020', 'feedback-2019', 'rtbust-2019', 'pan-2019',
    'midterm-2018', 'stock-2018', 'cresci-2017', 'gilani-2017',
    'cresci-2015', 'yang-2013', 'caverlee-2011',
]

# dataset name -> {'accuracy': ..., 'f1': ...}, where each value is the
# result of get_max_score(df, name, metric).
score_dict = {
    name: {
        metric: get_max_score(df, name, metric)
        for metric in ('accuracy', 'f1')
    }
    for name in dataset_names
}
    

In [15]:
# Print the LaTeX rows of the score table from the per-dataset scores
# collected in score_dict and the scores table loaded into sdt_df
# (helper imported from print_table).
print_single_dataset_score_table(score_dict, sdt_df)


\data{twibot-2020} & 0.82/0.86/0.80  & 1 & \cite{feng2022heterogeneity-aware} & -0.05/-0.03 \\
\data{feedback-2019} & 0.80/0.55/0.69  & 3 & \cite{guo2022social} & -0.01/-0.15 \\
\data{rtbust-2019} & 0.71/0.73/0.71  & 4 & \cite{mazza2019rtbust} & -0.22/-0.14 \\
\data{pan-2019} & 0.92/0.91/0.92  & 2 & \cite{geng2021satar} & -0.03/-0.04 \\
\data{midterm-2018} & 0.97/0.98/0.95  & 1 & \cite{giorgi2021characterizing} & -0.01/\;\;--- \\
\data{stock-2018} & 0.80/0.83/0.80  & 3 & \;\;---\;\; & \;\;---\;\;\,/\;\;--- \\
\data{cresci-2017} & 0.98/0.98/0.97  & 1 & \cite{kudugunta2018deep} & -0.02/-0.02 \\
\data{gilani-2017} & 0.77/0.72/0.76  & 3 & \cite{gilani2020classification} & -0.09/-0.11 \\
\data{cresci-2015} & 0.98/0.98/0.98  & 3 & \cite{cresci2015fame} & -0.01/-0.01 \\
\data{yang-2013} & 0.96/0.71/0.79  & 4 & \cite{yang2013empirical} & -0.03/-0.19 \\
\data{caverlee-2011} & 0.91/0.91/0.90  & 2 & \cite{lee2011a} & -0.08/-0.07 \\
