In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
## Load the snippet/claim data set that every later cell works from
df_snippets = pd.read_csv(filepath_or_buffer='data/00_dataset.csv')

In [4]:
## Display the column labels of the loaded data set
df_snippets.columns

Index(['claim_id', 'snippet_content', 'snippet_date', 'snippet_id',
       'snippet_pagenum', 'snippet_title', 'snippet_url', 'claim_content',
       'claim_date', 'claim_label', 'claim_tag', 'claim_url', 'date_number'],
      dtype='object')

In [4]:
## Preview the first two rows of the data set
df_snippets.head(n=2)

Unnamed: 0,claim_id,snippet_content,snippet_date,snippet_id,snippet_pagenum,snippet_title,snippet_url,claim_content,claim_date,claim_label,claim_tag,claim_url,date_number
0,3,"News on Japan, Business News, Opinion, Sports,...",17636,0,0,Article expired | The Japan Times,https://www.japantimes.co.jp/article-expired/,Black and Latino people in NYC are arrested at...,17646,True,— PolitiFact New York,/new-york/statements/2018/apr/25/kirsten-gilli...,0
1,3,Get the latest breaking news across the U.S. o...,17636,1,0,"U.S. News | Latest National News, Videos …",https://abcnews.go.com/US/,Black and Latino people in NYC are arrested at...,17646,True,— PolitiFact New York,/new-york/statements/2018/apr/25/kirsten-gilli...,0


In [6]:
## Number of snippets per (claim, publication date) pair: the non-null snippet ids
## of each group are counted, then the grouping keys become regular columns again
df_snippets_grouped = (
    df_snippets[['claim_id', 'snippet_id', 'snippet_date']]
    .groupby(by=['claim_id', 'snippet_date'])
    .count()
    .reset_index()
)

## Rank the pairs by decreasing snippet count (the count is stored in the
## 'snippet_id' column) and keep the 10 pairs with the most snippets
df_snippets_sorted = df_snippets_grouped.sort_values(by='snippet_id', ascending=False)
df_snippets_head = df_snippets_sorted.iloc[:10]

## Fetch the snippet and claim details of every snippet belonging to one of
## those top 10 (claim, publication date) pairs
df_snippets_merge = df_snippets_head[['claim_id', 'snippet_date']].merge(
    df_snippets[['claim_id', 'snippet_date', 'snippet_id', 'claim_content', 'snippet_content']],
    how='inner',
    on=['claim_id', 'snippet_date'],
)

In [92]:
## Write the contents of the selected snippets and claims to a temporary file
## that is then annotated by hand.
##
## The file is opened in exclusive-creation mode ('x'): if it already exists
## the write is skipped, so re-running this cell (or "Run All") can never
## overwrite annotations that were already entered manually. Delete the file
## first if it really has to be regenerated.
try:
    ## newline='' and utf-8 reproduce what to_csv does when it is given a path
    with open('data/01_temp.csv', 'x', newline='', encoding='utf-8') as temp_file:
        df_snippets_merge[['snippet_id','snippet_content','claim_content']].to_csv(temp_file, index=False)
except FileExistsError:
    print('data/01_temp.csv already exists: keeping it to preserve the manual annotations')

## The file is then modified by hand:
## - A new column is added: groundtruth_label
## - For each claim/snippet pair, we decide whether the snippet is
##   relevant or not (the groundtruth value is set to true or false)

In [93]:
## Read back the file once it has been annotated by hand
df_recup = pd.read_csv(filepath_or_buffer='data/01_temp.csv')

In [108]:
## Attach the hand-made groundtruth_label values to the df_snippets_merge rows.
## 'snippet_id' is the only column the two frames share, so it is the join key.
## NOTE(review): this presumes snippet_id identifies a snippet uniquely across
## claims; if ids repeat, the join multiplies rows - TODO confirm
df_groundtruth_merge = (
    df_recup[['snippet_id', 'groundtruth_label']]
    .merge(df_snippets_merge, how='inner', on='snippet_id')
)

In [111]:
## Save the ground truth data set as a csv file (row index left out)
df_groundtruth_merge.to_csv(
    path_or_buf='data/01_relevance_discovery_groundtruth.csv',
    index=False,
)

In [9]:
## Count how many snippets of the ground truth data set are considered
## relevant and how many are not.
##
## The readable names are attached by label value, not by position:
## value_counts() orders its result by frequency, so assigning
## ['Irrelevant', 'Relevant'] positionally would silently swap the names if
## relevant snippets ever outnumbered irrelevant ones, and would raise if only
## one of the two classes were present.
## NOTE(review): assumes groundtruth_label was parsed as booleans (or 0/1);
## any other label value is left unchanged by the mapping - TODO confirm
df_count = (
    df_groundtruth_merge['groundtruth_label']
    .value_counts()
    .rename(index={False: 'Irrelevant', True: 'Relevant'})
)
df_count

Irrelevant    443
Relevant       91
Name: groundtruth_label, dtype: int64