# FEVER 
150k short factual statements derived from Wikipedia and annotated by trained personnel, with 3 levels of claim veracity: supported, refuted and not enough information


#### Necessary imports

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly
import json
# Display settings: show up to 500 rows and never truncate the text inside a cell
# (claims are full sentences, so the default column width would cut them off).
pd.set_option('display.max_rows', 500, "display.max_colwidth", None)
# Enable plotly rendering inside the notebook.
plotly.offline.init_notebook_mode(connected=True)

import plotly.offline as pyo
import plotly.graph_objs as go
# NOTE(review): `pyo` is an alias for `plotly.offline`, so this is a second call to the
# same function, this time without `connected=True` — looks redundant; confirm and drop one.
pyo.init_notebook_mode()

## 1. Loading the Dataset

In [3]:
# Read the raw FEVER dump (JSON Lines: one record per line) and show how many rows it holds
FEVER_df = pd.read_json('Initial_datasets/FEVER_initial.jsonl', lines=True)
len(FEVER_df)

145449

## 2. Examining the dataset

### 2.1 Dropping unnecessary columns and rows

- We don't need the id or the evidence so we can drop these columns

In [4]:
# Keep only the columns used downstream.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which pandas
# deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
FEVER_df = FEVER_df.drop(columns=['id', 'evidence'])

- We also don't need the rows labelled "Not enough info", because we want a clear indication of whether a claim is true or false. This also means that the "Not verifiable" rows will be deleted, as there is a one-to-one relationship between the two.

In [5]:
# Remove every claim without a definite verdict ("NOT ENOUGH INFO", matched case-insensitively).
not_enough_info = FEVER_df['label'].str.lower().str.contains('not enough info')
FEVER_df = FEVER_df.drop(index=FEVER_df.index[not_enough_info])

# Sanity check: no "not verifiable" row may survive the filter above. Previously this count
# was computed mid-cell and silently discarded; asserting it makes the assumption enforceable.
remaining_unverifiable = FEVER_df['verifiable'].str.lower().str.contains('not verifiable').sum()
assert remaining_unverifiable == 0, (
    f"{remaining_unverifiable} 'not verifiable' rows remain after dropping 'not enough info' rows"
)

# Every claim is now "verifiable" so we can drop this column too.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
FEVER_df = FEVER_df.drop(columns=['verifiable'])

### 2.2 Examining true/fake entries

In [6]:
# Encode the verdict as an integer flag: 1 means the claim is TRUE, 0 means it is FALSE.
label_to_veracity = {'SUPPORTS': 1, 'REFUTES': 0}

# Fail loudly on any label outside the mapping instead of letting raw strings leak into
# the numeric column (the previous np.select default did this and forced an object dtype).
unexpected_labels = FEVER_df.loc[~FEVER_df['label'].isin(list(label_to_veracity)), 'label'].unique()
if len(unexpected_labels) > 0:
    raise ValueError(f"Unexpected label values: {list(unexpected_labels)}")

FEVER_df['claim_veracity'] = FEVER_df['label'].map(label_to_veracity).astype('int64')
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
FEVER_df = FEVER_df.drop(columns=['label'])

In [None]:
# Class balance: number of true (1) vs. false (0) claims
fig = px.histogram(FEVER_df, x='claim_veracity')
fig.update_xaxes(categoryarray=[1, 0])
fig.update_layout(bargap=0.2)
fig.show()

### 2.3 Removing duplicates

In [8]:
# Removing duplicates (over 7k): print the row count before, display the row count after.
# Only the first occurrence of each claim text is kept.
print(len(FEVER_df))
FEVER_df = FEVER_df.drop_duplicates(subset=['claim'], keep='first')
len(FEVER_df)

109810


102292

### 2.4 Claim Length 
- Group the claims by character length into quantile bins, then sample the same number of claims from each bin into the dataset

In [None]:
# Length of every claim, measured in characters
FEVER_df = FEVER_df.assign(claim_count=FEVER_df['claim'].str.len())

# Distribution of claim lengths
fig = px.histogram(FEVER_df, x='claim_count')
fig.update_layout(bargap=0.2)
fig.show()

In [10]:
# Quantile-based binning of claim length into 8 bins; first show the interval boundaries
claim_lengths = FEVER_df['claim_count']
print("8 different ranges of characters length that split the dataset into 8 groups: \n\n",
      pd.qcut(claim_lengths, 8).unique())

# Same binning again, this time labelled 0 (shortest claims) through 7 (longest claims)
FEVER_df['char_category'] = pd.qcut(claim_lengths, q=8, labels=list(range(8)))

8 different ranges of characters length that split the dataset into 8 groups: 

 [(57.0, 68.0], (29.0, 35.0], (68.0, 614.0], (50.0, 57.0], (35.0, 40.0], (40.0, 45.0], (45.0, 50.0], (10.999, 29.0]]
Categories (8, interval[float64]): [(10.999, 29.0] < (29.0, 35.0] < (35.0, 40.0] < (40.0, 45.0] < (45.0, 50.0] < (50.0, 57.0] < (57.0, 68.0] < (68.0, 614.0]]


In [11]:
# Balanced subsample: 500 random entries per length category,
# split evenly into 250 true and 250 false claims.

dfs = []
for category in range(8):
    in_category = FEVER_df['char_category'] == category
    # True claims (1) are appended before false claims (0) within each category
    for veracity in (1, 0):
        candidates = FEVER_df[in_category & (FEVER_df['claim_veracity'] == veracity)]
        dfs.append(candidates.sample(250, random_state=1))

FEVER_picked = pd.concat(dfs)

## 3. Saving new datasets

### 3.1. Whole FEVER

In [87]:
# Shuffle the rows, renumber the index, and drop the helper columns used only for binning.
# The shuffle is seeded (random_state=1, the same seed the sampling step uses) so that
# re-running the notebook writes an identical file; it was previously unseeded.
FEVER_df = (
    FEVER_df
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
    .drop(columns=['claim_count', 'char_category'])
)
# The fresh 0..n-1 index is written as the first (unnamed) CSV column.
FEVER_df.to_csv('FEVER_Final.csv', encoding='utf-8')

### 3.2. Picked FEVER

In [88]:
# Shuffle the balanced subsample, renumber the index, and drop the helper columns used for binning.
# The shuffle is seeded (random_state=1, the same seed the sampling step uses) so that
# re-running the notebook writes an identical file; it was previously unseeded.
FEVER_picked = (
    FEVER_picked
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
    .drop(columns=['claim_count', 'char_category'])
)
# The fresh 0..n-1 index is written as the first (unnamed) CSV column.
FEVER_picked.to_csv('FEVER_Picked.csv', encoding='utf-8')