In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("ggplot")

In [2]:
import string

# Data understanding

In [4]:
df = pd.read_csv("data/result/final/train2.tsv", sep="\t")

In [5]:
df.head(2)

Unnamed: 0,json_id,label,statement,subject,speaker,job_title,state,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,justification,sentiment,question,grammar_errors
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...,negative,not_question,1
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe...",neutral,question,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10269 entries, 0 to 10268
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   json_id               10269 non-null  object 
 1   label                 10269 non-null  object 
 2   statement             10269 non-null  object 
 3   subject               10268 non-null  object 
 4   speaker               10268 non-null  object 
 5   job_title             7366 non-null   object 
 6   state                 8057 non-null   object 
 7   party_affiliation     10268 non-null  object 
 8   barely_true_counts    10268 non-null  float64
 9   false_counts          10268 non-null  float64
 10  half_true_counts      10268 non-null  float64
 11  mostly_true_counts    10268 non-null  float64
 12  pants_on_fire_counts  10268 non-null  float64
 13  context               10168 non-null  object 
 14  justification         10154 non-null  object 
 15  sentiment          

# Data preparation

In [7]:
df[["statement", "justification"]] = df[["statement", "justification"]].astype(str)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10269 entries, 0 to 10268
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   json_id               10269 non-null  object 
 1   label                 10269 non-null  object 
 2   statement             10269 non-null  object 
 3   subject               10268 non-null  object 
 4   speaker               10268 non-null  object 
 5   job_title             7366 non-null   object 
 6   state                 8057 non-null   object 
 7   party_affiliation     10268 non-null  object 
 8   barely_true_counts    10268 non-null  float64
 9   false_counts          10268 non-null  float64
 10  half_true_counts      10268 non-null  float64
 11  mostly_true_counts    10268 non-null  float64
 12  pants_on_fire_counts  10268 non-null  float64
 13  context               10168 non-null  object 
 14  justification         10269 non-null  object 
 15  sentiment          

## Add statement and justification length information

In [9]:
df["statement_len"] = df["statement"].apply(len)
df["justification_len"] = df["justification"].apply(len)

In [11]:
df = df.drop(["json_id"], axis=1)

In [12]:
df.head(2)

Unnamed: 0,label,statement,subject,speaker,job_title,state,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,justification,sentiment,question,grammar_errors,statement_len,justification_len
0,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...,negative,not_question,1,82,248
1,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe...",neutral,question,1,141,637


## Add punctuations count per 100 characters

In [13]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
df["punctuations"] = df["statement"].apply(lambda s: sum(map(s.count, string.punctuation)))

## How many null values

In [15]:
df.isna().sum()

label                      0
statement                  0
subject                    1
speaker                    1
job_title               2903
state                   2212
party_affiliation          1
barely_true_counts         1
false_counts               1
half_true_counts           1
mostly_true_counts         1
pants_on_fire_counts       1
context                  101
justification              0
sentiment                  0
question                   0
grammar_errors             0
statement_len              0
justification_len          0
punctuations               0
dtype: int64

## Describe dataset

In [30]:
df.describe()

Unnamed: 0,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,grammar_errors,statement_len,justification_len,punctuations
count,10268.0,10268.0,10268.0,10268.0,10268.0,10269.0,10269.0,10269.0,10269.0
mean,11.561258,13.314959,17.197215,16.492111,6.203253,0.383971,106.294381,425.242964,2.754114
std,19.007031,24.141501,35.949606,36.252653,16.118404,0.686065,46.150477,314.071139,2.033661
min,0.0,0.0,0.0,0.0,0.0,0.0,11.0,3.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,73.0,270.0,1.0
50%,2.0,2.0,3.0,3.0,1.0,0.0,99.0,394.0,2.0
75%,12.0,15.0,13.0,11.0,5.0,1.0,132.0,534.0,4.0
max,70.0,114.0,160.0,163.0,105.0,13.0,1109.0,9394.0,40.0


## Grammatical errors example

In [32]:
df[df['grammar_errors']==13]

Unnamed: 0,label,statement,subject,speaker,job_title,state,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,justification,sentiment,question,grammar_errors,statement_len,justification_len,punctuations
9404,False,"Joe, I keep hearing you every morning talking ...",,,,,,,,,,,,,neutral,not_question,13,1109,3,40


In [29]:
print(df[df['grammar_errors']==13]['statement'].values[0])

Joe, I keep hearing you every morning talking about the biggest tax increase in history, but you don't mention it's also the biggest tax cut in history.''	health-care,taxes	richard-durbin	Senator	Illinois	democrat	0	2	1	0	1	a comment on the Morning Joe'' show on MSNBC. "Bill Clinton said, ""As secretary of state, (Hillary Clinton) worked hard to get strong sanctions against Iran's nuclear program"" and ""got Russia and China to support them. ""During Hillary Clintons first 18 months as secretary, the State Department led the global effort to increase sanctions on Iran  notably getting Russia and China on board  culminating in an important U. N.  resolution. Clinton was personally involved in these diplomatic efforts and pushed them publicly. Experts said these sanctions, on top of other sanctions passed before and after, were crucial to getting Iran to the negotiating table. However, Clinton wasnt singularly responsible for the sanctions or getting China and Russia to support them, jus

# Understanding features