In [1]:
# import all needed packages 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import nltk
from nltk import word_tokenize
from nltk.probability import FreqDist

import catboost as cb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

from collections import Counter
import numpy as np

In [2]:
# import the data and turn into dataframe 
fake_news_data = pd.read_csv('../raw_data/train.csv')

In [3]:
# view the data
fake_news_data.head()

Unnamed: 0,ID,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,barely_true_counts,false_counts,...,sentiment_score,sentiment_magnitude,anger,fear,joy,disgust,sad,speaker_id,list,sentiment_code
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,-0.5,0.5,0.121137,0.008926,0.026096,0.263479,0.531887,_0_,"[0, 1]",_NEG_
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,-0.4,0.8,0.095352,0.124566,0.191357,0.016999,0.102045,_1_,"[0, 1]",_NEG_
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.3,0.3,0.039559,0.024162,0.500384,0.454228,0.052453,_2_,"[1, 0]",_NEG_
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,...,-0.3,0.3,0.004804,0.194674,0.375055,0.022509,0.383403,_3_,"[0, 1]",_NEG_
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,...,0.0,0.0,0.044237,0.215996,0.222402,0.045672,0.274343,_4_,"[0, 1]",


In [4]:
fake_news_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
barely_true_counts,11519.0,11.564893,18.980609,0.0,0.0,2.0,12.0,70.0
false_counts,11519.0,13.349596,24.163892,0.0,0.0,2.0,15.0,114.0
half_true_counts,11519.0,17.146975,35.82422,0.0,0.0,3.0,13.0,160.0
mostly_true_counts,11519.0,16.455595,36.133691,0.0,0.0,3.0,11.0,163.0
pants_on_fire_counts,11519.0,6.278062,16.301728,0.0,0.0,1.0,5.0,105.0
sentiment_score,11519.0,-0.297543,0.337396,-0.9,-0.6,-0.3,0.0,0.9
sentiment_magnitude,11519.0,0.429143,0.329673,0.0,0.2,0.4,0.6,7.1
anger,11519.0,0.158227,0.115627,0.0,0.071107,0.12886,0.215767,0.931034
fear,11519.0,0.135847,0.107703,0.0,0.061308,0.106236,0.180576,0.931034
joy,11519.0,0.176459,0.155817,0.0,0.059737,0.128039,0.249675,0.918537


In [5]:
# check for null values 
fake_news_data.isnull().sum()

ID                         0
label                      0
statement                  0
subject                    0
speaker                    0
speaker_job             3239
state_info              2486
party_affiliation          0
barely_true_counts         0
false_counts               0
half_true_counts           0
mostly_true_counts         0
pants_on_fire_counts       0
context                  112
sentiment               1541
sentiment_score            0
sentiment_magnitude        0
anger                      0
fear                       0
joy                        0
disgust                    0
sad                        0
speaker_id                 0
list                       0
sentiment_code          1541
dtype: int64

In [6]:
missing = pd.concat([fake_news_data.isnull().sum(), 100 * fake_news_data.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count')

Unnamed: 0,count,%
ID,0,0.0
speaker_id,0,0.0
sad,0,0.0
disgust,0,0.0
joy,0,0.0
fear,0,0.0
anger,0,0.0
sentiment_magnitude,0,0.0
sentiment_score,0,0.0
list,0,0.0


In [7]:
# See how one record looks.
fake_news_data.iloc[0]

ID                                                              2635.json
label                                                               false
statement               Says the Annies List political group supports ...
subject                                                          abortion
speaker                                                      dwayne-bohac
speaker_job                                          State representative
state_info                                                          Texas
party_affiliation                                              republican
barely_true_counts                                                    0.0
false_counts                                                          1.0
half_true_counts                                                      0.0
mostly_true_counts                                                    0.0
pants_on_fire_counts                                                  0.0
context                               

In [8]:
# get genral overview of the datatypes the the data has. 
fake_news_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11519 entries, 0 to 11518
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    11519 non-null  object 
 1   label                 11519 non-null  object 
 2   statement             11519 non-null  object 
 3   subject               11519 non-null  object 
 4   speaker               11519 non-null  object 
 5   speaker_job           8280 non-null   object 
 6   state_info            9033 non-null   object 
 7   party_affiliation     11519 non-null  object 
 8   barely_true_counts    11519 non-null  float64
 9   false_counts          11519 non-null  float64
 10  half_true_counts      11519 non-null  float64
 11  mostly_true_counts    11519 non-null  float64
 12  pants_on_fire_counts  11519 non-null  float64
 13  context               11407 non-null  object 
 14  sentiment             9978 non-null   object 
 15  sentiment_score    

In [9]:
# This is the feild that we are interseted in 
fake_news_data['statement'].head()

0    Says the Annies List political group supports ...
1    When did the decline of coal start? It started...
2    Hillary Clinton agrees with John McCain "by vo...
3    Health care reform legislation is likely to ma...
4    The economic turnaround started at the end of ...
Name: statement, dtype: object

In [10]:
# the statement feild does not show the full text so, we use set_option to get full text. 
pd.set_option("display.max_colwidth", None)

In [11]:
fake_news_data['statement'].head()

0                                                               Says the Annies List political group supports third-trimester abortions on demand.
1    When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.
2                                        Hillary Clinton agrees with John McCain "by voting to give George Bush the benefit of the doubt on Iran."
3                                                                   Health care reform legislation is likely to mandate free sex change surgeries.
4                                                                                           The economic turnaround started at the end of my term.
Name: statement, dtype: object

In [12]:
# uderstand and get distusbusion of the labels. 
fake_news_data['label'].value_counts()

label
half-true      2361
false          2255
mostly-true    2213
barely-true    1891
true           1845
pants-fire      954
Name: count, dtype: int64

In [13]:
# see the subject that is most talked about
fake_news_data['subject'].value_counts()

subject
health-care                                        434
taxes                                              337
immigration                                        285
education                                          279
elections                                          273
                                                  ... 
elections,labor,workers                              1
health-care,medicare,10-news-tampa-bay               1
environment,government-regulation,public-health      1
baseball,economy,florida                             1
candidates-biography,infrastructure                  1
Name: count, Length: 4190, dtype: int64

In [14]:
# We see that this data mostly focus on Obama and Trump. 
fake_news_data['speaker'].value_counts()

speaker
barack-obama        549
donald-trump        310
hillary-clinton     266
mitt-romney         195
john-mccain         168
                   ... 
d-king                1
burrell-ellis         1
david-segal           1
hilary-treadwell      1
sal-esquivel          1
Name: count, Length: 3125, dtype: int64

In [15]:
# We understand which state is mention the most. 
fake_news_data['state_info'].value_counts()

state_info
Texas             1135
Florida           1114
Wisconsin          799
New York           744
Illinois           624
                  ... 
Qatar                1
New Hampshire        1
Virginia             1
United Kingdom       1
Georgia              1
Name: count, Length: 84, dtype: int64

In [16]:
fake_news_data['party_affiliation'].value_counts()

party_affiliation
republican                      5092
democrat                        3730
none                            1967
organization                     237
independent                      166
newsmaker                         63
libertarian                       46
activist                          43
journalist                        40
columnist                         37
talk-show-host                    29
state-official                    20
labor-leader                      15
business-leader                   10
tea-party-member                  10
education-official                 3
green                              3
constitution-party                 2
liberal-party-canada               1
government-body                    1
Moderate                           1
democratic-farmer-labor            1
ocean-state-tea-party-action       1
county-commissioner                1
Name: count, dtype: int64

In [None]:
# We ploted all the numeric data to see what feild is usefull and which not. 
fake_news_data.hist(figsize=(15,10))
plt.subplots_adjust(hspace=0.5)

In [None]:
# US has 50 states not 84
fake_news_data['state_info'].nunique()

In [None]:
fake_news_data['speaker_job'].value_counts()

In [None]:
# there we see some outliers that can cause some problem. 
sns.boxplot(fake_news_data[['barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts']])

In [None]:
# we merged pants-fire into false 
fake_news_data['label'] = fake_news_data['label'].replace('pants-fire','false')
fake_news_data['label'].value_counts()

In [None]:
# we found some duplicates and we dropped them. 
fake_news_data[fake_news_data.duplicated()]
fake_news_data = fake_news_data.drop_duplicates()

In [None]:
# we dropped rows with missing values because some of the data points had missing state_info and speaker_job.
fake_news_data = fake_news_data.dropna()

In [None]:
# we only want to deal with fake news from USA. So, we removed the other states or country. 
us_state = ['Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware','Florida','Georgia','Hawaii','Idaho','Illinois',
            'Indiana','Iowa','Kansas','Kentucky','Louisiana','Maine','Maryland','Massachusetts','Michigan','Minnesota','Mississippi','Missouri','Montana',
            'Nebraska','Nevada','New Hampshire','New Jersey','New Mexico','New York','North Carolina','North Dakota','Ohio','Oklahoma','Oregon',
            'Pennsylvania','Rhode Island','South Carolina','South Dakota','Tennessee','Texas','Utah','Vermont','Virginia','Washington','West Virginia',
            'Wisconsin','Wyoming','District of Columbia', 'Washington, D.C.']

fake_news_data = fake_news_data[fake_news_data['state_info'].isin(us_state)]

In [None]:
fake_news_data