# Data Cup Fake News Data Profiling and Parsing

## 1. Import packages

In [1]:
import os
import pandas as pd
import numpy as np
import json
import pandas_profiling

In [2]:
# Load the ipycache extension; the %%cache cell magic used by later cells comes from it.
%load_ext ipycache

  from IPython.utils.traitlets import Unicode


## 2. Prepare training data

### 2.1 Load training data

Here, we load the `train.json` file and convert the data from JSON to a pandas DataFrame. This makes it easier to wrangle the data (convert data formats, create new dimensions, and standardize).

In [3]:
%%cache train_dataframe.pkl train_dataframe
with open("data/train.json") as f:
    train_data = json.load(f)

train_dataframe = pd.DataFrame.from_records(train_data)
train_dataframe['date'] = pd.to_datetime(train_dataframe['date'])

[Skipped the cell's code and loaded variables train_dataframe from file '/home/bking/Projects/pipenvs/Fake_News_Data_Cup/train_dataframe.pkl'.]


In [4]:
# Persist train_dataframe with IPython's %store magic so another notebook can restore it with `%store -r`.
%store train_dataframe 

Stored 'train_dataframe' (DataFrame)


In [5]:
# Display the frame as the cell output; pandas elides the middle rows of a long frame.
train_dataframe

Unnamed: 0,claim,claimant,date,label,related_articles,id
0,A line from George Orwell's novel 1984 predict...,,2017-07-17,0,"[122094, 122580, 130685, 134765]",0
1,Maine legislature candidate Leslie Gibson insu...,,2018-03-17,2,"[106868, 127320, 128060]",1
2,A 17-year-old girl named Alyssa Carson is bein...,,2018-07-18,1,"[132130, 132132, 149722]",4
3,In 1988 author Roald Dahl penned an open lette...,,2019-02-04,2,"[123254, 123418, 127464]",5
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6
...,...,...,...,...,...,...
15550,"The omnibus spending bill has ""9,427 pork barr...",John McCain,2009-02-25,2,"[82947, 93503]",17137
15551,Representative Maxine Waters said Muslims were...,,2017-06-06,0,"[103780, 104726, 126025]",17138
15552,"""We were not, I repeat, were not told that wat...",Nancy Pelosi,2009-04-23,0,"[11331, 68915, 2186, 2185, 88418, 81950]",17139
15553,"As of August 2017, members of the public could...",,2018-05-14,2,"[121353, 152864, 154411]",17140


### 2.2 Convert 'date' column to date data type

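Convert the `date` field to the pandas datetime data type (displayed as YYYY-MM-DD). The conversion is performed with `pd.to_datetime` in the loading cell of section 2.1. For more information: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html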

### 2.3 Profile the training data

Use pandas_profiling: https://github.com/pandas-profiling/pandas-profiling

In [8]:
# Build the profiling report once, passing the title and the full-width style
# in a single call. A bare profile_report(...) statement that is not the last
# expression of the cell is neither displayed nor kept, so it only wastes work.
profile = train_dataframe.profile_report(
    title='Fake News Data Profile',
    style={'full_width': True},
)
# Optional: export the report as a standalone HTML file.
# profile.to_file(output_file="data_profile.html")

In [9]:
# Render the profiling report built in the previous cell as this cell's output.
profile



Information about the dataset:
- “label”: 0: false, 1: partly true, 2: true
- “claimant”: entity who made the claim

## 3. Load and process articles to generate corpora

### 3.1 Build dataframe of article text and article id

In [10]:
%%cache article_df.pkl article_df

path_to_articles = r"/media/bking/data/Datasets/fakenews_datacup/train_articles/"

# Build lists of article id and article text
article_id_list = []
article_text_list = []
for index, article_file in enumerate(os.scandir(path_to_articles)):
    article_path = str(article_file.path)
    article_id = int(''.join(list(filter(str.isdigit, article_path))))
    
    with open(article_path) as f:
         text = f.read()
    
    article_id_list.append(article_id)
    article_text_list.append(text)
#     if index == 3:
#         break
#     print(article_id)

# Construct dataframe
zippedList =  list(zip(article_id_list, article_text_list))
article_df = pd.DataFrame(zippedList, columns = ['id' , 'text'])  
# article_df.set_index(id, drop=True, append=False, inplace=False, verify_integrity=False)

[Skipped the cell's code and loaded variables article_df from file '/home/bking/Projects/pipenvs/Fake_News_Data_Cup/article_df.pkl'.]


### 3.2 Store initial data for other steps 

In [11]:
# Persist article_df with IPython's %store magic so another notebook can restore it with `%store -r`.
%store article_df 

Stored 'article_df' (DataFrame)


In [12]:
# Preview the first rows to check that ids and article text were read as expected.
article_df.head()

Unnamed: 0,id,text
0,106081,Trump Supporter “Kicked Pregnant Muslim Woman ...
1,129341,UW Facts and Figures – University of Wisconsin...
2,100963,Gun Control Advocates Target Peaceful Switzerl...
3,12200,U.S. and Republic of Korea Conclude New Specia...
4,128496,Kremlin's persistent claim of “expected chemic...
