# KAGGLE Dataset
Headlines and bodies of news articles.

### Necessary Imports

In [None]:
# Standard library
import warnings

# Third-party
import numpy as np
import pandas as pd
import plotly
import plotly.express as px

# Display up to 500 rows and do not truncate the text inside a column
pd.set_option('display.max_rows', 500, "display.max_colwidth", None)
# Enable plotly's offline notebook rendering
plotly.offline.init_notebook_mode(connected=True)
# Suppress every warning for the rest of the notebook
# NOTE(review): this also hides pandas deprecation / SettingWithCopy warnings
warnings.filterwarnings('ignore')

## 1. Loading the datasets

In [None]:
# Read the two source files; each one holds a single class of articles
KAGGLE_true = pd.read_csv('Initial_datasets/Kaggle_True.csv', low_memory=False)
KAGGLE_false = pd.read_csv('Initial_datasets/Kaggle_Fake.csv', low_memory=False)

# Label the rows: 1 for the "True" file, 0 for the "Fake" file
KAGGLE_true['claim_veracity'] = 1
KAGGLE_false['claim_veracity'] = 0

# Stack both classes into one frame with a fresh 0..n-1 index
KAGGLE_df = pd.concat((KAGGLE_true, KAGGLE_false), ignore_index=True)
print("size od dataset: ", KAGGLE_df.shape)

## 2. Formatting the dataset

### 2.1 Dropping unnecessary columns

In [None]:
# Remove the 'text' and 'date' columns; only the remaining columns are used below.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (`drop([...], 1)`) is rejected by pandas 2.0 and later.
KAGGLE_df = KAGGLE_df.drop(columns=['text', 'date'])
KAGGLE_df.head(5)

### 2.2 Removing duplicates

In [None]:
# Removing duplicated titles, keeping the first occurrence (around 120)
rows_before_dedup = len(KAGGLE_df)
print(rows_before_dedup)
KAGGLE_df = KAGGLE_df.drop_duplicates(subset='title', keep='first')
rows_after_dedup = len(KAGGLE_df)
print(rows_after_dedup)

### 2.3 Inspecting the subject

In [None]:
# Number of articles per subject, with a small gap between the bars
fig = px.histogram(KAGGLE_df, x='subject').update_layout(bargap=0.2)
fig.show()

### 2.4 Any special characters

In [None]:
# Dropping titles that contain '@'
has_at_sign = KAGGLE_df['title'].str.contains('@', regex=True)
KAGGLE_df = KAGGLE_df.drop(index=KAGGLE_df.index[has_at_sign])

In [None]:
# Dropping titles that contain a newline character
has_newline = KAGGLE_df['title'].str.contains('\n', regex=True)
KAGGLE_df = KAGGLE_df.drop(index=KAGGLE_df.index[has_newline])

In [None]:
# Dropping titles that contain links ('http')
has_link = KAGGLE_df['title'].str.contains('http', regex=True)
KAGGLE_df = KAGGLE_df.drop(index=KAGGLE_df.index[has_link])

In [None]:
# Dropping titles that contain '#' -> mostly very weird entries that are not reliable (~400)
has_hash = KAGGLE_df['title'].str.contains('#', regex=True)
KAGGLE_df = KAGGLE_df.drop(index=KAGGLE_df.index[has_hash])

In [None]:
# A free-standing ' & ' is rewritten as ' and ' (25)
titles_with_and = KAGGLE_df['title'].str.replace(' & ', ' and ')
KAGGLE_df['title'] = titles_with_and

# Any title that still contains '&' after that rewrite is dropped
has_ampersand = KAGGLE_df['title'].str.contains('&', regex=True)
KAGGLE_df = KAGGLE_df.drop(index=KAGGLE_df.index[has_ampersand])

In [None]:
# Dropping titles that contain ';' -> hard to trust such titles
has_semicolon = KAGGLE_df['title'].str.contains(';', regex=True)
KAGGLE_df = KAGGLE_df.drop(index=KAGGLE_df.index[has_semicolon])

In [None]:
# Number of rows left after the special-character filters
len(KAGGLE_df)

### 2.5 Other observations

- Looking for frequent words that are suspicious and unnecessary

In [None]:
# Print the 200 most frequent whitespace-separated tokens across all titles

from collections import Counter

title_words = " ".join(KAGGLE_df["title"]).split()
word_counts = Counter(title_words)
print(word_counts.most_common(200))

In [None]:
def replace_words(replace, replace_with, df=None):
    """Replace a literal phrase in the 'title' column, ignoring letter case.

    Only the matched phrase is rewritten; the rest of each title keeps its
    original casing. (Previously the replacement ran on a lower-cased copy of
    the title, so every title that matched was lower-cased in full while
    non-matching titles were left alone.)

    Parameters
    ----------
    replace : str
        Phrase to look for. It is treated as plain text, not as a regular
        expression, and matched case-insensitively.
    replace_with : str
        Text inserted in place of every match. It is inserted verbatim.
    df : pandas.DataFrame, optional
        Frame whose 'title' column is updated in place. Defaults to the
        notebook-level ``KAGGLE_df``.

    Returns
    -------
    None
        The 'title' column of the target frame is overwritten.
    """
    import re  # local import keeps this cell runnable on its own

    frame = KAGGLE_df if df is None else df
    # Escape the phrase so characters such as '.' or '!' are matched literally
    phrase = re.compile(re.escape(replace), flags=re.IGNORECASE)
    # A callable replacement is used so that backslashes in `replace_with`
    # are never interpreted as regex group references
    frame['title'] = frame['title'].str.replace(phrase, lambda _match: replace_with, regex=True)

In [None]:
# Delete bracketed notes from the titles. A note is removed only when everything
# between the brackets is made of letters, digits, spaces and the characters / , . '
# `str.replace` already leaves non-matching titles untouched, so no separate
# `contains` check is needed.
for bracket_pattern in (
    r'\[[A-Za-z\/ ,.\'0-9]*\]',  # [] brackets
    r'\([A-Za-z\/ ,.\'0-9]*\)',  # () brackets
):
    KAGGLE_df['title'] = KAGGLE_df['title'].str.replace(bracket_pattern, '', regex=True)

# Delete other words that don't bring much to the table.
# Order matters: the longer variants ('wow!', 'exclusive:') must run before the
# shorter ones ('wow', 'exclusive ') so that no stray punctuation is left behind.
# 'exclusive' was previously misspelled as 'exlusive', so it never matched.
for unwanted_phrase in (
    'factbox - ',
    'factbox:',
    'wow!',
    'wow',
    'exclusive:',
    'exclusive - ',
    'exclusive ',
    'watch: ',
):
    replace_words(unwanted_phrase, '')

- Many entries quote Twitter posts in an unusual way that doesn't resemble a "normal claim"

In [None]:
# Drop the titles that mention "on Twitter" in any letter case (212)
mentions_twitter = KAGGLE_df['title'].str.lower().str.contains('on twitter', regex=True)
KAGGLE_df = KAGGLE_df.drop(index=KAGGLE_df.index[mentions_twitter])

### 2.6. TRUE/FALSE Distribution after formatting

In [None]:
# Count of true (1) and fake (0) titles after the cleaning steps
fig = px.histogram(KAGGLE_df, x='claim_veracity')
fig.update_xaxes(categoryarray=[1, 0])
fig.update_layout(bargap=0.2)
fig.show()

### 2.7. Sentence Length
- Titles with a low character count seem to be bad: they are very uninformative - EXAMINE


In [None]:
# Character length of every title
title_lengths = KAGGLE_df['title'].str.len()
KAGGLE_df['title_count'] = title_lengths

# Distribution of the title lengths (no log-scale option is passed to the plot)
fig = px.histogram(KAGGLE_df, x='title_count').update_layout(bargap=0.2)
fig.show()

In [None]:
# Split the titles into non-overlapping character-length buckets for inspection.
# Every bucket is (lower, upper]; the row counts noted by the original author
# are kept where the bucket definition is unchanged.
title_len = KAGGLE_df['title_count']

KAGGLE_df_1 = KAGGLE_df[title_len <= 35]                        # 0 - 35 -> candidates for deletion (64)
KAGGLE_df_2 = KAGGLE_df[(title_len > 35) & (title_len <= 50)]   # 35 - 50 (1377)
KAGGLE_df_3 = KAGGLE_df[(title_len > 50) & (title_len <= 75)]   # 50 - 75 (20886)
KAGGLE_df_4 = KAGGLE_df[(title_len > 75) & (title_len <= 100)]  # 75 - 100 (11502)
# Upper bound added: this bucket used to be `> 100` only, so it also contained
# every row of KAGGLE_df_6 even though it is meant to cover 100 - 180.
KAGGLE_df_5 = KAGGLE_df[(title_len > 100) & (title_len <= 180)]  # 100 - 180
KAGGLE_df_6 = KAGGLE_df[title_len > 180]                        # 180 - ___ (113) Don't like the quality of these entries

## 3. Making selected dataset

Making a selected Kaggle dataset with entries from each category based on sentence length and their claim veracity

- 8 quantile-based ranges of character length (computed separately for true and fake titles) split each class into 8 groups

In [None]:
# Balanced selection: 800 random entries per length category
# (400 from the true titles and 400 from the fake titles).
N_LENGTH_BINS = 8       # number of quantile-based title-length categories
SAMPLES_PER_BIN = 400   # rows drawn per category and per class

# Explicit copies: 'char_category' is added to these frames below, and writing a
# new column onto a boolean-filtered slice triggers pandas' SettingWithCopy warning.
KAGGLE_df_true = KAGGLE_df[KAGGLE_df['claim_veracity'] == 1].copy()
KAGGLE_df_false = KAGGLE_df[KAGGLE_df['claim_veracity'] == 0].copy()

dfs = []
for df in [KAGGLE_df_true, KAGGLE_df_false]:
    # Quantile bins are computed per class, so the bin edges may differ
    # between the true and the fake titles.
    df['char_category'] = pd.qcut(df['title_count'], N_LENGTH_BINS, labels=list(range(N_LENGTH_BINS)))
    for category in range(N_LENGTH_BINS):
        category_df = df[df['char_category'] == category]
        # Fixed seed -> the same rows are picked on every run.
        # Raises ValueError if a category holds fewer than SAMPLES_PER_BIN rows.
        dfs.append(category_df.sample(SAMPLES_PER_BIN, random_state=1))

KAGGLE_picked = pd.concat(dfs)

## 4. Saving Datasets

In [None]:
# The 'title' column is exported under the name 'claim'
claim_column_name = {'title': 'claim'}
KAGGLE_df = KAGGLE_df.rename(columns=claim_column_name)
KAGGLE_picked = KAGGLE_picked.rename(columns=claim_column_name)

### 4.1. Whole KAGGLE

In [None]:
# Shuffle all rows, renumber the index and drop the helper columns before saving.
# The shuffle is seeded (like the sampling step above) so that re-running the
# notebook writes the same file; it used to be unseeded.
KAGGLE_df = (
    KAGGLE_df
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
    .drop(['subject', 'title_count'], axis=1)
)
# The fresh 0..n-1 index is written as the first, unnamed CSV column
KAGGLE_df.to_csv('KAGGLE_Final.csv', encoding='utf-8')

### 4.2. Picked KAGGLE

In [None]:
# Shuffle the picked rows, renumber the index and drop the helper columns before saving.
# The shuffle is seeded (like the sampling step above) so that re-running the
# notebook writes the same file; it used to be unseeded.
KAGGLE_picked = (
    KAGGLE_picked
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
    .drop(['subject', 'title_count', 'char_category'], axis=1)
)
# The fresh 0..n-1 index is written as the first, unnamed CSV column
KAGGLE_picked.to_csv('KAGGLE_Picked.csv', encoding='utf-8')