# Preprocessing of data

### Preprocess 'news_sample.csv'

In [None]:
import pandas as pd
import lib.process_a as process_a

# Fetch 'news_sample.csv' from the FakeNewsCorpus git source (first column is
# used as the index) and hand it directly to the project's preprocessing step.
# This notebook describes that step as: cleanup -> remove stopwords -> stemming,
# together with word frequencies and reduction rates.
df_sample = (
    pd.read_csv(
        'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv',
        index_col=0,
    )
    .pipe(process_a.preprocess)
)

# Keep a CSV copy of the preprocessed dataframe on disk.
df_sample.to_csv(path_or_buf="data/news_sample_cleaned.csv")

### Preprocess '995,000_rows.csv' dataset

In [None]:
import pandas as pd
import lib.process_b as process_b

# Input and output locations for the large dataset.
src = 'data/995,000_rows.csv'
dst = 'data/995,000_rows_cleaned.csv'

# Read the raw rows, then run the project's preprocessing on them
# (described in this notebook as: cleanup -> remove stopwords -> stemming).
raw_data = pd.read_csv(filepath_or_buffer=src)
clean_data = raw_data.pipe(process_b.preprocess)

# Persist the processed data as CSV.
clean_data.to_csv(path_or_buf=dst)

### Split the 995k dataset into training, validation, and test sets

In [None]:
import pandas as pd
import lib.process_methods as pm

# Cleaned 995k dataset produced by the preprocessing cell.
# NOTE(review): that file is written with its index and is read back here
# without index_col, so an extra unnamed column is likely present - confirm
# whether downstream code expects it.
src = 'data/995,000_rows_cleaned.csv'
split_data = pd.read_csv(filepath_or_buffer=src)

# Split into training / validation / test sets; presumably this writes the
# 'data/*_data.csv' files read in the next section - confirm in
# lib.process_methods.
pm.train_valid_test(split_data)

### Group 'fake' and 'reliable' types

In [2]:
import pandas as pd
import lib.process_methods as pm

# Locations of the three dataset splits.
src_train = 'data/training_data.csv'
src_valid = 'data/validation_data.csv'
src_test = 'data/test_data.csv'

# Read the splits in order: training, validation, test.
train_data, valid_data, test_data = map(
    pd.read_csv, (src_train, src_valid, src_test)
)

# Type labels handed to group_data as the "omitted" set. The last entry is a
# timestamp string listed alongside the type labels
# (presumably from malformed rows - TODO confirm).
omitted_types = {
    'political', 'bias', 'rumor', 'unknown',
    'unreliable', 'clickbait', 'junksci', 'hate',
    '2018-02-10 13:43:39.521661',
}

# Type labels handed to group_data as the "fake" set.
fake_types = {'fake', 'satire', 'conspiracy'}

# Defined for reference; it is not passed to group_data in this cell.
reliable_types = {'reliable'}

# Group each split with the same label sets and write the result to its own
# '*_grouped.csv' file, one split at a time.
train_data_group = pm.group_data(train_data, omitted_types, fake_types)
dst = 'data/training_data_grouped.csv'
train_data_group.to_csv(path_or_buf=dst)

valid_data_group = pm.group_data(valid_data, omitted_types, fake_types)
dst = 'data/validation_data_grouped.csv'
valid_data_group.to_csv(path_or_buf=dst)

test_data_group = pm.group_data(test_data, omitted_types, fake_types)
dst = 'data/test_data_grouped.csv'
test_data_group.to_csv(path_or_buf=dst)

In [3]:
# Percentage distribution of 'type' values in the grouped training data.
# The label text is kept exactly as in the recorded output.
type_dist = train_data_group['type'].value_counts(normalize=True).mul(100)
print("TRANING DATA:", type_dist, sep="\n")

# Percentage distribution of 'type' values in the grouped validation data.
type_dist = valid_data_group['type'].value_counts(normalize=True).mul(100)
print("VALIDATION DATA:", type_dist, sep="\n")

# Percentage distribution of 'type' values in the grouped test data.
type_dist = test_data_group['type'].value_counts(normalize=True).mul(100)
print("TEST DATA:", type_dist, sep="\n")

TRANING DATA:
type
reliable    50.298673
fake        49.701327
Name: proportion, dtype: float64
VALIDATION DATA:
type
reliable    50.494572
fake        49.505428
Name: proportion, dtype: float64
TEST DATA:
type
reliable    50.808436
fake        49.191564
Name: proportion, dtype: float64
