In [None]:
import pandas as pd

# Location of the raw dataset. Swap in the commented-out sample path below
# for quick experiments on a smaller file.
src = 'data/995,000_rows.csv'
# src = 'data/SAMPLE.csv'

# read the whole CSV into memory; later cells read from `raw_data`
raw_data = pd.read_csv(filepath_or_buffer=src)

### Amount of '!' (exclamations) in fake news vs. reliable news

On average, there seem to be many more '!' characters in articles labelled 'fake' than in articles labelled 'reliable'.

In [None]:
import re

import matplotlib.pyplot as plt

# character whose occurrences are counted in every article
char = '!'

# Build a separate dataframe (raw_data itself is left untouched) with one
# extra column holding the number of `char` occurrences per article.
# Series.str.count interprets its argument as a regular expression, so the
# character is escaped: the count stays literal even if `char` is later
# changed to a regex metacharacter such as '?' or '.'.
exclm_data = raw_data.assign(
    exclm_count=raw_data['content'].str.count(re.escape(char))
)

# Mean number of `char` occurrences per article for each of the two labels.
# NOTE: the variable names keep their `_sum` suffix, but the values are means.
# Selecting only the count column via .loc avoids building a filtered copy of
# every column (including the article text) just to read one of them.
is_fake = exclm_data['type'] == 'fake'
is_reliable = exclm_data['type'] == 'reliable'
fake_exclm_sum = exclm_data.loc[is_fake, 'exclm_count'].mean()
reliable_exclm_sum = exclm_data.loc[is_reliable, 'exclm_count'].mean()

# plot data
fig, ax = plt.subplots()

ax.set_ylabel('mean')
ax.set_title('\'!\' characters in fake vs. reliable')

ax.bar(['fake', 'reliable'], [fake_exclm_sum, reliable_exclm_sum])

plt.show()

### Number of '!' (exclamation marks) for each article type (label)

Articles labelled 'political' have the most exclamation points on average; articles labelled 'fake' come second.

In [None]:
import re

import matplotlib.pyplot as plt

# character whose occurrences are counted in every article
char = '!'

# Build a separate dataframe (raw_data itself is left untouched) with one
# extra column holding the number of `char` occurrences per article.
# Series.str.count interprets its argument as a regular expression, so the
# character is escaped to keep the count literal for any choice of `char`.
exclm_data = raw_data.assign(
    exclm_count=raw_data['content'].str.count(re.escape(char))
)

# article types (labels) to compare, in plotting order
types = ['reliable',
         'political',
         'bias',
         'fake',
         'conspiracy',
         'rumor',
         'unknown',
         'unreliable',
         'clickbait',
         'junksci',
         'satire',
         'hate'
         ]

# Mean number of `char` occurrences per article for each type.
# NOTE: despite its name, `sums` holds means.
# A single groupby replaces the previous per-type loop, whose loop variables
# were called `type` and `sum` and therefore overwrote those builtins for
# every later cell in the kernel. reindex() puts the results in the order of
# `types` and yields NaN for a type that does not occur in the data, which is
# what the mean of an empty selection gave before.
sums = (
    exclm_data.groupby('type', sort=False)['exclm_count']
    .mean()
    .reindex(types)
    .tolist()
)

# plot data
fig, ax = plt.subplots()
plt.xticks(rotation='vertical')

ax.set_ylabel('mean')
ax.set_title('\'!\' characters in all article types')

ax.bar(types, sums)

plt.show()

### Amount of unique words in reliable news vs. fake news

Count the number of distinct words in each article, then compare the mean count per article type in a bar plot.

In [None]:
import lib.process_methods as pm
import pandas as pd
import os.path

# load the cleaned dataset (swap the two paths to use the full dataset)
# src = 'data/995,000_rows_cleaned.csv'
src = 'data/SAMPLE_cleaned.csv'
word_data = pd.read_csv(src)

# stays an empty dataframe when the cached file already exists
word_freq = pd.DataFrame()

# Compute the per-article word statistic only once and cache it on disk.
# This might take a while!
# NOTE(review): the cache is keyed on the file name only, so an existing file
# is not refreshed when `src` is switched to another dataset -- delete it
# manually in that case.
path = 'data/word_freq.csv'
if not os.path.isfile(path):
    # presumably returns a dataframe with a 'content_word_freq' column derived
    # from 'content_clean' -- confirm in lib.process_methods
    word_freq = pm.word_freq(word_data, 'content_clean', 'content_word_freq')

    # put the article type (label) in front as the first column
    word_freq.insert(0, "type", word_data['type'])

    # save to the same path that was checked above, so the existence check
    # and the written file cannot drift apart
    word_freq.to_csv(path)
else:
    print('File already exists')

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# load the cached per-article word statistics
src = 'data/word_freq.csv'
word_freq = pd.read_csv(src)

# article types (labels) to compare, in plotting order
types = ['reliable',
         'political',
         'bias',
         'fake',
         'conspiracy',
         'rumor',
         'unknown',
         'unreliable',
         'clickbait',
         'junksci',
         'satire',
         'hate'
         ]

# Mean of the 'content_word_freq' column for each article type.
# A single groupby replaces the previous per-type loop, whose loop variable
# was called `type` and therefore overwrote that builtin for every later cell
# in the kernel. reindex() puts the results in the order of `types` and
# yields NaN for a type that does not occur in the data, which is what the
# mean of an empty selection gave before.
means = (
    word_freq.groupby('type', sort=False)['content_word_freq']
    .mean()
    .reindex(types)
    .tolist()
)

# plot data
fig, ax = plt.subplots()
plt.xticks(rotation='vertical')

ax.set_ylabel('mean')
ax.set_title('Unique words in articles by type')

ax.bar(types, means)

plt.show()


### Do fake news articles have fewer author names than reliable news articles?

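From the bar plot, it seems that 'reliable' news has more missing author values than 'fake' news, which is the opposite of our hypothesis. Note that the plot shows absolute counts, so a difference in the number of articles per label may influence this comparison.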

In [None]:
import matplotlib.pyplot as plt

# rows labelled 'fake', and how many of them have no author value
fake_data = raw_data[raw_data['type'] == 'fake']
fake_auth_isNull_sum = fake_data['authors'].isna().sum()

# rows labelled 'reliable', and how many of them have no author value
reliable_data = raw_data[raw_data['type'] == 'reliable']
reliable_auth_isNull_sum = reliable_data['authors'].isna().sum()

# Bar chart comparing the two counts.
# NOTE(review): these are raw counts, not shares of each label's articles --
# confirm whether normalising by class size is wanted.
labels = ['fake', 'reliable']
missing_counts = [fake_auth_isNull_sum, reliable_auth_isNull_sum]

fig, ax = plt.subplots()
ax.set(
    ylabel='missing author values',
    title='Missing author values: \'fake\' vs. \'reliable\' news',
)
ax.bar(labels, missing_counts)

plt.show()


### Correlation between word reduction rates in reliable vs. fake news?