# Generating data for WikiHoaxes project

In [284]:
import requests
from datetime import datetime, date
import csv
import pandas as pd
from ast import literal_eval
import time
import pickle
import wikitextparser as wtp
import regex as re
from statistics import median
from scipy.stats import median_abs_deviation as MAD

# Endpoint of the English Wikipedia MediaWiki Action API; every request in this notebook is sent to it.
URL = "https://en.wikipedia.org/w/api.php" # API endpoint
# A single requests session shared by all API calls below.
s = requests.Session()

## Fetching the list of hoaxes and their creation dates
The following uses Wikipedia API (https://www.mediawiki.org/wiki/API:Main_page) to retrieve the titles of the hoax pages and their creation dates. The pages reside in namespace 4 under the prefix "List of hoaxes on Wikipedia".
The steps to perform this task are as follows:
1. consult mediawiki api (wikipedia endpoint) to extract the list of hoaxes from page namespace 4
2. parse the returned json output to extract the titles of each hoax
3. consult the revisions property to fetch the timestamp of the first revision to each hoax page (creation date)
4. format the timestamp to be in the form of %Y-%m-%d
5. save the hoaxes + creation date in data/hoaxes

In [71]:
print("Performing a prefix search for \"List of hoaxes on Wikipedia\"...")

# Parameters for the `prefixsearch` list module of the MediaWiki Action API.
# Only a single request is made, so at most one batch of results ("pslimit": "max") is retrieved.
prms = {
    "action": "query",
    "format": "json",
    "list": "prefixsearch",
    "pssearch": "List of hoaxes on Wikipedia",
    "pslimit": "max",
    "psnamespace": 4 # list of hoaxes residing in namespace 4
}

response = s.get(url=URL, params=prms)
# Fail loudly on HTTP-level problems instead of printing "Success" unconditionally.
response.raise_for_status()
data = response.json()

# The API reports request-level problems inside the JSON body under an "error" key.
if "error" in data:
    raise RuntimeError("MediaWiki API error: {}".format(data["error"]))

print("Success")

Performing a prefix search for "List of hoaxes on Wikipedia"...
Success


In [72]:
# Collect the title of every page returned by the prefix search, in result order.
hoax_titles = [entry['title'] for entry in data['query']['prefixsearch']]

# The first returned title is expected to be the page holding the list itself
# (e.g. 'Wikipedia:List of hoaxes on Wikipedia'), not a hoax, so it is dropped.
del hoax_titles[0]

print('Retrieved a total of', len(hoax_titles), 'hoax pages' )

Retrieved a total of 197 hoax pages


In [73]:
print("Performing an action query to get the creation dates of the hoaxes...")

creation_dates = []
for hoax in hoax_titles:
    # Request exactly one revision, oldest first ('rvdir': 'newer'),
    # and only its timestamp: this is the page's first revision.
    prms = dict(
        action='query',
        format='json',
        prop='revisions',
        titles=hoax,
        rvprop='timestamp',
        rvlimit='1',
        rvdir='newer',
    )

    data = s.get(url=URL, params=prms).json()

    # A single title was requested, so the first entry of the `pages` mapping is used.
    pages = list(data['query']['pages'].values())
    first_revision = pages[0]['revisions'][0]
    creation_dates.append(first_revision['timestamp'])

print('# of creation dates successfully retrieved:', len(creation_dates), '/', len(hoax_titles))

Performing an action query to get the creation dates of the hoaxes...
# of creation dates successfully retrieved: 197 / 197


In [74]:
print("Formatting the timestamps to Y-M-D...")

# Convert every timestamp string (e.g. '2005-11-09T12:34:56Z') into a datetime.date,
# keeping only the part before the 'T'. The list is updated in place.
# No variable named `date` is bound here, so the `date` class imported from
# datetime at the top of the notebook is not shadowed.
# Items that are no longer strings are left untouched, which makes the cell safe to re-run.
creation_dates[:] = [
    datetime.strptime(stamp.split('T')[0], "%Y-%m-%d").date()
    if isinstance(stamp, str) else stamp
    for stamp in creation_dates
]

Formatting the timestamps to Y-M-D...


In [91]:
print("Creating hoaxes_creation_dates.csv...")

# zip() stops at the shorter input, so a length mismatch would silently drop hoaxes.
assert len(hoax_titles) == len(creation_dates), "every hoax title needs a creation date"

columns = ['hoax title', 'creation date']
rows = [[title, created] for title, created in zip(hoax_titles, creation_dates)]

# newline='' is what the csv module asks for when writing (otherwise blank lines
# appear between rows on Windows); the explicit UTF-8 encoding keeps titles with
# non-ASCII characters portable across platforms.
with open('data/hoaxes/hoaxes_creation_dates.csv', 'w', newline='', encoding='utf-8') as file:
    wr = csv.writer(file)
    wr.writerow(columns)
    wr.writerows(rows)

print("Success")

Creating hoaxes_creation_dates.csv...
Success


## Extracting WikiText (markup) for hoaxes and their cohorts
The following section shows how the markup for each hoax page, as well as for every member of its respective cohort, is extracted. The steps taken to accomplish this task are:
1. Reading the list of hoaxes and their respective cohort members' ids
2. using the revisions property to extract the markup content for hoaxes & cohorts
3. saving hoaxes and non-hoaxes (cohort) markups in two dicts
4. the dict for hoaxes is structured as (key:hoax, value:markup)
5. the dict for cohort is structured as (key:respective hoax, values:list of markups for each id (non-hoax) in the cohort)
6. saving the dicts as pickle files in data/wiki-markup

In [44]:
# Load the table that maps every hoax to the page ids of its cohort members.
print("Reading data/cohorts/hoaxes_cohorts.csv...")
cohorts_df = (
    pd.read_csv('data/cohorts/hoaxes_cohorts.csv')
      .rename(columns={"Unnamed: 0": "idx"})
)

# Display only: set_index returns a new frame that is not assigned,
# so cohorts_df itself keeps its default positional index.
cohorts_df.set_index('idx')

Reading data/cohorts/hoaxes_cohorts.csv...


Unnamed: 0_level_0,hoax title,creation date,cohort
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Wikipedia:List of hoaxes on Wikipedia/Mustelodon,2005-11-09,"[3112909, 3112911, 3112930, 3112935, 3112940, ..."
1,Wikipedia:List of hoaxes on Wikipedia/The Gate...,2006-02-08,"[3996199, 3996212, 3996214, 3996238, 3996244, ..."
2,Wikipedia:List of hoaxes on Wikipedia/Bicholim...,2007-07-04,"[12077351, 12077353, 12077366, 12077372, 12077..."
3,Wikipedia:List of hoaxes on Wikipedia/Yuri Gad...,2009-08-03,"[23841113, 23841125, 23841144, 23841152, 23841..."
4,Wikipedia:List of hoaxes on Wikipedia/Buchares...,2009-06-16,"[23234860, 23234871, 23234875, 23234877, 23234..."
...,...,...,...
192,Wikipedia:List of hoaxes on Wikipedia/Be A Sta...,2017-01-21,"[52935880, 52935906, 52935911, 52935915, 52935..."
193,Wikipedia:List of hoaxes on Wikipedia/Rothmanhaus,2017-10-02,"[55410695, 55410707, 55410720, 55410727, 55410..."
194,Wikipedia:List of hoaxes on Wikipedia/Malkiel ...,2006-03-12,"[4359632, 4359638, 16783242, 4359651, 4359653,..."
195,Wikipedia:List of hoaxes on Wikipedia/Azeem Azam,2017-05-10,"[54006553, 54006557, 54006608, 54006659, 54006..."


In [46]:
# Fetch the wiki-markup of every hoax page (one request per title).
hoaxes_markup = {}
for hoax in cohorts_df['hoax title']:
    PARAMS = dict(
        action='query',
        format='json',
        prop='revisions',
        titles=hoax,
        rvslots='*',
        rvprop='content',
    )

    r = s.get(url=URL, params=PARAMS)
    data = r.json()

    # One title was requested, so the first page entry is used; its first
    # revision carries the markup in the 'main' slot under the '*' key.
    page_entry = list(data['query']['pages'].values())[0]
    hoaxes_markup[hoax] = page_entry['revisions'][0]['slots']['main']['*']
    print("Markup extracted for:", hoax)

print("Total of", len(hoaxes_markup), "pages extracted")

Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Mustelodon
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/The Gates of Saturn
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Bicholim conflict
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Yuri Gadyukin
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Bucharest Film Festival
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Gaius Flavius Antoninus
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Slow Blind Driveway
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Milk Studios
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Sheer Perfection
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Jack Robichaux
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Spanish tickler
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Upper Peninsula War
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Bine (my

Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Monvilla
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Sean MacLeod
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Matthew Lyons
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/The Bierrum Effect
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Anaxiphales
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Argusto Emfazie
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Functional temporalism
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/John Seigenthaler Sr.
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Jason Donoghue
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Chad Berryman
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Baldock Beer Disaster
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Bont
Markup extracted for: Wikipedia:List of hoaxes on Wikipedia/Oliver Bayley
Markup extracted for: 

In [307]:
# extracting the wiki-markup for the nonhoaxes in each cohort
nonhoaxes_markup = {}
for hoax, cohort_repr in zip(cohorts_df['hoax title'], cohorts_df['cohort']):
    # the cohort column stores a Python list literal of page ids as text
    cohort = literal_eval(cohort_repr)

    print("Extracting markup for", hoax ,"...")
    nonhoaxes_markup[hoax] = []

    # making request for 50 pages at a time; `start` is a dedicated name so the
    # batch offset never clobbers the variable used by the outer loop
    for start in range(0, len(cohort), 50):
        # converting ids to strings separated by |
        page_ids = '|'.join(str(page_id) for page_id in cohort[start:start + 50])

        PARAMS= {
                    'action': 'query',
                    'format': 'json',
                    'prop': 'revisions',
                    'pageids': page_ids,
                    'rvslots': '*' ,
                    'rvprop': 'content',
                }

        # A response may carry a 'continue' block when not all requested content
        # was returned; the request is repeated with those values until it is gone,
        # so pages cut from a response are not mistaken for missing pages.
        continuation = {}
        while True:
            r = s.get(url=URL, params={**PARAMS, **continuation})
            time.sleep(0.05) # limiting the # of requests to about 20 per second

            data = r.json()
            if 'error' in data:
                raise RuntimeError("MediaWiki API error: {}".format(data['error']))

            for output in data.get('query', {}).get('pages', {}).values():
                try:
                    nonhoaxes_markup[hoax].append(output['revisions'][0]['slots']['main']['*'])
                except KeyError:
                    # no markup in this entry (e.g. page deleted since the cohort was
                    # built); the shortfall is visible in the summary printed below
                    continue

            if 'continue' not in data:
                break
            continuation = data['continue']

    print("Extracted contents of",\
          len(nonhoaxes_markup[hoax]), "/", len(cohort), \
          "cohort pages for", hoax)
    print("*"*50)

Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Mustelodon ...
Extracted contents of 1462 / 1464 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Mustelodon
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/The Gates of Saturn ...
Extracted contents of 1455 / 1456 cohort pages for Wikipedia:List of hoaxes on Wikipedia/The Gates of Saturn
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Bicholim conflict ...
Extracted contents of 1329 / 1329 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Bicholim conflict
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Yuri Gadyukin ...
Extracted contents of 1245 / 1245 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Yuri Gadyukin
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedi

Extracted contents of 1381 / 1382 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Saint Ofelia
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/The Siccness Network ...
Extracted contents of 824 / 825 cohort pages for Wikipedia:List of hoaxes on Wikipedia/The Siccness Network
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Bryan J. Baldelli ...
Extracted contents of 1196 / 1196 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Bryan J. Baldelli
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Jûtien-Gustave DuRoi ...
Extracted contents of 1712 / 1712 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Jûtien-Gustave DuRoi
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/George K. Broomhall ...
Extracted contents of 1513 / 1513

Extracted contents of 771 / 771 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Another Demonstration of the Cliff-Guibert Fire Hose Reel, Showing a Young Girl Coming from an Office, Detaching Hose, Running with It 60 Feet, and Playing a Stream, All Inside of 30 Seconds
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Maria Portaro ...
Extracted contents of 1278 / 1279 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Maria Portaro
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Nuremberg Plate ...
Extracted contents of 3812 / 3816 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Nuremberg Plate
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Shai Bernstein ...
Extracted contents of 2318 / 2318 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Shai Bernstein
********************

Extracted contents of 1423 / 1423 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Ingrid Vakaslavik
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Hey Everybody ...
Extracted contents of 1804 / 1821 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Hey Everybody
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Monvilla ...
Extracted contents of 1723 / 1724 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Monvilla
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Sean MacLeod ...
Extracted contents of 1807 / 1807 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Sean MacLeod
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Matthew Lyons ...
Extracted contents of 1235 / 1235 cohort pages for Wikipedia:List of hoaxes on W

Extracted contents of 1666 / 1666 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Helen Anne Petrie
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Reede-drum ...
Extracted contents of 1428 / 1429 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Reede-drum
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Vitus Barbaro ...
Extracted contents of 1566 / 1566 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Vitus Barbaro
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Franz Josef Weern ...
Extracted contents of 1329 / 1331 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Franz Josef Weern
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Trundu ...
Extracted contents of 1460 / 1460 cohort pages for Wikipedia:List of hoax

Extracted contents of 1441 / 1442 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Martin Coleman (American football)
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Bunaka ...
Extracted contents of 1489 / 1489 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Bunaka
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Reich Corps of the Trombone ...
Extracted contents of 1460 / 1462 cohort pages for Wikipedia:List of hoaxes on Wikipedia/Reich Corps of the Trombone
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/George Colby ...
Extracted contents of 790 / 790 cohort pages for Wikipedia:List of hoaxes on Wikipedia/George Colby
**************************************************
Extracting markup for Wikipedia:List of hoaxes on Wikipedia/Ruy Lopez, Marshall Attack, Rombaua Trap ...
Extracted cont

In [309]:
print('Saving hoaxes & non-hoaxes markup data...')

# (destination file, object to serialise) pairs, written in this order
markup_dumps = (
    ('data/wiki-markup/hoaxes_markup.pickle', hoaxes_markup),
    ('data/wiki-markup/nonhoaxes_markup.pickle', nonhoaxes_markup),
)
for target_path, payload in markup_dumps:
    with open(target_path, 'wb') as handle:
        pickle.dump(payload, handle)

print('Saved markup data in directory data/wiki-markup')

Saving hoaxes & non-hoaxes markup data...
Saved markup data in directory data/wiki-markup


## Calculating the appearance features
The following section shows how to calculate the appearance features for the hoaxes and the non-hoaxes in their respective cohorts. The appearance features are:
1. plain text length (word count after stripping article from its markup)
2. plain text to markup ratio (word count after markup removal divided by word count before markup removal)
3. wiki-link density (number of wiki-links within a page per 100 words (counted before stripping markup))
4. external-link density (number of external (http) links within a page per 100 words (counted before stripping markup))

wikitextparser was used to parse the content and extract the links. For more information, consult https://github.com/5j9/wikitextparser.

In [300]:
# dicts to keep track of the appearance features' values for hoaxes and non-hoaxes
hoaxes_features = {}
cohort_plain = {}
cohort_pln2markup = {}
cohort_wlinkden = {}
cohort_extlinkden = {}


def _appearance_features(markup):
    """Compute the four appearance features of one page from its wiki-markup.

    Returns a dict with the keys 'plain' (word count after markup removal),
    'plain_to_markup' (that count divided by the word count of the raw markup),
    'wikilink_density' and 'extlink_density' (links per 100 raw-markup words).

    Raises ValueError when the markup contains no word tokens, because every
    ratio would divide by zero. An AttributeError raised by wikitextparser
    while removing markup is propagated to the caller.
    """
    parsed = wtp.parse(markup)

    count_markup = len(re.findall(r'\w+', markup))
    if count_markup == 0:
        raise ValueError('markup contains no word tokens')

    plain_length = len(re.findall(r'\w+', wtp.remove_markup(markup)))

    return {'plain': plain_length,
            'plain_to_markup': plain_length / count_markup,
            'wikilink_density': (len(parsed.wikilinks) / count_markup)*100,
            'extlink_density': (len(parsed.external_links) / count_markup)*100}


for key, value in hoaxes_markup.items():

    print('Calculating the appearance features for', key,'...')
    # a hoax whose features cannot be computed is a data problem, so errors propagate here
    hoaxes_features[key] = _appearance_features(value)

    cohort_plain[key] = []
    cohort_pln2markup[key] = []
    cohort_wlinkden[key] = []
    cohort_extlinkden[key] = []
    print('Calculating the appearance features for its cohort...')
#     iterating through each cohort member's markup and performing same calculations
    for cohort_markup in nonhoaxes_markup[key]:
#         few cohort members raise a 'NoneType' object has no attribute 'span' ...
#         ... when calling remove_markup() or plain_text(); pages without any word ...
#         ... token cannot be given ratios either. Both kinds are skipped, and the ...
#         ... number of skipped pages shows in the summary printed below.
        try:
            features = _appearance_features(cohort_markup)
        except (AttributeError, ValueError):
            continue

        cohort_plain[key].append(features['plain'])
        cohort_pln2markup[key].append(features['plain_to_markup'])
        cohort_wlinkden[key].append(features['wikilink_density'])
        cohort_extlinkden[key].append(features['extlink_density'])

    print('Finished with ', len(cohort_plain[key]), '/', len(nonhoaxes_markup[key]), 'features calculated')
    print('*'*50)

Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Mustelodon ...
Calculating the appearance features for its cohort...
Finished with  1462 / 1462 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/The Gates of Saturn ...
Calculating the appearance features for its cohort...
Finished with  1455 / 1455 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Bicholim conflict ...
Calculating the appearance features for its cohort...
Finished with  1329 / 1329 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Yuri Gadyukin ...
Calculating the appearance features for its cohort...
Finished with  1245 / 1245 features calculated
**************************************************
C

Finished with  1515 / 1515 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Vanda Varvara ...
Calculating the appearance features for its cohort...
Finished with  811 / 811 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Saint Ofelia ...
Calculating the appearance features for its cohort...
Finished with  1381 / 1381 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/The Siccness Network ...
Calculating the appearance features for its cohort...
Finished with  824 / 824 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Bryan J. Baldelli ...
Calculating the appearance features for its cohort...
Fi

Finished with  789 / 789 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/E'tedalion Party ...
Calculating the appearance features for its cohort...
Finished with  1086 / 1086 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/PH Games ...
Calculating the appearance features for its cohort...
Finished with  1363 / 1363 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Another Demonstration of the Cliff-Guibert Fire Hose Reel, Showing a Young Girl Coming from an Office, Detaching Hose, Running with It 60 Feet, and Playing a Stream, All Inside of 30 Seconds ...
Calculating the appearance features for its cohort...
Finished with  771 / 771 features calculated
**************************************

Finished with  852 / 852 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Arekh ...
Calculating the appearance features for its cohort...
Finished with  2018 / 2018 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/The Deadweights ...
Calculating the appearance features for its cohort...
Finished with  1395 / 1395 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/James Geiss ...
Calculating the appearance features for its cohort...
Finished with  1619 / 1619 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Shantal Méndez ...
Calculating the appearance features for its cohort...
Finished with  13

Finished with  1438 / 1438 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Tim Verfaillie ...
Calculating the appearance features for its cohort...
Finished with  1590 / 1590 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Joseph Warshaw ...
Calculating the appearance features for its cohort...
Finished with  2008 / 2008 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Långrocken ...
Calculating the appearance features for its cohort...
Finished with  1240 / 1240 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Teddy Dressing ...
Calculating the appearance features for its cohort...
Finished

Finished with  1256 / 1256 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Upton H. Pennyworth ...
Calculating the appearance features for its cohort...
Finished with  1648 / 1648 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/EuroNation 1 London ...
Calculating the appearance features for its cohort...
Finished with  900 / 900 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Eric Radloff ...
Calculating the appearance features for its cohort...
Finished with  855 / 855 features calculated
**************************************************
Calculating the appearance features for Wikipedia:List of hoaxes on Wikipedia/Emerson LaSalle ...
Calculating the appearance features for its cohort...

In [301]:
print('Saving hoaxes & non-hoaxes appearance features information...')

# (destination file, per-cohort feature dict) pairs, written in this order.
# NOTE(review): only the cohort dicts are written here; hoaxes_features is not
# saved by this cell - confirm that is intended.
feature_dumps = (
    ('data/appearance_features/nonhoaxes/cohort_plain.pickle', cohort_plain),
    ('data/appearance_features/nonhoaxes/cohort_pln2markup.pickle', cohort_pln2markup),
    ('data/appearance_features/nonhoaxes/cohort_wlinkden.pickle', cohort_wlinkden),
    ('data/appearance_features/nonhoaxes/cohort_extlinkden.pickle', cohort_extlinkden),
)
for target_path, payload in feature_dumps:
    with open(target_path, 'wb') as handle:
        pickle.dump(payload, handle)

print('Saved features data in directory data/appearance_features/nonhoaxes')

Saving hoaxes & non-hoaxes appearance features information...
Saved features data in directory data/appearance_features/nonhoaxes


## Calculating the zscores
The following section calculates the zscores of each of the four appearance features. The modified z-score is used instead of the regular one due to the presence of outliers. This is accomplished through:
1. calculating the median of the cohort values for the corresponding feature
2. calculating the MAD (median absolute deviation) for each cohort and for each feature
3. using the modified z-score equation ((x – x̃) / MAD) with x representing the feature value for the hoax and x̃ being median of the corresponding features for the respective cohort

In [303]:
z_scores = {}

# Pairs each hoax feature name with the dict holding the cohort values of that
# feature (keyed by hoax title). Insertion order fixes the key order of the result.
cohort_values_by_feature = {
    'plain': cohort_plain,
    'plain_to_markup': cohort_pln2markup,
    'wikilink_density': cohort_wlinkden,
    'extlink_density': cohort_extlinkden,
}

for hoax, hx_features in hoaxes_features.items():
#     mod z-score = ((x – x̃) / MAD), with x the hoax's value and x̃ / MAD the
#     median and median absolute deviation of the same feature over its cohort
#     NOTE(review): a cohort whose MAD is 0 presumably yields inf/nan here - confirm
#     that whatever consumes the z-scores copes with that
    z_scores[hoax] = {
        feature: (hx_features[feature] - median(per_hoax[hoax])) / MAD(per_hoax[hoax])
        for feature, per_hoax in cohort_values_by_feature.items()
    }

In [304]:
print('Saving the z-scores of all hoaxes...')

with open('data/appearance_features/zscores.pickle', 'wb') as handle:
    pickle.dump(z_scores, handle)
    
print('Saved z-scores in directory data/appearance_features')

Saving the z-scores of all hoaxes...
Saved z-scores in directory data/appearance_features
