# Clean AFT Dataset
**Author:** Jim Maddock

**Last Updated:** 5-3-20

**Description:** Import the initial English AFT dataset (downloaded from https://old.datahub.io/dataset/wikipedia-article-feedback-corpus) and create features for exploratory analysis and classifier.  The processed dataset should include the following **feature fields**:
* unique ID
* original ID (these are non-unique but could be useful for debugging)
* article and revision ID
* comment (might have to remove stop tokens)
* article topic from https://github.com/wikimedia/drafttopic
* article features - TBD
* vandalism score from https://github.com/wikimedia/draftquality

The dataset should also include the following **class fields**:
* helpful/un-helpful
* non-actionable

In [149]:
import csv
import pandas as pd
import numpy as np
import uuid
import random
import requests
import time
from oresapi import Session
import concurrent.futures
import logging
import sys

import matplotlib.pyplot as plt
import matplotlib

matplotlib.style.use('ggplot')

In [175]:
# import dataset from csv
FILEPATH = '/Users/klogg/research_data/aft/raw/dump_03-24-20.csv'

dtypes = {
    'aft_id':object
}

df = pd.read_csv(FILEPATH,escapechar='\\', encoding='latin-1', dtype=dtypes)

In [176]:
# remove all comments that do not have a helpful or unhelpful label
df = df.loc[(df['aft_helpful'] > 0) | (df['aft_unhelpful'] > 0)]

In [177]:
# generate a unique ID
# make sure to use a random seed so this is reproducable
rd = random.Random()
rd.seed(0)
df['UUID'] = df.apply(lambda x: uuid.UUID(int=rd.getrandbits(128)), axis=1)
df = df.reset_index()

In [192]:
# get vandalism score for each comment using draft quality
# PRE: comment text, not including stop words
# POST: return the positive and negative scores for the comment

BASE_URL = "https://ores.wikimedia.org/v3/scores/enwiki/57185234/draftquality"

def threadedVandalismScoreHandler(df, max_workers = 4):
    
    to_process = df[['aft_comment','UUID']].iterrows()
    score_list = []
    processed = 0
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:

        future_to_url = {executor.submit(getVandalismScore, row[1]): row for row in to_process}
        for future in concurrent.futures.as_completed(future_to_url):
            score_dict = future.result()
            score_list.append(score_dict)
            processed += 1
            
            if processed % 100 == 0:
                print('scored {0} documents'.format(processed))
    
    score_df = pd.DataFrame(score_list)
    return score_df

def getVandalismScore(row):
    try:
        score_json = vandalismScoreRequest(row['aft_comment'])
    except requests.RequestException as e:
        print(row['UUID'],row['aft_comment'])
        raise requests.RequestException()
    score_dict = jsonToDict(score_json)
    score_dict['UUID'] = row['UUID']
    return score_dict

def formatCommentText(comment_text):
    comment_text = str(comment_text)
    comment_text = bytes(comment_text, 'latin-1').decode('utf-8')
    comment_text = comment_text.replace('\n','')
    comment_text = comment_text.replace('\t','')
    comment_text = comment_text.replace('"','')
    comment_text = comment_text.replace("\'","")
    comment_text = comment_text.replace('"','')
    comment_text = comment_text.replace('”','')
    comment_text = '"{0}"'.format(comment_text)
    return comment_text
    

def vandalismScoreRequest(comment_text):
    comment_text = formatCommentText(comment_text)
    params = {
        'datasource.revision.text':comment_text
    }
    r = requests.get(BASE_URL,params=params)
    if not r:
        raise requests.RequestException()
    json_response = r.json()
    return json_response

def vandalismJsonToDict(json):
    base = json['enwiki']['scores']['57185234']['draftquality']['score']
    score_dict = {
        'dq_pred':base['prediction'],
        'dq_prob_OK':base['probability']['OK'],
        'dq_prob_attack':base['probability']['attack'],
        'dq_prob_spam':base['probability']['spam'],
        'dq_prob_vandalism':base['probability']['vandalism']
    }
    return score_dict

#threadedVandalismScoreHandler(test)

In [133]:
def getTopics(df):
    rev_id_list = df['aft_page_revision'].unique().tolist()
    
    session = Session("https://ores.wikimedia.org",user_agent="")
    results = my_session.score("enwiki", ["drafttopic"], rev_id_list)

    topic_list = []
    for rev_id, result in zip(rev_id_list, results):
        score_dict = {
            'aft_page_revision':rev_id
        }
        if 'error' in result['drafttopic']:
            score_dict['dt_pred'] = None
        elif len(result['drafttopic']['score']['prediction']) < 1:
            score_dict['dt_pred'] = None
        else:
            try:
                score_dict['dt_pred'] = result['drafttopic']['score']['prediction'][0],
            except:
                print(result)
                raise
        topic_list.append(score_dict)
    
    topic_df = pd.DataFrame(topic_list)
    return topic_df

getTopics(test)

Unnamed: 0,aft_page_revision,dt_pred
0,543382932,"(STEM.Biology,)"
1,546589577,"(Culture.Internet culture,)"
2,551246522,"(Culture.Food and drink,)"
3,551998802,"(Culture.Internet culture,)"
4,539625963,"(STEM.STEM*,)"
...,...,...
643,563914106,"(Culture.Media.Media*,)"
644,563981546,"(Geography.Regions.Asia.Asia*,)"
645,563886072,"(STEM.Medicine & Health,)"
646,564048829,"(Culture.Internet culture,)"


In [125]:
def addPredColumns(df):
    vandalism_pred_df = threadedVandalismScoreHandler(df, workers = 8)
    topic_pred_df = getTopics(df)

    df = df.merge(vandalism_pred_df, on='UUID')
    df = df.merge(topic_pred_df, on='aft_page_revision', how='outer')
    return df

In [134]:
test = df.head(1000)
test = test.reset_index()

addPredColumns(test)

scored 100 documents
scored 200 documents
scored 300 documents
scored 400 documents
scored 500 documents
scored 600 documents
scored 700 documents
scored 800 documents
scored 900 documents
scored 1000 documents


Unnamed: 0,level_0,index,aft_id,aft_page,aft_page_revision,aft_user,aft_user_text,aft_user_token,aft_form,aft_cta,...,aft_relevance_score,aft_discuss,aft_claimed_user,UUID,dq_pred,dq_prob_OK,dq_prob_attack,dq_prob_spam,dq_prob_vandalism,dt_pred
0,0,1,04f8e0fdffaf1e9b25a890b11c27a364,3235587,543382932,0,216.38.130.162,x6riNeCDobHCPn2XXciJq7x4xvA0KmpA,6,4,...,53,N,0,e3e70682-c209-4cac-629f-6fbed82c07cd,vandalism,0.035010,0.248067,0.026091,0.690832,"(STEM.Biology,)"
1,1,2,04f8e1029f2870edec75842b2b77e916,3235587,543382932,0,216.38.130.162,x6riNeCDobHCPn2XXciJq7x4xvA0KmpA,6,4,...,52,N,0,f728b4fa-4248-5e3a-0a5d-2f346baa9455,spam,0.224409,0.110223,0.362515,0.302853,"(STEM.Biology,)"
2,2,14,04f8e5d1942ae88ed84e842b2b7828a7,63948,546589577,0,207.118.89.254,njG9PLr6PByQP8W9a1H6DnHWALdZfFFG,6,4,...,-1,N,0,eb1167b3-67a9-c378-7c65-c1e582e2e662,vandalism,0.065524,0.214019,0.157396,0.563061,"(Culture.Internet culture,)"
3,3,15,04f8e61b93956f3747a890b11c28d448,63948,546589577,0,69.231.49.118,6Kroi6eGf3M2Gw3cXMjfZp7tuADVUjr8,6,4,...,-6,N,0,f7c1bd87-4da5-e709-d471-3d60c8a70639,attack,0.122656,0.427280,0.059457,0.390608,"(Culture.Internet culture,)"
4,4,17,04f8e8fa6f0bb5fc8633842b2b7723d4,63948,546589577,0,98.237.129.51,jV7qkQyXlHlZKf5j3dMnIzzkgu73rqyN,6,4,...,-3,N,0,e443df78-9558-867f-5ba9-1faf7a024204,vandalism,0.125008,0.237430,0.043884,0.593678,"(Culture.Internet culture,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,14955,04ff4f920f8255c90be3842b2b77d26b,352699,563914106,0,70.122.244.246,GrRYUEo72Q9MBm0hOGSyNv6qaWk6gRGS,6,4,...,49,N,0,dbb59fc2-5cff-c96e-bd03-c81f7ff35029,vandalism,0.179491,0.165535,0.078210,0.576764,"(Culture.Media.Media*,)"
996,996,14970,04ff51127e6c7a9cc2e8842b2b7827f8,6356686,563981546,0,103.245.14.196,Vhl3ip0EwOKJN0TSY9MeDYpyKQ6ZqA2v,6,4,...,51,N,0,6e935faf-9502-e1d5-167a-d0895acecdbf,vandalism,0.046297,0.249662,0.034274,0.669767,"(Geography.Regions.Asia.Asia*,)"
997,997,15096,04ff5e2c0408f4a3d2e090b11c2fb49a,21868967,563886072,0,72.253.225.48,154vhLBMpUBUnX8wB6rJHJAcvNCH3pEC,6,4,...,-4,N,0,0654f465-b311-d9c4-1d86-fd12c8359b91,spam,0.053723,0.027930,0.786246,0.132101,"(STEM.Medicine & Health,)"
998,998,15107,04ff5f494e402a992078782bcb08708f,31795249,564048829,0,113.203.145.73,SUK68YZ6vIvPYsLhshvHnKqjMK9HOh25,6,4,...,-3,N,0,98082cf3-03a7-69aa-2c14-af79c4a0c1cd,vandalism,0.301423,0.160383,0.100973,0.437220,"(Culture.Internet culture,)"


In [203]:
FILE_OUTPUT_PATH = '/Users/klogg/research_data/aft/processed/chunks/chunk_{0}_5-6-20.csv'

def chunkAndProcess(df, fileout = None, chunk_size = 1000):
    df_list = [df[i:i+1000] for i in range(0,df.shape[0],1000)]
    for i, df_chunk in enumerate(df_list):
        try:
            result_chunk = addPredColumns(df_chunk)
            if fileout:
                result_chunk.to_csv(fileout.format(i), index = False)
        except:
            print('error in chunck {0}'.format(i))
            print(sys.exc_info())
    
test = df[1000:2000]
chunkAndProcess(test, fileout = FILE_OUTPUT_PATH)

scored 100 documents
scored 200 documents
135981c7-b984-dbe3-aef8-2d2e51f83e44 Ambassador Christopher Stevens was in Benghazi on Sept. 11, 2012, the day he died in a terrorist attack, because Secretary of State Hillary Clinton ordered him there.  Hillary Clinton had given Stevens direct instructions to prepare the CIA compound in Benghazi to be upgraded to the status of a U.S. diplomatic mission and Stevens, in complying with Clintonâs wishes, was in Benghazi the first time he had the opportunity to do so, cognizant of the need to visit the site before the end of the fiscal year, on Sept. 30, 2012.
 Barack Obama campaigned partly on his foreign policy successes and he did this by making the claim that al Qaeda was on the run and fractured. The Benghazi attack flew in the face of that claim so the Obama regime had to downplay the attack in order for Barack Obama to maintain his credibility, thus the cover-up in order to help the president win reelection.  And that is the best case sce

In [135]:
len(df)

114984

In [202]:
text = df.loc[df['UUID'] == uuid.UUID('135981c7-b984-dbe3-aef8-2d2e51f83e44')]['aft_comment'].tolist()[0]
text = formatCommentText(text)
text[1738]

'“'