In [15]:
import csv
import functools

from revscoring.features import wikitext
from revscoring.datasources.meta import mappers, vectorizers
from revscoring.datasources import revision_oriented
from revscoring.dependencies import solve
from revscoring.features.meta import aggregators

In [20]:
def load_vectorizer(enwiki_kvs_path):
    """Build the mean word-vector feature for a revision's text.

    The keyed vectors at ``enwiki_kvs_path`` are loaded with ``mmap="r"`` and
    bound to ``vectorizers.word2vec.vectorize_words``. That bound function is
    applied to the lower-cased revision words, and the resulting vectors are
    reduced with ``aggregators.mean(..., vector=True)``.

    Args:
        enwiki_kvs_path: Path passed to ``load_gensim_kv``.

    Returns:
        The ``aggregators.mean`` feature named
        ``"revision.text.en_vectors_mean"``.
    """
    keyed_vectors = vectorizers.word2vec.load_gensim_kv(path=enwiki_kvs_path, mmap="r")

    # Input datasource: the revision's words, lower-cased.
    lowered_words = mappers.lower_case(wikitext.revision.datasources.words)

    # Pre-bind the loaded vectors so the datasource only has to supply the words.
    words_to_vectors = functools.partial(
        vectorizers.word2vec.vectorize_words,
        keyed_vectors,
    )

    word_vectors = vectorizers.word2vec(
        lowered_words,
        words_to_vectors,
        name="revision.text.en_vectors",
    )

    return aggregators.mean(
        word_vectors,
        vector=True,
        name="revision.text.en_vectors_mean",
    )

# Mean word-vector feature built from the enwiki keyed-vector file (path is
# relative to the notebook's working directory).
vectorizer = load_vectorizer('../word2vec/enwiki-20200501-learned_vectors.50_cell.10k.kv')
# Lower-cased word datasource, constructed with the same expression that
# load_vectorizer uses for its input. process() solves it first and then stores
# the result in the cache under this object.
# NOTE(review): presumably the cache entry is matched to the datasource inside
# `vectorizer` so tokenization is not repeated — confirm against revscoring.
tokenizer = mappers.lower_case(wikitext.revision.datasources.words)

In [21]:
# Location of the raw AFT CSV dump consumed by process().
# NOTE(review): absolute, user-specific path — the notebook will not run on
# another machine until this points at a local copy of the dump.
FILEPATH = '/Users/klogg/research_data/aft/raw/dump_03-24-20.csv'

def process(filepath, save_tokens=False, debug=False):
    """Yield one feature-annotated observation per data row of an AFT CSV dump.

    The first row of the file is treated as the header. For every following
    row, the cells whose header name is one of the ``aft_*`` columns listed
    below are copied into a dict, the ``aft_comment`` text is tokenized with
    the module-level ``tokenizer``, and the module-level ``vectorizer`` is
    solved to produce ``feature_vector``.

    Args:
        filepath: Path of the CSV file (comma-delimited, backslash escapes).
        save_tokens: When true, also store the token list under
            ``'tokenized_text'``.
        debug: When truthy, stop after that many data rows have been yielded
            (``True`` therefore stops after the first row).

    Yields:
        dict: The extracted columns plus ``'feature_vector'`` (and optionally
        ``'tokenized_text'``). An empty file yields nothing.

    Raises:
        KeyError: If a row provides no ``aft_comment`` cell (column missing
            from the header, or the row is too short to reach it).
    """
    # A set gives constant-time membership checks for every cell.
    cols_to_extract = {
        'aft_id',
        'aft_page',
        'aft_page_revision',
        'aft_user',
        'aft_user_text',
        'aft_comment',
        'aft_noaction',
        'aft_inappropriate',
        'aft_helpful',
        'aft_unhelpful',
    }

    # newline='' hands newline handling to the csv module, which it needs in
    # order to parse newlines embedded in quoted fields correctly.
    with open(filepath, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', escapechar='\\')

        header = next(csvreader, None)
        if header is None:
            # Empty file: no header, so nothing to yield.
            return

        # Count data rows from 1 so `debug=N` stops after N observations.
        for i, row in enumerate(csvreader, start=1):
            # zip() pairs cells with header names and ignores surplus cells,
            # so a row longer than the header no longer raises IndexError.
            observation = {
                column: cell
                for column, cell in zip(header, row)
                if column in cols_to_extract
            }

            cache = {revision_oriented.revision.text: observation['aft_comment']}
            tokenized_text = solve(tokenizer, cache=cache, context=None)

            # Put the tokens in the cache before solving the vectorizer.
            cache[tokenizer] = tokenized_text
            observation['feature_vector'] = solve(vectorizer, cache=cache, context=None)

            if save_tokens:
                observation['tokenized_text'] = tokenized_text

            yield observation

            if debug and i == debug:
                break
                
# Run the whole dump through process() and print each observation dict.
# NOTE(review): this prints one line per data row, each including the full
# feature vector; process(FILEPATH, debug=N) stops after N rows if only a
# sample is needed.
for comment in process(FILEPATH):
    print(comment)

{'aft_id': '04f8e0e598742a992078782bcb08708f', 'aft_page': '3235587', 'aft_page_revision': '543382932', 'aft_user': '16211526', 'aft_user_text': 'Aft5hide', 'aft_comment': 'This is a test', 'aft_noaction': '1', 'aft_inappropriate': '0', 'aft_helpful': '0', 'aft_unhelpful': '0', 'feature_vector': [-0.1532449573278427, -0.4001066982746124, 0.4541527032852173, 0.13911527395248413, -0.0364912673830986, 0.7478120923042297, -0.05662533640861511, 0.39939382672309875, 0.893058717250824, 0.2397097498178482, -0.13020268082618713, 0.4093013405799866, 0.2659134864807129, 0.09960272908210754, -0.02469797059893608, -0.172836571931839, 0.26157525181770325, -0.18799465894699097, 0.4572039246559143, -0.3244284391403198, 0.8059065341949463, -0.7381508946418762, -0.48238784074783325, -0.4806919991970062, -0.6197969913482666, 0.09806283563375473, -0.12558457255363464, 0.39431339502334595, -0.1987651288509369, 0.44880327582359314, -0.45737501978874207, 0.5243905186653137, 0.019599515944719315, -0.464253067

In [23]:
# Scratch check: list.index returns the position of the first matching element.
['s','a'].index('a')

1

In [24]:
# Scratch check: an empty string is falsy, so `not s` evaluates to True.
s = ""
not s

True