In [1]:
from __future__ import print_function
from collections import defaultdict
import numpy as np
import cPickle as pickle
import time
import os
import csv
import re
from itertools import groupby
import nltk
from dateutil import parser as dateparser

# Directory holding the Stack Overflow data set, resolved under the home
# directory of whichever user runs the notebook.
data_path = os.path.join(
    os.path.expanduser("~"),
    'data/stack_overflow',
)

In [2]:
def read_file(f):
    """Lazily yield the lines of the file at path ``f``.

    Each line is yielded exactly as iterating the file object produces it,
    i.e. with its trailing newline when present. Nothing is opened until the
    first item is requested.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as the generator is exhausted or explicitly closed, rather than
    whenever the interpreter happens to reclaim it.
    """
    with open(f) as handle:
        for line in handle:
            yield line

In [3]:
# Pre-compiled regular expressions used by the feature extraction code.
RE_NONALNUM = re.compile(r'\W+')      # runs of non-word characters
RE_NONANS = re.compile(r'[^\w\s]+')   # runs that are neither word chars nor whitespace
RE_DIGIT = re.compile(r'\d+')         # runs of digits
RE_URL = re.compile(r'https?://')     # "http://" or "https://" prefix
RE_NONWORD = re.compile(r'[A-Z\d]+')  # runs of capitals A-Z and/or digits

# Status strings in label order: a status at position i gets label i + 1,
# so the known labels are 1..5 and 0 stays free for "undefined".
all_status = [
    'not a real question',  # 1
    'not constructive',     # 2
    'off topic',            # 3
    'open',                 # 4
    'too localized',        # 5
]
status_map_label = {status: label
                    for label, status in enumerate(all_status, 1)}


def gen_datum_feature_dict(datum):
    """Compute the hand-crafted features for a single post record.

    Parameters
    ----------
    datum : mapping
        One record of the data set. The keys read here are ``BodyMarkdown``,
        ``Title``, ``Tag1`` .. ``Tag5``, ``PostCreationDate``,
        ``OwnerCreationDate``, ``ReputationAtPostCreation`` and
        ``OwnerUndeletedAnswerCountAtPostTime``.

    Returns
    -------
    collections.defaultdict
        A ``defaultdict(dict)`` with five feature groups:

        * ``'num'``   - counts (sentences, sentence endings, URLs, digit
          runs, code/text blocks, lines, tags, ...).
        * ``'len'``   - character lengths of the title and of the code/text
          blocks.
        * ``'ratio'`` - quotients of the above; always floats, ``0.0`` when
          the denominator is zero.
        * ``'mean'``  - mean character length of code blocks, text blocks
          and sentences (``0`` when there are none).
        * ``'user'``  - account age in seconds at post time, reputation and
          undeleted answer count.

    Raises
    ------
    KeyError
        If ``datum`` is a dict lacking one of the keys listed above.
    ValueError
        If the reputation or answer-count field cannot be converted by
        ``int()``. Date parsing errors from ``dateutil`` also propagate.
    """
    def norm_tag(string):
        # Keep only the word characters of a tag, lower-cased.
        return RE_NONALNUM.sub('', string).lower()

    def ratio(x, y):
        # Zero denominators map to 0.0 so that every ratio feature has the
        # same (float) type instead of mixing int 0 with floats.
        return x / float(y) if y != 0 else 0.0

    # feature container
    f_dict = defaultdict(dict)

    # raw fields
    body = datum['BodyMarkdown']
    lines = body.splitlines()
    title = datum['Title']
    # empty tag slots are skipped
    tags = [norm_tag(datum["Tag%d" % i])
            for i in range(1, 6) if datum["Tag%d" % i]]

    # Divide the post into code and text blocks: consecutive lines that all
    # start with four spaces form one code block, every other run of
    # consecutive lines (blank lines included) forms one text block.
    code = []
    text = []
    for is_code, group in groupby(lines, lambda l: l.startswith('    ')):
        (code if is_code else text).append('\n'.join(group))

    # count features, all starting at zero
    num = f_dict['num']
    for key in ('sent', 'question', 'exclam', 'period', 'initcap',
                'istart', 'url', 'digit', 'nonword'):
        num[key] = 0

    sents = []
    for t in text:
        for sent in nltk.sent_tokenize(t):
            num['sent'] += 1
            ss = sent.strip()
            if ss:
                if ss.endswith('?'):
                    num['question'] += 1
                if ss.endswith('!'):
                    num['exclam'] += 1
                if ss.endswith('.'):
                    num['period'] += 1
                if ss.startswith('I '):
                    num['istart'] += 1
                if ss[0].isupper():
                    num['initcap'] += 1
            # Only the stripped sentence is needed (for the mean sentence
            # length below); no word-level tokenisation is performed.
            sents.append(ss)

        # regex-based counts are taken over the whole text block
        num['digit'] += len(RE_DIGIT.findall(t))
        num['url'] += len(RE_URL.findall(t))
        num['nonword'] += len(RE_NONWORD.findall(t))

    # 1 when the last text block mentions "thank" in any letter case
    num['finalthanks'] = 1 if text and 'thank' in text[-1].lower() else 0
    num['codeblock'] = len(code)
    num['textblock'] = len(text)
    num['lines'] = len(lines)
    num['tags'] = len(tags)

    # len features (character counts)
    length = f_dict['len']
    length['title'] = len(title)
    length['text'] = sum(len(t) for t in text)
    length['code'] = sum(len(c) for c in code)
    length['firsttext'] = len(text[0]) if text else 0
    length['firstcode'] = len(code[0]) if code else 0
    length['lasttext'] = len(text[-1]) if text else 0
    length['lastcode'] = len(code[-1]) if code else 0

    # ratio features
    rat = f_dict['ratio']
    rat['tc'] = ratio(length['text'], length['code'])
    rat['ftc'] = ratio(length['firsttext'], length['firstcode'])
    rat['ftext'] = ratio(length['firsttext'], length['text'])
    rat['fcode'] = ratio(length['firstcode'], length['code'])
    rat['qsent'] = ratio(num['question'], num['sent'])
    rat['esent'] = ratio(num['exclam'], num['sent'])
    rat['psent'] = ratio(num['period'], num['sent'])

    # mean features
    mean = f_dict['mean']
    mean['code'] = np.mean([len(c) for c in code]) if code else 0
    mean['text'] = np.mean([len(t) for t in text]) if text else 0
    mean['sent'] = np.mean([len(s) for s in sents]) if sents else 0

    # user's post features
    user = f_dict['user']
    post_time = dateparser.parse(datum['PostCreationDate'])
    user_create_time = dateparser.parse(datum['OwnerCreationDate'])
    # account age at the moment of posting, in seconds
    user['age'] = (post_time - user_create_time).total_seconds()
    user['reputation'] = int(datum['ReputationAtPostCreation'])
    user['good_posts'] = int(datum['OwnerUndeletedAnswerCountAtPostTime'])

    return f_dict

def gen_datum_label(datum):
    """Return the integer class label of a post record.

    The record's ``OpenStatus`` value is looked up in ``status_map_label``.
    When the record has no ``OpenStatus`` key (e.g. test-set rows) or the
    status string is not in the map, the integer ``0`` is returned, so the
    result is always an ``int``.
    """
    try:
        return status_map_label[datum['OpenStatus']]
    except KeyError:
        # Missing column or unknown status: fall back to the "undefined"
        # label as an int, matching the type of the mapped labels.
        return 0

In [4]:
# Grab an example record: step through the training sample until the second
# row, collecting every OpenStatus value seen on the way. `datum` is left
# bound to the last row read.
reader = csv.DictReader(open(os.path.join(data_path, 'train-sample.csv')))
open_statuses = set()

count = 0
for count, datum in enumerate(reader, 1):
    open_statuses.add(datum['OpenStatus'])
    if count == 2:
        break

In [5]:
# Run the feature extractor and the label lookup on the sampled record.
feature_dict = gen_datum_feature_dict(datum)
label = gen_datum_label(datum)

In [6]:
# Show the label computed for the sampled record.
print(label)

4


In [7]:
# Display the nested feature dictionary computed for the sampled record.
feature_dict

defaultdict(dict,
            {'len': {'code': 739,
              'firstcode': 73,
              'firsttext': 120,
              'lastcode': 98,
              'lasttext': 32,
              'text': 175,
              'title': 54},
             'mean': {'code': 147.80000000000001,
              'sent': 33.399999999999999,
              'text': 29.166666666666668},
             'num': {'codeblock': 5,
              'digit': 0,
              'exclam': 0,
              'finalthanks': 1,
              'initcap': 4,
              'istart': 0,
              'lines': 34,
              'nonword': 6,
              'period': 1,
              'question': 0,
              'sent': 5,
              'tags': 3,
              'textblock': 6,
              'url': 0},
             'ratio': {'esent': 0.0,
              'fcode': 0.09878213802435724,
              'ftc': 1.643835616438356,
              'ftext': 0.6857142857142857,
              'psent': 0.2,
              'qsent': 0.0,
              'tc': 0.