In [2]:
from __future__ import print_function
from collections import defaultdict
import numpy as np
import cPickle as pickle
import time
import os
import csv
import re
from itertools import groupby
import nltk
from dateutil import parser as dateparser
from pprint import pprint

# Root folder of the Stack Overflow data set, located under the current user's home directory.
_home_dir = os.path.expanduser("~")
data_path = os.path.join(_home_dir, 'data/stack_overflow')

In [13]:
# TODO: package into class
def norm(string):
    """Return ``string`` lower-cased, with every match of ``RE_NONANS`` removed."""
    stripped = RE_NONANS.sub('', string)
    return stripped.lower()

def norm_tag(string):
    """Return ``string`` lower-cased, with every match of ``RE_NONALNUM`` removed."""
    stripped = RE_NONALNUM.sub('', string)
    return stripped.lower()

def ratio(x, y):
    """Return ``x / y`` using true division, or ``0`` when ``y`` equals zero."""
    if y == 0:
        # Guard against division by zero: a zero denominator yields a zero ratio.
        return 0
    # float() forces true division even where ``/`` on two ints would truncate.
    return x / float(y)
    
# Pre-compiled regular expressions used by the text helpers and the feature builder.
RE_URL = re.compile(r'https?://')    # "http://" or "https://" scheme prefix
RE_DIGIT = re.compile(r'\d+')        # run of one or more digits
RE_NONANS = re.compile(r'[^\w\s]+')  # run of characters that are neither word characters nor whitespace
RE_NONALNUM = re.compile(r'\W+')     # run of non-word characters
# NOTE(review): despite the name, this matches runs of uppercase A-Z letters and/or digits -- confirm intent.
RE_NONWORD = re.compile(r'[A-Z\d]+')

# Known post statuses. Each status maps to its 1-based position in this list (1..5);
# 0 is not part of the map and is left for posts whose status is missing or unknown.
all_status = [
    'not a real question',  # 1
    'not constructive',     # 2
    'off topic',            # 3
    'open',                 # 4
    'too localized',        # 5
]
status_map_label = dict(zip(all_status, range(1, len(all_status) + 1)))


class FeatureLabelBuilder(object):
    """Turn one post record into a flat feature vector and an integer label.

    ``datum`` is a mapping for a single post. The keys read from it are
    ``BodyMarkdown``, ``Title``, ``Tag1`` .. ``Tag5``, ``PostCreationDate``,
    ``OwnerCreationDate``, ``ReputationAtPostCreation``,
    ``OwnerUndeletedAnswerCountAtPostTime`` and, for the label, ``OpenStatus``.

    ``feature()`` and ``feature_keys()`` each rebuild the feature dict from
    ``self.datum``; their results are aligned position by position because both
    walk the nested dict in sorted-key order.
    """

    def __init__(self, datum):
        self.datum = datum

    @staticmethod
    def __dict_to_list(d):
        """Flatten a (possibly nested) dict into a list of its leaf values.

        Keys are visited in sorted order at every level. Any value that is a
        ``dict`` instance (including subclasses such as ``defaultdict`` or
        ``OrderedDict``) is flattened recursively; everything else is a leaf.
        """
        stack = []
        for k in sorted(d.keys()):
            value = d[k]
            if isinstance(value, dict):
                stack += FeatureLabelBuilder.__dict_to_list(value)
            else:
                stack.append(value)
        return stack

    @staticmethod
    def __dict_to_keys(d, prefix=''):
        """Return the leaf names of a nested dict, in ``__dict_to_list`` order.

        Nested keys are joined with ``'-'`` (e.g. ``'len-title'``). ``prefix``
        is the already-joined path of the enclosing levels; it is empty at the
        top level.
        """
        stack = []
        for k in sorted(d.keys()):
            prefix_k = k if len(prefix) == 0 else prefix + '-' + k
            value = d[k]
            if isinstance(value, dict):
                stack += FeatureLabelBuilder.__dict_to_keys(value, prefix_k)
            else:
                stack.append(prefix_k)
        return stack

    @staticmethod
    def __norm_tag(string):
        """Lower-case ``string`` after removing every ``RE_NONALNUM`` match."""
        return RE_NONALNUM.sub('', string).lower()

    @staticmethod
    def __ratio(x, y):
        """Return ``x / y`` with true division, or ``0`` when ``y`` is zero."""
        if y != 0:
            return x / float(y)
        return 0

    @staticmethod
    def __get_feature_dict(datum):
        """Build the two-level feature dict for ``datum``.

        The top-level groups are ``num`` (counts), ``len`` (character lengths),
        ``ratio``, ``mean``, ``user`` and ``time``. The result is returned as a
        plain ``dict`` of plain dicts.
        """
        ratio = FeatureLabelBuilder.__ratio

        # feature container: groups are created on first access
        f_dict = defaultdict(dict)

        body = datum['BodyMarkdown']
        lines = body.splitlines()
        title = datum['Title']
        # only non-empty tag columns are kept
        tags = [FeatureLabelBuilder.__norm_tag(datum["Tag%d" % i])
                for i in range(1, 6) if datum["Tag%d" % i]]

        # Split the post into alternating code/text blocks: consecutive lines
        # sharing the same "starts with four spaces" flag form one block.
        code = []
        text = []
        for is_code, group in groupby(lines, lambda l: l.startswith('    ')):
            (code if is_code else text).append('\n'.join(group))

        # --- count features -------------------------------------------------
        num = f_dict['num']
        for counter in ('sentence', 'question', 'exclam', 'period',
                        'init_cap', 'i_start', 'url', 'digit', 'non_word'):
            num[counter] = 0

        sentences = []
        for t in text:
            for sent in nltk.sent_tokenize(t):
                num['sentence'] += 1
                ss = sent.strip()
                if ss:
                    if ss.endswith('?'):
                        num['question'] += 1
                    if ss.endswith('!'):
                        num['exclam'] += 1
                    if ss.endswith('.'):
                        num['period'] += 1
                    if ss.startswith('I '):
                        num['i_start'] += 1
                    if ss[0].isupper():
                        num['init_cap'] += 1
                sentences.append(ss)

            # regex match counts are taken per text block, not per sentence
            num['digit'] += len(RE_DIGIT.findall(t))
            num['url'] += len(RE_URL.findall(t))
            num['non_word'] += len(RE_NONWORD.findall(t))

        # 1 when the last text block contains "thank" (case-insensitive)
        num['final_thanks'] = 1 if text and 'thank' in text[-1].lower() else 0
        num['code_block'] = len(code)
        num['text_block'] = len(text)
        num['lines'] = len(lines)
        num['tags'] = len(tags)

        # --- length features (in characters) --------------------------------
        length = f_dict['len']
        length['title'] = len(title)
        length['text'] = sum(len(t) for t in text)
        length['code'] = sum(len(c) for c in code)
        length['first_text'] = len(text[0]) if text else 0
        length['first_code'] = len(code[0]) if code else 0
        length['last_text'] = len(text[-1]) if text else 0
        length['last_code'] = len(code[-1]) if code else 0

        # --- ratio features (0 whenever the denominator is 0) ----------------
        ratios = f_dict['ratio']
        ratios['text_code'] = ratio(length['text'], length['code'])
        ratios['first_text_first_code'] = ratio(length['first_text'],
                                                length['first_code'])
        ratios['first_text_text'] = ratio(length['first_text'], length['text'])
        ratios['first_code_code'] = ratio(length['first_code'], length['code'])
        ratios['question_sentence'] = ratio(num['question'], num['sentence'])
        ratios['exclam_sentence'] = ratio(num['exclam'], num['sentence'])
        ratios['period_sentence'] = ratio(num['period'], num['sentence'])

        # --- mean block / sentence lengths ----------------------------------
        means = f_dict['mean']
        means['code'] = np.mean([len(c) for c in code]) if code else 0
        means['text'] = np.mean([len(t) for t in text]) if text else 0
        means['sentence'] = np.mean(
            [len(s) for s in sentences]) if sentences else 0

        # --- features about the posting user --------------------------------
        post_time = dateparser.parse(datum['PostCreationDate'])
        user_create_time = dateparser.parse(datum['OwnerCreationDate'])
        user = f_dict['user']
        # seconds between the owner's creation date and the post's creation date
        user['age'] = (post_time - user_create_time).total_seconds()
        user['reputation'] = int(datum['ReputationAtPostCreation'])
        user['good_posts'] = int(
            datum['OwnerUndeletedAnswerCountAtPostTime'])

        # --- post creation time ---------------------------------------------
        when = f_dict['time']
        when['year'] = post_time.year
        when['month'] = post_time.month
        when['day'] = post_time.day
        when['weekday'] = post_time.weekday()

        return dict(f_dict)

    @staticmethod
    def __get_label(datum):
        """Return the integer label for ``datum``.

        The label is looked up in ``status_map_label`` by ``OpenStatus``; ``0``
        is returned when the key is missing from ``datum`` (e.g. test set) or
        the status is not in the map.
        """
        try:
            label = status_map_label[datum['OpenStatus']]
        except KeyError:
            label = 0  # test set
        return label

    def label(self):
        """Return the integer label of this post (``0`` when unknown)."""
        return FeatureLabelBuilder.__get_label(self.datum)

    def feature(self):
        """Return the feature values as a flat list, ordered like ``feature_keys()``."""
        feature_dict = FeatureLabelBuilder.__get_feature_dict(self.datum)
        return FeatureLabelBuilder.__dict_to_list(feature_dict)

    def feature_keys(self):
        """Return the ``'group-name'`` feature names, ordered like ``feature()``."""
        feature_dict = FeatureLabelBuilder.__get_feature_dict(self.datum)
        return FeatureLabelBuilder.__dict_to_keys(feature_dict)

In [14]:
# get a sample datum
# The CSV is opened in a ``with`` block so the file handle is closed as soon as
# the wanted row has been read (the reader is not usable after the block).
sample_row = 1123  # 1-based index of the data row to inspect
with open(os.path.join(data_path, 'train-sample.csv')) as sample_file:
    reader = csv.DictReader(sample_file)
    # ``datum`` keeps the last row read if the file has fewer rows than ``sample_row``.
    for count, datum in enumerate(reader, 1):
        if count == sample_row:
            break

# print datum
pprint(datum)

{'BodyMarkdown': 'I want to write a client for a site on windows phone 7 but the site has no API. The client should simply take data from a website and display of a suitable form. What can I use to write such a client',
 'OpenStatus': 'not a real question',
 'OwnerCreationDate': '12/22/2011 12:58:55',
 'OwnerUndeletedAnswerCountAtPostTime': '0',
 'OwnerUserId': '1111755',
 'PostClosedDate': '12/22/2011 14:22:55',
 'PostCreationDate': '12/22/2011 13:43:33',
 'PostId': '8604738',
 'ReputationAtPostCreation': '1',
 'Tag1': 'c#',
 'Tag2': 'windows-phone-7',
 'Tag3': '',
 'Tag4': '',
 'Tag5': '',
 'Title': 'Windows Phone client for site'}


In [17]:
# Build the feature extractor for the sample datum held in ``datum``.
flb = FeatureLabelBuilder(datum)
# NOTE: this is not the last expression of the cell, so the feature values are
# computed here but not displayed.
flb.feature()
# Last expression of the cell: the feature names are what gets displayed.
flb.feature_keys()

['len-code',
 'len-first_code',
 'len-first_text',
 'len-last_code',
 'len-last_text',
 'len-text',
 'len-title',
 'mean-code',
 'mean-sentence',
 'mean-text',
 'num-code_block',
 'num-digit',
 'num-exclam',
 'num-final_thanks',
 'num-i_start',
 'num-init_cap',
 'num-lines',
 'num-non_word',
 'num-period',
 'num-question',
 'num-sentence',
 'num-tags',
 'num-text_block',
 'num-url',
 'ratio-exclam_sentence',
 'ratio-first_code_code',
 'ratio-first_text_first_code',
 'ratio-first_text_text',
 'ratio-period_sentence',
 'ratio-question_sentence',
 'ratio-text_code',
 'time-day',
 'time-month',
 'time-weekday',
 'time-year',
 'user-age',
 'user-good_posts',
 'user-reputation']

In [None]:
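# """ convert dictionary to list """
# NOTE: scratch prototype, superseded by the FeatureLabelBuilder.__dict_to_list / __dict_to_keys static methods.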
# d = feature_dict
# d['first'] = dict()
# d['first']['second'] = dict()
# d['first']['second']['third'] = 123456.

# def dict_to_list(d):
#     stack = []
#     keys = sorted(d.keys())
#     for k in keys:
#         if type(d[k]) in (dict, defaultdict):
#             stack += (dict_to_list(d[k]))
#         else:
#             stack.append(d[k])
#     return stack
    
# def dict_to_keys(d, prefix=''):
#     stack = []
#     keys = sorted(d.keys())
#     for k in keys:
#         prefix_k = k if len(prefix) == 0 else prefix + '-' + k
#         if type(d[k]) in (dict, defaultdict):
#             stack += (dict_to_keys(d[k], prefix_k))
#         else:
#             stack.append(prefix_k)
#     return stack