In [1]:
import sqlite3
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML
import re
import pickle
import pandas as pd

In [2]:
# Ask pandas not to truncate long text cells when displaying frames.
# NOTE(review): -1 is the legacy spelling; newer pandas releases expect None
# for "no limit" -- confirm against the installed version.
pd.set_option('display.max_colwidth', -1)

In [3]:
conn = sqlite3.connect('../Data/crossvalidated.db')
# Select every row of the posts table whose PostTypeId is 2. The rest of this
# notebook treats these rows as answers (apost_df, AnsQuality), so the
# "ques_" prefix of the variable name is a misnomer.
ques_query = "SELECT * FROM [posts] WHERE PostTypeId==2"

In [4]:
# Run the query and load the selected posts into a DataFrame.
apost_df = pd.read_sql_query(ques_query, conn)

In [5]:
# Discard the edit/activity bookkeeping columns; they are not used below.
unused_columns = [
    'LastEditorDisplayName',
    'CommunityOwnedDate',
    'LastEditorUserId',
    'LastEditDate',
    'LastActivityDate',
]
apost_df = apost_df.drop(unused_columns, axis=1)

In [6]:
# Inspect the raw HTML body of the post with Id 133694 before any cleaning.
display(apost_df.Body[apost_df.Id==133694])

53189    <p>What a great question- it's a chance to show how one would inspect the drawbacks and assumptions of any statistical method.  Namely: make up some data and try the algorithm on it!</p>\n\n<p>We'll consider two of your assumptions, and we'll see what happens to the k-means algorithm when those assumptions are broken. We'll stick to 2-dimensional data since it's easy to visualize. (Thanks to the <a href="http://en.wikipedia.org/wiki/Curse_of_dimensionality">curse of dimensionality</a>, adding additional dimensions is likely to make these problems more severe, not less). We'll work with the statistical programming language R: you can find the full code <a href="https://github.com/dgrtwo/dgrtwo.github.com/blob/master/_R/2015-01-16-kmeans-free-lunch.Rmd">here</a> (and the post in blog form <a href="http://varianceexplained.org/r/kmeans-free-lunch/">here</a>).</p>\n\n<h3>Diversion: Anscombe's Quartet</h3>\n\n<p>First, an analogy. Imagine someone argued the following:</p>\n\n<block

#Helper function

In [7]:
def clean_html_text(row):
    """Turn one post's HTML body into lower-cased text with placeholder tokens.

    Processing order:
      1. <code> elements become ' refcode ', <a> elements keep their text
         followed by ' (reflink) ', <img> elements become ' refimage '.
      2. The remaining text is extracted and lower-cased.
      3. Tab, newline, carriage return, vertical tab, form feed and '/' are
         each replaced by a single space.
      4. E-mail addresses ending in .com, .org, .ch or .uk become ' refemail '.
      5. Remaining @handles (1-15 letters, digits or underscores) become
         '@mention'.
      6. $$...$$ spans and LaTeX begin{...}...end{...} environments become
         ' refformula '.
      7. $...$ spans and any letter immediately followed by a digit become
         ' refvariable '.
      8. Remaining integers/decimals (optionally signed) become ' refnumber '.

    Parameters
    ----------
    row : str
        HTML markup of a single post body.

    Returns
    -------
    str
        The normalised text.
    """
    soup = BeautifulSoup(row, 'html.parser')
    # denote code
    for tag in soup.find_all('code'):
        tag.replace_with(' refcode ')
    # denote link: keep the anchor text and flag that it was a hyperlink
    for tag in soup.find_all('a'):
        tag.replace_with(tag.text + ' (reflink) ')
    # denote image
    for tag in soup.find_all('img'):
        tag.replace_with(' refimage ')

    raw = soup.get_text().lower()
    # replace every whitespace control character and '/' with a space
    raw = re.sub(r'[\t\n\r\x0b\x0c/]', ' ', raw)

    # denote email. This runs before the mention rule so that the domain part
    # of an address is not rewritten to "@mention" first. The TLD list is an
    # alternation; the previous character class "[com|org|ch|uk]{2,3}" matched
    # any 2-3 characters drawn from those letters (".co", ".hu", ".ru", ...).
    raw = re.sub(r'[\w.-]+@[\w.-]+\.(?:com|org|ch|uk)\b', ' refemail ', raw)

    # denote mention (twitter-style @handles)
    raw = re.sub(r'@[a-z0-9_]{1,15}', '@mention', raw, flags=re.IGNORECASE)

    # denote formula
    formula_pattern = r'(\$\$.+?\$\$)|((\\begin\{.+?\})(.+?)(\\end\{(.+?)\}))'
    raw = re.sub(formula_pattern, ' refformula ', raw, flags=re.IGNORECASE)
    # denote variable
    raw = re.sub(r'(\$.+?\$)|([a-z]\d)', ' refvariable ', raw)
    # denote number
    raw = re.sub(r'[-+]?(\d*[.])?\d+', ' refnumber ', raw)

    return raw

In [8]:
# Clean every HTML body; the cleaner is passed directly as the mapper.
apost_df['Body_Text'] = apost_df['Body'].map(clean_html_text)

In [9]:
# Label each answer by score: 'bad' for Score <= 1, 'good' for Score > 1.
# The outer bin edges are open-ended so that no score can fall outside the
# bins. With the previous fixed edges (-36 and 260) a score of exactly -36
# (left edge is exclusive) or anything above 260 became NaN, which the later
# loading step silently counts as "not good".
bins = [float('-inf'), 1, float('inf')]
group_names = ['bad', 'good']
apost_df['AnsQuality'] = pd.cut(apost_df['Score'], bins, labels=group_names)

In [10]:
# Class balance of the quality labels.
apost_df.AnsQuality.value_counts()

good    40546
bad     33785
dtype: int64

In [12]:
import pickle
# Persist the id, cleaned text and quality label columns as a pickle.
with open('../Data/ans_clean_forDL.pickle', 'wb') as handle:
    pickle.dump(apost_df[['Id','Body_Text','AnsQuality']], handle)

In [13]:
# Spot-check the cleaner on the body of the post with Id 4632.
display(clean_html_text(apost_df.Body[apost_df.Id==4632].iloc[0]))

u"one at a somewhat lower level of mathematical sophistication than wooldridge (less dense, more pictures), but a bit more up to date on some of the fast-moving areas: murray, michael p. econometrics: a modern introduction. addison wesley,  refnumber .  refnumber  pp. isbn  refnumber  (reflink)  seems that it's not available for preview on the web and the publisher is out of stock, but you can view pdfs of  refnumber  web extensions (reflink)  to get an idea of its style. "

#Automatic Summarization

In [14]:
from gensim.summarization import summarize
from gensim.summarization import keywords

In [15]:
# Cleaned text of the post with Id 1632, used as input for the summarizer below.
text = clean_html_text(apost_df.Body[apost_df.Id==1632].iloc[0])

In [16]:
# Show the text that is about to be summarized.
text

u"it's hard to ignore the wealth of statistical packages available in r cran.  that said, i spend a lot of time in python land and would never dissuade anyone from having as much fun as i do.  :)  here are some libraries links you might find useful for statistical work.    numpy scipy (reflink)  you probably know about these already.  but let me point out the cookbook (reflink)  where you can read about many statistical facilities already available and the example list (reflink)  which is a great reference for functions (including data manipulation and other operations).  another handy reference is john cook's distributions in scipy (reflink) . pandas (reflink)  this is a really nice library for working with statistical data -- tabular data, time series, panel data.  includes many builtin functions for data summaries, grouping aggregation, pivoting.  also has a statistics econometrics library. larry (reflink)   labeled array that plays nice with numpy.  provides statistical functions n

In [17]:
# Print a summary of `text` limited to about 30 words.
# print() with a single argument behaves the same on Python 2 and Python 3 and
# matches the print(...) calls used in the later cells of this notebook.
print('Summary:')
print(summarize(text, word_count=30))

Summary:
but let me point out the cookbook (reflink)  where you can read about many statistical facilities already available and the example list (reflink)  which is a great reference for functions (including data manipulation and other operations).


In [18]:
# Print keywords extracted from `text` (pos_filter=['NN'], ratio=0.1, lemmatized).
# print() with a single argument behaves the same on Python 2 and Python 3 and
# matches the print(...) calls used in the later cells of this notebook.
print('Keywords:')
print(keywords(text, pos_filter=['NN'], ratio=0.1, lemmatize=True))

Keywords:
reflink
statistics
data
models
libraries
learning
packages


#Word2vec/Doc2vec prepare: Train and test set

In [19]:
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# the old sklearn.cross_validation module has been removed. Fall back to the
# old location only when running against a release that predates the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [20]:
# Hold out 40% of the rows as the test split (test_size=0.4); random_state=42
# is passed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(apost_df[['Id','Body_Text']], apost_df['AnsQuality'], test_size=0.4, random_state=42)

In [21]:
# Record which split each row belongs to. The frames returned by the split are
# copied first so the new column is written to an independent frame rather
# than to a slice of apost_df (avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_train['source'] = 'train'
X_test = X_test.copy()
X_test['source'] = 'test'

In [22]:
# Stack the labelled train and test rows into one frame with a fresh index.
tmp_df = pd.concat([X_train, X_test],ignore_index=True)

In [23]:
# Join the train/test label back onto the posts by Id. Both frames carry 'Id'
# and 'Body_Text', so those columns come out suffixed (_post / _tmp), as the
# column listing in the next cell shows.
# NOTE: this rebinds apost_df; re-running the cell on the merged frame fails
# because the merged frame no longer has a plain 'Id' column.
apost_df = apost_df.merge(tmp_df, left_on=apost_df.Id,right_on=tmp_df.Id,suffixes=['_post', '_tmp'])

In [24]:
# Check the column names produced by the merge.
apost_df.columns

Index([u'Body', u'ViewCount', u'ClosedDate', u'ParentID', u'CommentCount',
       u'AnswerCount', u'AcceptedAnswerId', u'Score', u'OwnerDisplayName',
       u'Title', u'PostTypeId', u'OwnerUserId', u'Tags', u'CreationDate',
       u'FavoriteCount', u'Id_post', u'Body_Text_post', u'AnsQuality',
       u'Id_tmp', u'Body_Text_tmp', u'source'],
      dtype='object')

In [25]:
import io

# Write one tab-separated line per answer:
#   "AnsID:<id>" <TAB> cleaned text <TAB> train/test <TAB> quality label
# - The path uses '../Data/', the same spelling the file is read back from,
#   so the round trip also works on case-sensitive file systems.
# - The with-block closes the file even if a row fails to format.
# - Text mode with encoding='ascii', errors='ignore' drops non-ASCII
#   characters, as the previous per-row .encode("ascii", "ignore") did, and
#   works on both Python 2 and Python 3; newline='\n' keeps line endings
#   untranslated.
with io.open('../Data/allans_forDL.csv', 'w', encoding='ascii',
             errors='ignore', newline='\n') as fdata:
    for ID, ans, split, sentiment in zip(apost_df['Id_tmp'],
                                         apost_df['Body_Text_post'],
                                         apost_df['source'],
                                         apost_df['AnsQuality']):
        fdata.write(u"AnsID:%s\t%s\t%s\t%s\n" % (ID, ans, split, sentiment))

In [26]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple
import nltk

In [27]:
# Record type for one answer: tokenised words, document tags, the split it
# belongs to ('train'/'test') and the 0/1 quality target.
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

In [28]:
alldocs = []  # will hold all docs in original order
with open('../Data/allans_forDL.csv') as alldata:
    for line_no, line in enumerate(alldata):
        # Columns: "AnsID:<id>", cleaned text, train/test, quality label.
        ans_id, words_texts, split, sentiment = line.strip().split('\t')
        words = nltk.word_tokenize(words_texts)
        # Tags must be a list. The evaluation code reads doc.tags[0] and the
        # similarity cells index alldocs by tag, so the tag is the document's
        # position in alldocs. Passing the raw "AnsID:<id>" string here made
        # tags[0] the single character 'A' for every document.
        tags = [line_no]
        # Binary target: 1 for "good", 0 for anything else.
        sentiment = 1 if sentiment == "good" else 0

        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # for reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

74331 docs: 44598 train-sentiment, 29733 test-sentiment


In [36]:
# Spot-check the tokenised document at position 5000.
doc_list[5000]

SentimentDocument(words=['this', 'paper', 'by', 'massey', 'and', 'denton', 'refnumber', '(', 'reflink', ')', 'is', 'a', 'fairly', 'prolific', 'overview', 'of', 'commonly', 'used', 'indices', 'in', 'sociology', 'demography', '.', 'it', 'would', 'also', 'be', 'useful', 'for', 'some', 'other', 'key', 'terms', 'used', 'for', 'searching', 'articles', '.', 'frequently', 'in', 'sociology', 'the', 'indices', 'are', 'labelled', 'with', 'names', 'such', 'as', '``', 'heterogeneity', "''", 'and', '``', 'segregation', "''", 'as', 'well', 'as', '``', 'diversity', "''", '.', 'part', 'of', 'the', 'reason', 'no', 'absolute', 'right', 'answer', 'exists', 'to', 'your', 'question', 'is', 'that', 'people', 'frequently', 'only', 'use', 'epistemic', 'logic', 'to', 'reason', 'why', 'one', 'index', 'is', 'a', 'preferred', 'measurement', '.', 'infrequently', 'are', 'those', 'arguments', 'so', 'strong', 'that', 'one', 'should', 'entirely', 'discount', 'other', 'suggested', 'measures', '.', 'the', 'work', 'of', '

In [37]:
# The cleaned text at index label 5000, for comparison with the tokens above.
apost_df['Body_Text_post'][5000]

u'this paper by massey and denton  refnumber  (reflink)  is a fairly prolific overview of commonly used indices in sociology demography. it would also be useful for some other key terms used for searching articles. frequently in sociology the indices are labelled with names such as "heterogeneity" and "segregation" as well as "diversity". part of the reason no absolute right answer exists to your question is that people frequently only use epistemic logic to reason why one index is a preferred measurement. infrequently are those arguments so strong that one should entirely discount other suggested measures. the work of massey and denton is useful to highlight what many of these indices theoretically measure and when they differ to a substantively noticeable extent (in large cities in the us). '

#Set-up Doc2Vec Training & Evaluation Models

In [111]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

# Number of CPUs reported by multiprocessing; passed as workers= to the models.
cores = multiprocessing.cpu_count()
# Stop early unless gensim reports FAST_VERSION > -1; per the assertion
# message, training would otherwise be painfully slow.
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

In [None]:
# Models to compare. The PV-DM/concatenation variant is commented out, leaving
# PV-DBOW (dm=0) and PV-DM with averaging (dm=1, dm_mean=1).
simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    #Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW 
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

In [None]:
# Speed up setup by scanning the vocabulary once with the first model and
# letting the remaining models copy it via reset_from().
# The PV-DM/concat model is commented out in the model list above, so the
# template is simply whichever model comes first in simple_models.
simple_models[0].build_vocab(alldocs)
print(simple_models[0])
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

# Index the models by their string representation; the training loop iterates
# over this mapping.
models_by_name = OrderedDict((str(model), model) for model in simple_models)

#Predictive Evaluation Methods

In [None]:
import numpy as np
import statsmodels.api as sm
from random import sample

# for timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    """Context manager yielding a zero-argument callable that reports seconds elapsed.

    While the ``with`` body is running the callable returns the time since
    entry. Once the body has finished normally, the callable keeps returning
    the total duration of the body.
    """
    began = default_timer()
    finished = []  # receives the exit timestamp once the body completes

    def read_elapsed():
        if finished:
            return finished[0] - began
        return default_timer() - began

    yield read_elapsed
    finished.append(default_timer())
    
def logistic_predictor_from_data(train_targets, train_regressors):
    """Fit a statsmodels Logit on the given targets/regressors and return the fit result.

    The fit is run with disp=0.
    """
    return sm.Logit(train_targets, train_regressors).fit(disp=0)

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_set sentiments, using supplied model and train_set.

    A logistic regression is fitted on the stored doc vectors of train_set
    (looked up by each document's first tag) and evaluated on test_set.

    Parameters
    ----------
    test_model : Doc2Vec model providing ``docvecs`` and ``infer_vector``.
    train_set, test_set : sequences of documents exposing ``tags``, ``words``
        and ``sentiment``.
    infer : when True, test vectors are re-inferred from the words instead of
        being read from ``test_model.docvecs``.
    infer_steps, infer_alpha : forwarded to ``infer_vector`` as steps/alpha.
    infer_subsample : when inferring and this is below 1.0, only a random
        sample of that fraction of test_set is evaluated.

    Returns
    -------
    tuple
        (error_rate, errors, number_of_test_predictions, fitted_predictor)
    """

    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        # Use the documents that were passed in; this branch previously read
        # the notebook-level test_docs list and ignored the test_set argument.
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_data]
    test_regressors = sm.add_constant(test_regressors)

    # predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

#Bulk Training

Using explicit multiple-pass, alpha-reduction approach as sketched in gensim doc2vec blog post – with added shuffling of corpus on each pass.

Evaluation of each model's sentiment-predictive power is repeated after each pass, as an error rate (lower is better), to see the rates-of-relative-improvement. The base numbers reuse the TRAIN and TEST vectors stored in the models for the logistic regression, while the inferred results use newly-inferred TEST vectors.

In [None]:
from collections import defaultdict
from random import shuffle
import datetime

# Lowest error rate seen so far per model name (and per "<name>_inferred").
# Starts at 1.0 so the first evaluation always registers as the best.
# This mapping was never defined, so the loop raised NameError on a fresh kernel.
best_error = defaultdict(lambda: 1.0)

alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # shuffling gets best results

    for name, train_model in models_by_name.items():
        # train: alpha and min_alpha are pinned to the same value so the
        # learning rate is constant within a pass and lowered between passes
        # NOTE(review): newer gensim releases require total_examples/epochs
        # arguments to train() -- confirm against the installed version.
        duration = 'na'
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list)
            duration = '%.1f' % elapsed()

        # evaluate using the doc vectors stored in the model
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*'
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

        # on the first pass and every fifth pass, also evaluate with
        # freshly inferred test vectors
        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            if infer_err < best_error[name + '_inferred']:
                best_error[name + '_inferred'] = infer_err
                best_indicator = '*'
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta

print("END %s" % str(datetime.datetime.now()))

#Achieved Sentiment-Prediction Accuracy

In [None]:
# print best error rates achieved
# best_error maps a model name to the lowest error rate recorded for it;
# list the entries sorted by error rate, lowest first.
for rate, name in sorted((rate, name) for name, rate in best_error.items()):
    print("%f %s" % (rate, name))

##Are inferred vectors close to the precalculated ones?

In [None]:
# Pick a random document, infer a fresh vector from its words with each model,
# and print the 3 stored doc vectors most similar to the inferred one.
# NOTE(review): doc_id is used to index alldocs, which assumes the doc tags
# are the positions of the documents in alldocs -- confirm.
doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
    print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))

##Do close documents seem more related than distant ones?

In [None]:
import random

# For a random document and a random model, rank every document by similarity
# and print the most similar, the median and the least similar one.
# NOTE(review): sims[index][0] is used to index alldocs, which assumes the doc
# tags are the positions of the documents in alldocs -- confirm.
doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc, re-run cell for more examples
model = random.choice(simple_models)  # and a random model
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(alldocs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))