In [14]:
import pandas as pd
import numpy as np

import nltk

from collections import Counter
from functools import reduce

import string

### Wrapper class for each request

In [46]:
class Request:
    """One request text together with its id, score and n-gram counts.

    The text is lower-cased, tokenized with NLTK, stripped of English stop
    words and ASCII punctuation, and then counted either as unigrams or as
    bigrams. Two requests can be compared with :meth:`distance`.
    """

    # Tokens dropped by tokenize(): NLTK's English stop words plus every
    # character in string.punctuation.
    stops = nltk.corpus.stopwords.words("english") + list(string.punctuation)

    @staticmethod
    def tokenize(text):
        """Return the lower-cased NLTK tokens of ``text`` that are not in ``Request.stops``."""
        return [tok for tok in nltk.word_tokenize(text.lower()) if tok not in Request.stops]

    @staticmethod
    def unigram(tokens):
        """Return a ``Counter`` mapping each token to its number of occurrences."""
        return Counter(tokens)

    @staticmethod
    def bigram(tokens):
        """Return a ``Counter`` of adjacent token pairs.

        Sequences shorter than two tokens are padded with empty strings so
        that at least one pair is always produced (``("", "")`` for no
        tokens, ``(tok, "")`` for a single token). The caller's list is not
        modified.
        """
        # Work on a copy so padding never leaks back into the caller's list.
        padded = list(tokens)
        # Multiplying a list by a non-positive number yields [], so this is a
        # no-op once there are already two or more tokens.
        padded.extend([""] * (2 - len(padded)))
        # Pair every token with its successor: len(padded) - 1 pairs in total.
        return Counter(zip(padded, padded[1:]))

    def __init__(self, rid, text, score, ngram_mode):
        """Store the request and pre-compute its n-gram counts.

        Args:
            rid: identifier of the request.
            text: raw request text.
            score: score attached to the request.
            ngram_mode: ``"unigram"`` or ``"bigram"``.

        Raises:
            AssertionError: if ``ngram_mode`` is not one of the two modes.
        """
        assert ngram_mode in {"unigram", "bigram"}, "illegal ngram mode"
        self._rid = rid
        self._text = text
        self._score = score
        tokens = Request.tokenize(text)
        if ngram_mode == "unigram":
            self._ngram = Request.unigram(tokens)
        else:
            self._ngram = Request.bigram(tokens)

    def distance(self, right):
        """Return the sum of squared count differences between two requests.

        Every n-gram that occurs in either request contributes
        ``(own_count - other_count) ** 2``; an n-gram missing from one side
        counts as 0 there. Two requests without any n-grams are at distance 0.

        Raises:
            AssertionError: if ``right`` is not a ``Request``.
        """
        assert isinstance(right, Request), f"wrong operand for Request.distance({type(right)})"
        keys = set(self._ngram) | set(right._ngram)
        # sum() starts from 0, so an empty key set yields 0 instead of failing
        # the way reduce() without an initial value does.
        return sum((self._ngram.get(k, 0) - right._ngram.get(k, 0)) ** 2 for k in keys)

### read dataset

In [47]:
# Load both annotated corpus files; column 1 of each CSV becomes the index.
df_stack, df_wiki = [
    pd.read_csv(csv_path, index_col=1)
    for csv_path in (
        "Stanford_politeness_corpus/stack-exchange.annotated.csv",
        "Stanford_politeness_corpus/wikipedia.annotated.csv",
    )
]

In [48]:
# Stack the two corpora row-wise (stack-exchange rows first, then wikipedia);
# each frame keeps its own index labels.
df = pd.concat((df_stack, df_wiki), axis=0)

In [49]:
# Row count of the combined frame, followed by its column index.
print(len(df), df.columns, sep="\n")

10956
Index(['Community', 'Request', 'Score1', 'Score2', 'Score3', 'Score4',
       'Score5', 'TurkId1', 'TurkId2', 'TurkId3', 'TurkId4', 'TurkId5',
       'Normalized Score'],
      dtype='object')


### build requests and find closest

In [50]:
# Build one unigram Request per corpus row, using the row's index label as rid.
# The index and the two columns are iterated directly instead of going through
# to_dict("index"): that orientation needs unique index labels, which is not
# guaranteed after concatenating two separately indexed corpora.
unigram_requests = [
    Request(rid, text, score, "unigram")
    for rid, text, score in zip(df.index, df["Request"], df["Normalized Score"])
]

In [62]:
# Query request to look up in the corpus.
# NOTE(review): rid=-1 and score=0 look like placeholder values — confirm.
new_req = Request(
    rid=-1,
    text="Please tackle the <url> of <url>.",
    score=0,
    ngram_mode="unigram",
)

In [63]:
# Corpus request with the smallest Request.distance to the query
# (the first one wins on ties, as min() keeps the earliest minimum).
ret = min(unigram_requests, key=lambda candidate: new_req.distance(candidate))

In [64]:
# Display the stored text of the closest corpus request.
ret._text

'<url>???'