In [1]:
import pandas as pd
import numpy as np

import nltk

from collections import Counter
from functools import reduce

import string

In [2]:
# Inspect the punctuation characters that Request.stops adds to its stop set.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### wrapper class for each request

In [3]:
class Request:
    """One request text together with its n-gram count representation.

    Each instance stores the request id, the raw text, the filtered tokens,
    the score, and a ``Counter`` of unigrams or bigrams built from the tokens.
    Two requests are compared with :meth:`distance`.
    """

    # Tokens removed by `tokenize`: NLTK's English stop words, every character
    # of `string.punctuation`, and the two quote tokens "``" and "''"
    # (presumably the quote tokens emitted by the tokenizer).
    stops = set(nltk.corpus.stopwords.words("english") + list(string.punctuation) + ["``", "''"])

    @staticmethod
    def tokenize(text, debug=False):
        """Lower-case ``text``, tokenize it with NLTK and drop stop tokens.

        Args:
            text: The request text.
            debug: Unused; kept so existing callers that pass it keep working.

        Returns:
            The list of tokens that are not in ``Request.stops``.
        """
        return [tok for tok in nltk.word_tokenize(text.lower()) if tok not in Request.stops]

    @staticmethod
    def unigram(tokens):
        """Return a ``Counter`` mapping each token to its number of occurrences."""
        return Counter(tokens)

    @staticmethod
    def bigram(tokens):
        """Return a ``Counter`` of adjacent token pairs.

        Inputs with fewer than two tokens are padded with empty strings so that
        they still produce exactly one bigram (``("", "")`` or ``(tok, "")``).
        The padding is applied to a copy; ``tokens`` itself is never modified.
        """
        padded = list(tokens)
        # A negative repeat count yields an empty list, so longer inputs are untouched.
        padded.extend([""] * (2 - len(padded)))
        return Counter(zip(padded, padded[1:]))

    def __init__(self, rid, text, score, ngram_mode):
        """Tokenize ``text`` and build its n-gram counts.

        Args:
            rid: Identifier of the request.
            text: Raw request text.
            score: Score associated with the request.
            ngram_mode: Either ``"unigram"`` or ``"bigram"``.

        Raises:
            AssertionError: If ``ngram_mode`` is not one of the two legal modes.
        """
        assert ngram_mode in {"unigram", "bigram"}, "illegal ngram mode"
        self._rid = rid
        self._text = text
        self._toks = Request.tokenize(text)
        self._score = score
        if ngram_mode == "unigram":
            self._ngram = Request.unigram(self._toks)
        else:
            self._ngram = Request.bigram(self._toks)

    def distance(self, right):
        """Return the squared Euclidean distance between two n-gram count vectors.

        Args:
            right: Another object exposing an ``_ngram`` counter.

        Returns:
            The sum over all n-grams seen in either request of the squared
            difference of their counts; ``0`` when both counters are empty.
        """
        keys = self._ngram.keys() | right._ngram.keys()
        # sum() starts from 0, so two empty counters give 0 instead of raising.
        return sum((self._ngram.get(k, 0) - right._ngram.get(k, 0)) ** 2 for k in keys)

### read dataset

In [4]:
# Load the two annotated corpus files; index_col=1 makes the second CSV column the row index of each frame.
df_stack = pd.read_csv("Stanford_politeness_corpus/stack-exchange.annotated.csv", index_col=1)
df_wiki = pd.read_csv("Stanford_politeness_corpus/wikipedia.annotated.csv", index_col=1)

In [5]:
# Stack both corpora row-wise; each frame keeps its own index labels (no re-indexing is requested here).
df = pd.concat([df_stack, df_wiki])

In [6]:
# Quick sanity check of the combined corpus: row count on the first line, column labels after it.
print(len(df), df.columns, sep="\n")

10956
Index(['Community', 'Request', 'Score1', 'Score2', 'Score3', 'Score4',
       'Score5', 'TurkId1', 'TurkId2', 'TurkId3', 'TurkId4', 'TurkId5',
       'Normalized Score'],
      dtype='object')


### build requests and find the closest one

### unigram

In [7]:
# Build one unigram Request per corpus row. The index and the two columns are walked
# in lockstep so every row is kept: to_dict("index") requires unique index labels,
# which is not guaranteed after concatenating two corpora indexed by their own column.
unigram_requests = [
    Request(rid, text, score, "unigram")
    for rid, text, score in zip(df.index, df["Request"], df["Normalized Score"])
]

In [28]:
text = 'why not  return'
# Show the raw NLTK tokens, i.e. before Request.tokenize lower-cases and filters them.
print(nltk.word_tokenize(text))
# Query request for the unigram search; rid=-1 and score=0 are presumably placeholders (not a corpus row).
new_req = Request(rid=-1, text=text, score=0, ngram_mode="unigram")


['why', 'not', 'return']


In [29]:
# Tokens that survive the filter against Request.stops for the same text.
Request.tokenize(text)

['return']

In [30]:
# Corpus request with the smallest unigram distance to the query; min() returns the first one on ties.
ret = min(unigram_requests, key=new_req.distance)

In [31]:
# Raw text of the closest corpus request found by the unigram search.
ret._text

'what does allowed? do or return?'

### bigram

In [12]:
# Build one bigram Request per corpus row. The index and the two columns are walked
# in lockstep so every row is kept: to_dict("index") requires unique index labels,
# which is not guaranteed after concatenating two corpora indexed by their own column.
bigram_requests = [
    Request(rid, text, score, "bigram")
    for rid, text, score in zip(df.index, df["Request"], df["Normalized Score"])
]

In [13]:
# Rebind new_req to a bigram query; rid=-1 and score=0 are presumably placeholders (not a corpus row).
new_req = Request(rid=-1, text="Come on, I haven't unblocked many people before. Ok?", score=0, ngram_mode="bigram")

In [14]:
# Corpus request with the smallest bigram distance to the query; min() returns the first one on ties.
ret = min(bigram_requests, key=new_req.distance)

In [15]:
# Raw text of the closest corpus request found by the bigram search.
ret._text

"Sorry... haven't unblocked many people before. Is it ok now?"