In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
from glob import glob
from sklearn.linear_model import LogisticRegression

In [15]:
bins = 5
n_per_bin = 200
context_chars = 500
seed = 416

In [17]:
data = []

for fn in glob('../propensity/scores/ps_*.csv'):
    df = pd.read_csv(fn)
    w1 = fn.split('_')[-2]
    w2 = fn[:-4].split('_')[-1]
    
    # calibrate the scores
    temp_model = LogisticRegression().fit(np.log(np.array(df["e(x)"])).reshape(-1, 1), df["label"])
    def func(row):
        x = np.array(row["e(x)"]).reshape(1, -1)
        prediction = temp_model.predict_proba(np.log(x))[0,1]
        return prediction
    df["calibrated"] = df.apply(func, axis=1)
    
    df['original'] = [w1] * len(df)
    df['synonym'] = [w2] * len(df)
    df['length'] = df['prefix'].apply(lambda x: len(x))
    df['bin'] = pd.cut(df['calibrated'], bins)
    
    # only append if the scores converged
    if df['e(x)'].max() - df['e(x)'].min() > 0.5:
        print(w1, w2)
        data.append(df)
    
df = pd.concat(data)
df.head(1)

nice good
next following
help assist
subject topic
file record
voice sound
size proportion
just quite
guy player
way direction
business operation
first initial
permit allow
monitor track
say state
grow increase
accept recognize
many multiple
team group
perhaps maybe
full whole
try attempt
return exchange
very really
area location
idea thought
device equipment
man person
own hold
choose pick
fell decrease
cut reduce


Unnamed: 0,prefix,idx,label,e(x),calibrated,original,synonym,length,bin
0,About Grand Slam Fishing Charters\n\nAs a fami...,4,0,0.611402,0.354269,nice,good,1395,"(0.314, 0.472]"


In [18]:
len(df)

640000

In [19]:
eligible = pd.DataFrame(df[(df['length'] > context_chars) & (df['label'] == 0)])
eligible['distance'] = np.abs(eligible['calibrated'] - 0.5)
eligible = eligible.sort_values('distance')
no_duplicates = eligible[~eligible.idx.duplicated(keep='first')]
len(df), len(no_duplicates)

(640000, 119683)

In [27]:
np.random.seed(seed)

In [34]:
data = []

for i, g in no_duplicates.groupby('original'):  
    try:
        sample = g.groupby('bin').sample(n_per_bin, replace=False, random_state=np.random.randint(0,1000))
        data.append(sample)
    except:
        continue
    print(i, g.synonym.unique()[0])

accept recognize
choose pick
cut reduce
grow increase
help assist
man person
monitor track
next following
nice good
perhaps maybe
subject topic
team group
voice sound


In [29]:
len(data)

13

In [30]:
prop_inputs = pd.concat(data)
len(prop_inputs)

13000

In [31]:
prop_inputs['label'] = np.random.randint(0, 2, size=len(prop_inputs))
prop_inputs['label'] = prop_inputs['label'].astype(bool)
prop_inputs['label'].mean()

0.5013846153846154

In [32]:
# prepare propagation inputs
prop_inputs = prop_inputs[['idx', 'prefix', 'length', 'original', 'synonym', 'label', 'calibrated']]
prop_inputs.columns = ['example_index', 'text', 'sub_index', 'original', 'synonym', 'substituted?', 'calibrated']
prop_inputs.to_csv('./propagation_inputs.csv', index=False, header=True)

In [33]:
prop_inputs.tail()

Unnamed: 0,example_index,text,sub_index,original,synonym,substituted?,calibrated
9707,570613,"Ruth Ledbetter, 78, of Bellmead went home to b...",800,voice,sound,True,0.749287
3251,189547,Jabez\n\nJabez or Jabes is a Biblical male gi...,1258,voice,sound,True,0.729611
6142,362558,"Another evening, another candlelit dinner. A t...",10834,voice,sound,False,0.732663
3494,204409,Budget Bluetooth: Six wireless headphones for ...,1629,voice,sound,False,0.8901
6810,402884,Young Death / Nightmarket\n\nYoung Death / Nig...,3583,voice,sound,True,0.804656
