In [5]:
import numpy as np
import pandas as pd
import os
import re

In [49]:
# Candidate submission files to blend. os.listdir returns every directory entry,
# including the 'ensembles' sub-directory that blend() writes its results into, so
# keep only CSV files; otherwise re-running the notebook would feed that directory
# name into blend() as if it were a model prediction file.
files = [name for name in os.listdir('data/answers') if name.endswith('.csv')]
# Label columns read from every submission and written to the blended output, in order.
output_names = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [125]:
# Inspect the names found in data/answers; blend() parses the digits before '.csv' as each file's score.
files

['linear_ensemble_0.75_09723.csv',
 'lstm_09832.csv',
 'linear_ensemble_0.9_09724.csv',
 'linear_baseline1_09717.csv',
 'linear_ensemble_0.8_09723.csv',
 'lstm2_09831.csv',
 'linear_ensemble_0.5_09713.csv',
 'linear_baseline2_09600.csv',
 'linear_ensemble_0.85_09724.csv',
 'lstm_09831.csv',
 'lstm_09843.csv',
 'lstm_09844.csv',
 'gru_09811.csv',
 'gru_09837.csv',
 'gru_09835.csv',
 'lstm_09838.csv',
 'lstm_09828.csv',
 'gru_09845.csv']

In [127]:
def blend(outName = '' , filtering = ['lstm', 'gru'], topn = None, weighted = False, weightedMin=0.95):
    '''
    Blend several submission files into one by averaging their predictions.

    Candidate names come from the module-level ``files`` list and are read from
    ``data/answers/``; the columns listed in the module-level ``output_names`` are
    averaged row by row (rows are matched by sorting every file on ``id``) and the
    result is written to ``data/answers/ensembles/<outName>``.

    A file's score is parsed from its name: the digits between the last ``_`` and
    ``.csv`` with the first digit dropped, read as the fractional part
    (``lstm_09832.csv`` -> 0.9832). Entries whose name does not follow that
    convention (for example the ``ensembles`` output directory) are skipped.

    Inputs:
        outName: output file name
        filtering: list of regular expressions; an entry is kept when any of them
            matches its name (re.search)
        topn: int or None. If set, only the n best-scoring files are kept after regex matching
        weighted: whether to use a simple average or a weighted average
        weightedMin: score subtracted from every file's score before the differences are
            scaled to sum to one and used as weights (only used when weighted is True)

    Raises:
        ValueError: if no file is selected or the selected files do not contain the same ids
        AssertionError: if weighted and a selected file scores below weightedMin
        ZeroDivisionError: if weighted and every selected score equals weightedMin
    '''
    score_pattern = re.compile(r'_([0-9]+)\.csv')

    # Score every entry that passes the name filter, preserving the order of `files`.
    scores = {}
    for name in files:
        if not any(re.search(pattern, name) for pattern in filtering):
            continue
        digits = score_pattern.findall(name)
        if not digits:
            # Not a scored submission (e.g. the 'ensembles' directory): nothing to blend.
            continue
        scores[name] = float('0.' + digits[0][1:])

    if not scores:
        raise ValueError('no scored submission file matches filtering=%r' % (filtering,))

    if topn:
        # Best score first; ties are broken by name so the selection does not depend
        # on the arbitrary order in which the directory was listed.
        toTake = sorted(scores, key=lambda name: (-scores[name], name))[:topn]
    else:
        toTake = list(scores)

    # Load every selected file with its rows in id order so they can be combined positionally.
    frames = []
    for name in toTake:
        frame = pd.read_csv('data/answers/' + name)
        frames.append(frame.sort_values(by = 'id').reset_index(drop = True))

    ids = frames[0]['id'].values
    for name, frame in zip(toTake, frames):
        if not np.array_equal(frame['id'].values, ids):
            raise ValueError('ids in %s do not match those in %s' % (name, toTake[0]))

    # results[row, label, model]
    results = np.zeros(shape = (len(ids), len(output_names), len(frames)))
    for c, frame in enumerate(frames):
        results[:,:,c] = frame[output_names].values

    if not weighted:
        blended = np.mean(results, axis = -1)
    else:
        margins = [scores[name] - weightedMin for name in toTake]
        assert all(margin >= 0 for margin in margins), \
            'every selected score must be >= weightedMin'
        total = sum(margins)
        # Plain Python division: raises ZeroDivisionError when all margins are zero.
        scalings = np.array([margin / total for margin in margins])
        blended = np.sum(results * scalings, axis = -1)

    answer = pd.DataFrame(blended, columns = output_names)
    answer.insert(0, 'id', ids)

    out_dir = 'data/answers/ensembles/'
    os.makedirs(out_dir, exist_ok = True)
    answer.to_csv(out_dir + outName, index = False)

In [128]:
# Blend every available submission: first as a plain mean, then score-weighted
# with two different baselines.
blend('allUnweighted.csv', filtering = ['.*'])
for out_file, baseline in (('allWeighted095.csv', 0.95), ('allWeighted096.csv', 0.96)):
    blend(out_file, filtering = ['.*'], weighted = True, weightedMin = baseline)

In [129]:
# Unweighted mean of the n best GRU/LSTM submissions, for n = 3, 5, 7, 9, 11.
for n_models in (3, 5, 7, 9, 11):
    out_file = 'unweightedRNNTop' + str(n_models) + '.csv'
    blend(out_file, topn = n_models)

In [130]:
# Score-weighted blends of the n best GRU/LSTM submissions (n = 3, 5, 7, 9, 11),
# once with a 0.98 baseline and once with a 0.97 baseline.
for n_models in (3, 5, 7, 9, 11):
    n_label = str(n_models)
    blend('weightedRNN098Top' + n_label + '.csv', topn = n_models, weighted = True, weightedMin = 0.98)
    blend('weightedRNN097Top' + n_label + '.csv', topn = n_models, weighted = True, weightedMin = 0.97)