Mögliche Labels:
- eindeutige Positionierung der Zielspalte
- mehrfache Positionierungen der Zielspalten, sortiert nach deren Kosinus-Ähnlichkeit

In [1]:
from utils.preprocessing import WordEmbedding, load_word_emb
import random
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import json
import numpy as np

In [2]:
# Locations of the word-embedding files (directory plus the two file names
# handed to load_word_emb below).
w2v_config = dict(
    data_dir='data/glove',
    word2idx_path='word2idx.json',
    usedwordemb_path='usedwordemb.npy',
)
# Wrap the loaded embeddings; later cells look words up by calling w2v(word).
w2v = WordEmbedding(
    load_word_emb(
        w2v_config['data_dir'],
        w2v_config['word2idx_path'],
        w2v_config['usedwordemb_path'],
    )
)

In [9]:
# load table schemata (one JSON object per line; blank lines are skipped)
with open('data/wikisql/tables.jsonl', encoding='utf-8') as file:
    table_schemata = pd.DataFrame([json.loads(line) for line in file if line.strip()])
table_schemata['fits'] = ''
# Lists are unhashable, so headers are compared as tuples while dropping
# duplicates. Unlike a join/split round trip through a delimiter string this
# keeps empty headers empty and cannot confuse a column name that happens to
# contain the delimiter.
table_schemata['header'] = table_schemata['header'].apply(tuple)
table_schemata = table_schemata[['header', 'fits']].drop_duplicates().reset_index(drop=True)
table_schemata['header'] = table_schemata['header'].apply(list) # rebuild original state
table_schemata.head()

Unnamed: 0,header,fits
0,"[Date, Time, ACC Team, Big Ten Team, Location,...",
1,"[Institution, Wins, Loss, Home Wins, Home Loss...",
2,"[Pick #, MLS Team, Player, Position, Affiliation]",
3,"[DVD title, Number of Episodes, Region 2, Regi...",
4,"[Year, Coach, Crew, Record, Win %]",


In [10]:
# Keep only the first 1000 schemata. The explicit copy detaches the subset
# from the full frame, so later in-place writes to it are unambiguous and do
# not trigger pandas' SettingWithCopyWarning.
table_schemata = table_schemata.head(1000).copy()

In [11]:
# Collect every distinct column name. Sorting makes the starting order
# independent of set iteration order, which varies between Python processes.
all_headers = sorted({column for header in table_schemata['header'].values for column in header})
# Shuffle with a fixed seed so the sampled candidates are the same on every run.
random.Random(42).shuffle(all_headers)
# Treat '/' and '_' as word separators in the 100 sampled column names.
candidates = [column.replace('/', ' ').replace('_', ' ') for column in all_headers[:100]]
# Candidate name -> mean of its word vectors. Names without any token are
# skipped, because the mean over an empty list would be NaN instead of a vector.
cache = {
    candidate: np.mean([w2v(word.lower()) for word in candidate.split()], axis=0)
    for candidate in candidates
    if candidate.split()
}

In [14]:
from collections import defaultdict

def calculate_fits(header, cache):
    """Assign every cached candidate to the header column it is most similar to.

    Each column name is embedded as the mean of its word vectors, looked up
    through the notebook-level ``w2v`` callable. Similarity is the cosine
    similarity between a candidate embedding and a column embedding.

    Args:
        header: list of column names of one table.
        cache: mapping of candidate name -> candidate embedding.

    Returns:
        defaultdict(list) mapping a column name to the list of
        ``(candidate, similarity)`` tuples for which that column was the most
        similar one. Empty if ``header`` has no embeddable column or ``cache``
        is empty.
    """
    fits = defaultdict(list)

    # Columns without any token cannot be embedded (the mean over an empty
    # list is NaN), so they are left out of the comparison.
    embedded = [
        (col, np.mean([w2v(word.lower()) for word in col.split()], axis=0))
        for col in header
        if col.split()
    ]
    if not embedded or not cache:
        return fits
    columns = [col for col, _ in embedded]
    embs = [emb for _, emb in embedded]

    # One candidates x columns similarity matrix, instead of a full pairwise
    # matrix over [candidate] + columns for every single candidate.
    similarities = cosine_similarity(list(cache.values()), embs)

    for candidate, similarity in zip(cache, similarities):
        max_pos = int(np.argmax(similarity))
        # append the best candidate with its similarity to the header
        # to the lists of candidates for the given header
        # i.e. fits['ACC Team'] -> [('Team', 0.75), ('Coach', 0.64), (<new candidate>, <similarity>)]
        fits[columns[max_pos]].append((candidate, similarity[max_pos]))
    return fits

def reduce_fits(fits, threshold=0.5):
    """Keep, per column, only the most similar candidate if it is similar enough.

    Args:
        fits: mapping of column name -> list of ``(candidate, similarity)``
            tuples.
        threshold: a column is kept only if its best similarity is strictly
            greater than this value.

    Returns:
        dict mapping column name -> name of its best candidate.
    """
    best_fits = {}
    for column in fits:
        # Stable descending sort: among equal similarities the candidate
        # listed first stays in front.
        ranked = list(fits[column])
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        top_candidate, top_similarity = ranked[0][:2]
        if top_similarity > threshold:
            best_fits[column] = top_candidate
    return best_fits

# Number of tables to label. Kept small for quick experimentation;
# set to None to process every table schema.
MAX_TABLES = 6

for idx, row in table_schemata.iloc[:MAX_TABLES].iterrows():
    header = row['header']
    fits = calculate_fits(header, cache)
    best_fits = reduce_fits(fits)
    # Write only the single 'fits' cell. The row yielded by iterrows() is not
    # modified and written back as a whole.
    table_schemata.at[idx, 'fits'] = best_fits

In [16]:
# Preview the first five schemata together with their computed fits.
table_schemata.head(n=5)

Unnamed: 0,header,fits
0,"[Date, Time, ACC Team, Big Ten Team, Location,...","{'Location': 'Location Attendance', 'Big Ten T..."
1,"[Institution, Wins, Loss, Home Wins, Home Loss...","{'Away Wins': 'Opponent in the final', 'Away L..."
2,"[Pick #, MLS Team, Player, Position, Affiliation]","{'Pick #': 'Series #', 'Position': 'Position',..."
3,"[DVD title, Number of Episodes, Region 2, Regi...","{'Region 1 (US)': 'Year (most recent)', 'Numbe..."
4,"[Year, Coach, Crew, Record, Win %]","{'Win %': '% 60-74', 'Year': 'Last Year', 'Rec..."


In [18]:
rows = []
for idx, ts_row in table_schemata.iterrows():
    # Read from the row of the current iteration (`ts_row`), not from a
    # `row` variable left over from an earlier cell.
    header, fits = ts_row[['header', 'fits']]
    if idx == 5:
        break

Vorgehen:
1. Für jede Tabelle eine Spalte finden, die aus dem Quellschema stammen könnte -> diese als Eingabe verwenden  
Format: Quellspalte -> [Zielspalte]