Mögliche Labels:
- eindeutige Positionierung der Zielspalte
- mehrfache Positionierungen der Zielspalten, sortiert nach deren Kosinus-Ähnlichkeit

In [16]:
import json
import random
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from utils.preprocessing import WordEmbedding, load_word_emb

In [17]:
# Where the pre-computed word-embedding artefacts live (presumably GloVe
# vectors, judging by the directory name -- confirm against load_word_emb).
w2v_config = dict(
    data_dir='data/glove',
    word2idx_path='word2idx.json',
    usedwordemb_path='usedwordemb.npy',
)

# Wrap the loaded embedding data in the project's lookup object; later cells
# call `w2v(word)` to obtain the vector for a single lower-cased word.
w2v = WordEmbedding(
    load_word_emb(
        w2v_config['data_dir'],
        w2v_config['word2idx_path'],
        w2v_config['usedwordemb_path'],
    )
)

In [18]:
# Load the WikiSQL table schemata (JSON Lines: one JSON object per line).
# The encoding is passed explicitly so the result does not depend on the
# platform's locale default, the file is streamed line by line instead of
# being read into a list first, and blank lines are skipped because
# json.loads cannot parse them.
with open('data/wikisql/tables.jsonl', encoding='utf-8') as file:
    records = [json.loads(line) for line in file if line.strip()]

# Keep only the column headers and add an (empty) 'source' column.
table_schemata = pd.DataFrame(records).assign(source='')[['source', 'header']]
table_schemata.head()

Unnamed: 0,source,header
0,,"[Date, Time, ACC Team, Big Ten Team, Location,..."
1,,"[Date, Time, ACC Team, Big Ten Team, Location,..."
2,,"[Date, Time, ACC Team, Big Ten Team, Location,..."
3,,"[Institution, Wins, Loss, Home Wins, Home Loss..."
4,,"[Pick #, MLS Team, Player, Position, Affiliation]"


In [19]:
# Work on the first 100 tables only to keep the experiment small.
table_schemata = table_schemata.iloc[:100]

In [59]:
# Draw a reproducible random sample of distinct column names to serve as
# candidate columns, and pre-compute one embedding per candidate.
CANDIDATE_SEED = 42   # fixed seed so "Restart & Run All" yields the same sample
N_CANDIDATES = 100    # number of distinct column names to sample

# Distinct column names over all loaded tables. The set is sorted before
# shuffling because the iteration order of a set of strings can change
# between interpreter runs, which would defeat the seed.
all_headers = sorted({column
                      for header in table_schemata['header']
                      for column in header})

# A dedicated RNG instance keeps the global `random` state untouched.
random.Random(CANDIDATE_SEED).shuffle(all_headers)

# Treat '/' and '_' as word separators so that every part is embedded as a
# separate word.
candidates = [column.replace('/', ' ').replace('_', ' ')
              for column in all_headers[:N_CANDIDATES]]

# candidate name -> mean of the word vectors of its lower-cased words.
# Candidates without any word (e.g. a column that was just '/') are left out:
# the mean of an empty list is NaN and would break the similarity computation.
cache = {}
for candidate in candidates:
    words = candidate.split()
    if not words:
        continue
    cache[candidate] = np.mean([w2v(word.lower()) for word in words], axis=0)

In [62]:
# For one table, assign every candidate column to the header column it is most
# similar to (cosine similarity between mean word vectors).
for idx, row in table_schemata.iterrows():
    header = row['header']

    # Embed each header column as the mean of its word vectors, applying the
    # same '/' and '_' normalisation that is used for the candidates so that a
    # column and its own candidate form get the same embedding. Columns
    # without any word are left out, because the mean of an empty list is NaN
    # and would break the similarity computation.
    embedded_columns = []
    embs = []
    for col in header:
        words = col.replace('/', ' ').replace('_', ' ').split()
        if words:
            embedded_columns.append(col)
            embs.append(np.mean([w2v(word.lower()) for word in words], axis=0))

    fits = defaultdict(list)
    if embs:
        for candidate, embedding in cache.items():
            # Compare the candidate against the header columns only, instead
            # of building the full pairwise matrix and discarding all but one
            # row of it.
            similarity = cosine_similarity([embedding], embs)[0]
            max_pos = np.argmax(similarity)
            max_sim = similarity[max_pos]

            # append the best candidate with its similarity to the header
            # to the lists of candidates for the given header
            # i.e. fits['ACC Team'] -> [('Team', 0.75), ('Coach', 0.64), (<new candidate>, <similarity>)]
            fits[embedded_columns[max_pos]].append((candidate, max_sim))

    # Deliberately stop after the first table: `fits` is rebuilt on every
    # iteration, so only one table can be inspected at a time.
    # TODO: collect the fits per table before removing this `break`.
    break

In [77]:
# For every header column keep only its highest-scoring candidate as a
# (candidate, similarity) tuple; on equal scores the candidate that was
# appended first wins, because the sort is stable.
best_fits = {
    column: sorted(scored, key=lambda pair: pair[1], reverse=True)[0]
    for column, scored in fits.items()
}

In [78]:
# Show the best (candidate, similarity) pair found for each header column.
best_fits

{'Date': ('January', 0.5904744733794349),
 'ACC Team': ('Team', 0.8743069497292355),
 'Attendance': ('Decile', 0.13113325815406326),
 'Location': ('Location Attendance', 0.835072308353462),
 'Challenge Leader': ('Challenge Leader', 0.9999999999999996),
 'Big Ten Team': ('Big Ten Team', 0.9999999999999999),
 'Television': ('Television', 1.0),
 'Time': ('Current status', 0.6296626386395778),
 'Winner': ('Winner', 1.0)}

Vorgehen:
1. Für jede Tabelle eine Spalte finden, die aus dem Quellschema stammen könnte -> diese dient als Eingabe  
Format: Quellspalte -> [Zielspalte]