Mögliche Labels:
- eindeutige Positionierung der Zielspalte
- mehrfache Positionierungen der Zielspalten, sortiert nach deren Kosinus-Ähnlichkeit

In [8]:
from utils.preprocessing import WordEmbedding, load_word_emb
import random
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from collections import defaultdict

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Paths handed to load_word_emb: the GloVe data directory plus two file names
# (judging by their names: a word-to-index map and the embedding matrix).
w2v_config = dict(
    data_dir='data/glove',
    word2idx_path='word2idx.json',
    usedwordemb_path='usedwordemb.npy',
)

# Wrap the loaded embedding data in the project's WordEmbedding helper; the
# cells below call it as w2v(<lower-cased token>).
w2v = WordEmbedding(
    load_word_emb(
        w2v_config['data_dir'],
        w2v_config['word2idx_path'],
        w2v_config['usedwordemb_path'],
    )
)

In [3]:
# load table schemata
with open('data/wikisql/tables.jsonl') as file:
    table_schemata = pd.DataFrame([json.loads(line) for line in file.readlines()])
table_schemata['fits_1to1'] = ''
table_schemata['fits_1to0'] = ''
table_schemata['header'] = table_schemata['header'].apply(lambda x: '<|>'.join(x)) # needed to drop duplicates
table_schemata = table_schemata[['header', 'fits_1to1', 'fits_1to0']].drop_duplicates().reset_index(drop=True)
table_schemata['header'] = table_schemata['header'].apply(lambda x: x.split('<|>')) # rebuild original state
table_schemata.head()

Unnamed: 0,header,fits_1to1,fits_1to0
0,"[Date, Time, ACC Team, Big Ten Team, Location,...",,
1,"[Institution, Wins, Loss, Home Wins, Home Loss...",,
2,"[Pick #, MLS Team, Player, Position, Affiliation]",,
3,"[DVD title, Number of Episodes, Region 2, Regi...",,
4,"[Year, Coach, Crew, Record, Win %]",,


In [4]:
# Keep only the first 10,000 schemata for the following cells.
# NOTE: this rebinds table_schemata, so the full frame is only recoverable by re-running the load cell.
table_schemata = table_schemata.head(10000)

In [5]:
# Collect every distinct column name that occurs in any schema.
# Sorting gives a stable starting order (iteration order of a set of strings can
# differ between interpreter runs) and the dedicated seeded generator makes the
# sampled candidate pool reproducible without touching the global random state.
all_headers = sorted({column for header in table_schemata['header'].values for column in header})
random.Random(42).shuffle(all_headers)

# Sample 1000 column names and treat '/' and '_' as word separators.
candidates = [column.replace('/', ' ').replace('_', ' ') for column in all_headers[:1000]]

# candidate name -> mean embedding of its lower-cased, whitespace-separated tokens.
# Names without any token ('' or whitespace only) are skipped: np.mean over an
# empty list is nan, which is not a usable embedding.
cache = {
    candidate: np.mean([w2v(word.lower()) for word in candidate.split()], axis=0)
    for candidate in candidates
    if candidate.split()
}

In [11]:
# Shrink the frame to its first 5 schemata; every later cell only sees these rows.
table_schemata = table_schemata.head(5)

In [18]:


def calculate_1to1(header, cache, eps=0.001):
    """Assign each cached candidate to the header column it is most similar to.

    Relies on the notebook-level ``w2v`` callable to embed the header words.

    Args:
        header: list of column names of one table schema.
        cache: dict mapping a candidate name to its (mean) word embedding.
        eps: tolerance for "identical"; a candidate whose best cosine
            similarity is within ``eps`` of 1.0 is skipped.

    Returns:
        defaultdict(list) mapping a header column to a list of
        ``(candidate, similarity)`` tuples, e.g.
        ``fits['ACC Team'] -> [('Team', 0.75), ('Coach', 0.64)]``.
        Candidates that cannot be compared are left out.
    """
    # one vector per column: mean over the embeddings of its lower-cased words
    embs = [np.mean([w2v(word.lower()) for word in col.split()], axis=0) for col in header]
    fits = defaultdict(list)
    for candidate, embedding in cache.items():
        try:
            # Only candidate-vs-column similarities are needed, so compare the
            # single candidate row against the header rows instead of building
            # the full pairwise matrix of all vectors.
            similarity = cosine_similarity([embedding], embs)[0]
            max_pos = np.argmax(similarity)
            max_sim = similarity[max_pos]
        except Exception:
            # Best effort: vectors that cannot be compared (e.g. a column or
            # candidate without any token yields nan instead of a vector) are
            # skipped. Unlike a bare ``except:`` this no longer swallows
            # KeyboardInterrupt, so the surrounding loop can still be stopped.
            continue

        # Skip (near-)exact matches. Cosine similarity is bounded below by -1,
        # so the abs() of the original check had no effect and is dropped.
        if max_sim + eps >= 1.0:
            continue

        # remember the candidate under the column it matches best
        fits[header[max_pos]].append((candidate, max_sim))
    return fits

def calculate_1t0(header, cache, threshold=0.6):
    """Collect candidates that are not similar to any column of the header.

    Relies on the notebook-level ``w2v`` callable to embed the header words.

    Args:
        header: list of column names of one table schema.
        cache: dict mapping a candidate name to its (mean) word embedding.
        threshold: a candidate qualifies only if its highest cosine
            similarity to any header column is below this value.

    Returns:
        defaultdict(list) mapping a header column to a list of
        ``(candidate, similarity)`` tuples, where the column is the one the
        candidate is LEAST similar to and ``similarity`` is that minimum.
        Candidates that cannot be compared are left out.
    """
    # one vector per column: mean over the embeddings of its lower-cased words
    embs = [np.mean([w2v(word.lower()) for word in col.split()], axis=0) for col in header]
    fits = defaultdict(list)
    for candidate, embedding in cache.items():
        try:
            # Only candidate-vs-column similarities are needed, so compare the
            # single candidate row against the header rows instead of building
            # the full pairwise matrix of all vectors.
            similarity = cosine_similarity([embedding], embs)[0]
            max_sim = np.max(similarity)
            min_pos = np.argmin(similarity)
            min_sim = similarity[min_pos]
        except Exception:
            # Best effort: skip vectors that cannot be compared. Unlike a bare
            # ``except:`` this lets KeyboardInterrupt propagate.
            continue

        # if the candidate is not similar to any of the columns, append it to the fits
        if max_sim < threshold:
            fits[header[min_pos]].append((candidate, min_sim))
    return fits

def reduce_fits(good_fits, increase_similarity=True, lower_threshold=0.9, upper_threshold=0.3):
    """Keep at most one candidate per column, and only if it passes a threshold.

    Args:
        good_fits: mapping column -> list of ``(candidate, similarity)`` tuples.
        increase_similarity: if True, pick the MOST similar candidate of each
            column and keep it when its similarity is ``> lower_threshold``;
            if False, pick the LEAST similar candidate and keep it when its
            similarity is ``< upper_threshold``.
        lower_threshold: cut-off used when ``increase_similarity`` is True.
        upper_threshold: cut-off used when ``increase_similarity`` is False.

    Returns:
        dict mapping column -> name of the selected candidate. Columns whose
        selected candidate fails the threshold, or that have no candidates at
        all, are omitted.
    """
    # max/min return the first extremum they meet, which is the same element a
    # stable sort would put first, without sorting the whole list.
    select = max if increase_similarity else min
    best_fits = {}
    for column, candidate_tuples in good_fits.items():
        if len(candidate_tuples) == 0:
            # nothing to choose from (previously an IndexError)
            continue
        best_candidate = select(candidate_tuples, key=lambda fit: fit[1])
        similarity = best_candidate[1]
        if increase_similarity:
            accepted = similarity > lower_threshold
        else:
            accepted = similarity < upper_threshold
        if accepted:
            best_fits[column] = best_candidate[0]
    return best_fits

# Fill both fit columns for every schema in the frame.
for row_label, schema_row in tqdm(table_schemata.iterrows(), total=len(table_schemata)):
    columns = schema_row['header']

    # per column, the most similar candidate (kept only above reduce_fits' lower_threshold)
    schema_row['fits_1to1'] = reduce_fits(calculate_1to1(columns, cache), increase_similarity=True)
    # per column, the least similar candidate (kept only below reduce_fits' upper_threshold)
    schema_row['fits_1to0'] = reduce_fits(calculate_1t0(columns, cache), increase_similarity=False)

    # the Series handed out by iterrows is not guaranteed to be a view on the
    # frame, so the updated row is written back explicitly
    table_schemata.loc[row_label] = schema_row

100%|████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.11it/s]


In [19]:
# Show the frame with the fits_1to1 / fits_1to0 columns filled in by the loop above.
table_schemata

Unnamed: 0,header,fits_1to1,fits_1to0
0,"[Date, Time, ACC Team, Big Ten Team, Location,...",{'Time': 'Time Time Zone'},"{'Attendance': 'Campeonato Paulista', 'Winner'..."
1,"[Institution, Wins, Loss, Home Wins, Home Loss...",{},"{'Neutral Wins': 'Phoenician', 'Institution': ..."
2,"[Pick #, MLS Team, Player, Position, Affiliation]",{},"{'Affiliation': 'IntelliTrace', 'Player': 'Cho..."
3,"[DVD title, Number of Episodes, Region 2, Regi...",{},"{'DVD title': 'Cholim', 'Region 1 (US)': '100-..."
4,"[Year, Coach, Crew, Record, Win %]","{'Year': 'Last Year', 'Win %': 'Wins %'}","{'Coach': 'Resultado', 'Crew': '100-yr', 'Year..."


In [16]:
# WiP!
def calculate_1toN(header, cache, eps=0.001):
    """Work in progress: group cached candidates by their best-matching column.

    Currently applies the same logic as ``calculate_1to1``. Relies on the
    notebook-level ``w2v`` callable to embed the header words.

    Args:
        header: list of column names of one table schema.
        cache: dict mapping a candidate name to its (mean) word embedding.
        eps: tolerance for "identical"; a candidate whose best cosine
            similarity is within ``eps`` of 1.0 is skipped.

    Returns:
        defaultdict(list) mapping a header column to a list of
        ``(candidate, similarity)`` tuples, e.g.
        ``fits['ACC Team'] -> [('Team', 0.75), ('Coach', 0.64)]``.
        Candidates that cannot be compared are left out.
    """
    # one vector per column: mean over the embeddings of its lower-cased words
    embs = [np.mean([w2v(word.lower()) for word in col.split()], axis=0) for col in header]
    fits = defaultdict(list)
    for candidate, embedding in cache.items():
        try:
            # Only candidate-vs-column similarities are needed, so compare the
            # single candidate row against the header rows instead of building
            # the full pairwise matrix of all vectors.
            similarity = cosine_similarity([embedding], embs)[0]
            max_pos = np.argmax(similarity)
            max_sim = similarity[max_pos]
        except Exception:
            # Best effort: skip vectors that cannot be compared. Unlike a bare
            # ``except:`` this lets KeyboardInterrupt propagate.
            continue

        # Skip (near-)exact matches. Cosine similarity is bounded below by -1,
        # so the abs() of the original check had no effect and is dropped.
        if max_sim + eps >= 1.0:
            continue

        # remember the candidate under the column it matches best
        fits[header[max_pos]].append((candidate, max_sim))

    return fits

# Debug pass: for every schema, print the header columns that received at
# least one candidate, followed by an empty line.
for idx, row in tqdm(table_schemata.iterrows(), total=len(table_schemata)):
    header = row['header']
    fits_1toN = calculate_1toN(header, cache)
    for matched_column in fits_1toN:
        print(matched_column)
    print()

    # TODO: if one of the top candidates is also highly similar to other
    # columns, add that candidate to the common candidates

    ##### OLD CODE! ######
    #common_candidates = []
    #for header_col, candidate_tuples in fits_1toN.items():
    #    columns_containing_candidate = [header_col] # indicator which columns contain the candidate
    #    other_cols = header[:header.index(header_col)] + header[header.index(header_col) + 1:]
    #    for other_col in other_cols:
    #        other_candidates = [candidate_tuple[0] for candidate_tuple in fits_1toN[other_col]]
    #        this_candidates = [candidate_tuple[0] for candidate_tuple in \
    #                           sorted(candidate_tuples, key=lambda x: x[1], reverse=True)]
    #        for candidate in this_candidates:
    #            if candidate in other_candidates:
    #                print("X")
    

 40%|███████████████████████████▏                                        | 2/5 [00:00<00:00,  6.62it/s]

Date
Time
Big Ten Team
Location
Winner
ACC Team
Challenge Leader
Attendance
Television

Home Losses
Current Streak
Away Wins
Institution
Wins
Home Wins
Away Losses
Neutral Wins
Neutral Losses
Loss



 60%|████████████████████████████████████████▊                           | 3/5 [00:00<00:00,  6.78it/s]

Pick #
Position
MLS Team
Affiliation
Player



100%|████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  6.23it/s]

Region 1 (US)
Number of Episodes
Region 4 (AU)
Region 2
DVD title

Year
Win %
Record
Coach
Crew






In [None]:
# Preview the first rows of the schema frame before filtering.
table_schemata.head()

In [None]:
# Drop rows that have not been tested
table_schemata.drop(table_schemata.loc[(table_schemata['fits_1to1'] == '') & \
                                       (table_schemata['fits_1to0'] == '')].index, inplace=True)

In [None]:
# Flatten the 1:1 fits into one record per (source column, schema, target column).
records = []
for idx, ts_row in tqdm(table_schemata.iterrows(), total=len(table_schemata)):
    header, fits_1to1 = ts_row[['header', 'fits_1to1']]
    if not isinstance(fits_1to1, dict):
        # still the '' placeholder: this schema was never evaluated
        continue
    # skip schemata whose header has more than 30 space-separated words in total
    num_words_in_header = len((' '.join(header)).split(' '))
    if num_words_in_header > 30:
        continue
    for target_col, source_col in fits_1to1.items():
        # Raw string: '\m' is not a valid escape sequence. The value is
        # unchanged (backslash followed by 'math'); presumably this filters
        # LaTeX fragments among the column names.
        if r'k {\math' in source_col:
            continue
        records.append({
            'source_col': source_col,
            'input_cols': '<|>'.join(header),
            'target_col': target_col
        })
# explicit columns keep the schema stable even when no record was produced
rows = pd.DataFrame(records, columns=['source_col', 'input_cols', 'target_col'])
#rows.to_csv('data/training/schema_matching_raw.txt', index=False)

In [None]:
# Display the generated records (source_col, input_cols, target_col).
rows

In [None]:
# (number of records, number of columns)
rows.shape

Vorgehen:
1. Für jede Tabelle eine Spalte finden, die aus dem Quellschema stammen könnte, und diese als Eingabe verwenden.  
Format: Quellspalte -> [Zielspalte]