In [None]:
from queries import *
import json

%load_ext autoreload
%autoreload 2

In [None]:
# Run the query and serialize the result BEFORE opening the output file:
# open(..., 'w') truncates immediately, so if the query (or the JSON encoding)
# raised afterwards, 'explanation.json' would be left empty or half-written.
# Alternative, broader query kept from the original cell:
#   query_explain('barbaresco', 'ceretto', 'RED')
hits = query_explain('barbaresco gallina', 'ceretto', 'RED')
serialized_hits = json.dumps(hits, indent=2)

with open('explanation.json', 'w') as f:
    f.write(serialized_hits)

In [None]:
# Query and serialize first, then write: opening with 'w' truncates the file
# right away, so a failure in query_es_clean or in the JSON encoding would
# otherwise leave 'bbb.json' empty or half-written.
bbb = query_es_clean('barbaresco gallina', 'ceretto', 'RED')
serialized_bbb = json.dumps(bbb, indent=2)

with open('bbb.json', 'w') as f:
    f.write(serialized_bbb)

# Analysis of v3 DataFrames

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Concatenate all datasets and clean them up. The cleanup steps are:

 - add a `'client'` column (the name of the directory each dataset was read from)
 - remove rows with null scores (for some reason, only laite has them)
 - convert the `'ok'` column from `str` to `bool` (for some reason, it is a string only for vinessa)

In [None]:
def _parse_ok(value):
    """Convert one raw ``'ok'`` cell to a ``bool``.

    Strings are interpreted by their text, because ``bool('False')`` is
    ``True`` (any non-empty string is truthy), which would silently mark
    every string-typed row as OK.

    Args:
        value: the raw cell value, either a string or something already
            bool-like.

    Returns:
        ``True`` or ``False``.

    Raises:
        ValueError: if ``value`` is a string that is not a recognised
            true/false spelling, so unexpected data is noticed instead of
            being mislabelled.
    """
    if isinstance(value, str):
        text = value.strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise ValueError(f"unrecognised value in 'ok' column: {value!r}")
    # Non-string values keep the original bool() conversion.
    return bool(value)


dfs = []

# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the row order of the concatenated frame differ between machines/runs.
for element in sorted(os.listdir('./')):
    element_path = os.path.join('./', element)
    # Only plain sub-directories are treated as client datasets.
    if not os.path.isdir(element_path) or element == '__pycache__' or '.' in element:
        continue
    df = pd.read_csv(os.path.join(element_path, 'v3-selection.csv'))

    # cleanup
    df['client'] = element
    # .copy() so the 'ok' assignment below writes to an independent frame
    # rather than to a slice of the one returned by read_csv.
    df = df.loc[df['score'].notna()].copy()
    df['ok'] = df['ok'].apply(_parse_ok).astype(bool)
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)
df

In [None]:
# Split the rows by the value of the boolean 'ok' column.
df_ok = df.loc[df['ok'].eq(True)]
df_not = df.loc[df['ok'].eq(False)]

# Summary statistics of the score for both groups, side by side.
display(pd.DataFrame({'OK': df_ok['score'].describe(), 'NOT OK': df_not['score'].describe()}))

fig, ax = plt.subplots()

# Shared 2-point-wide bins so the two histograms are directly comparable.
bins = np.arange(0, 105, 2)
df_ok['score'].hist(ax=ax, bins=bins, density=True, alpha=0.5, color='C0', label='OK')
df_not['score'].hist(ax=ax, bins=bins, density=True, alpha=0.5, color='C1', label='NOT OK')

# KDE curves drawn in the same colour as the matching histogram.
df_ok['score'].plot.kde(ax=ax, color='C0', label='OK (KDE)')
df_not['score'].plot.kde(ax=ax, color='C1', label='NOT OK (KDE)')

ax.set_xlim(bins[0], bins[-1])

# Labels and legend: without them the two groups cannot be told apart
# when the figure is read on its own.
ax.set_xlabel('score')
ax.set_ylabel('density')
ax.set_title('Score distribution: OK vs NOT OK')
ax.legend()

plt.show()

In [None]:
from scipy.stats import gaussian_kde

# Evaluation grid: the observed score range widened by `padding` on both
# sides, sampled at 1001 evenly spaced points.
padding = 10
score_low = df['score'].min() - padding
score_high = df['score'].max() + padding
eval_range = np.linspace(score_low, score_high, 1001)
eval_step = eval_range[1] - eval_range[0]

# One Gaussian KDE per group, both evaluated on the shared grid.
kernel_ok, kernel_not = gaussian_kde(df_ok['score']), gaussian_kde(df_not['score'])
dist_ok, dist_not = kernel_ok(eval_range), kernel_not(eval_range)

# Rectangle-rule area under each estimated density.
for group_label, density in (('OK', dist_ok), ('NOT', dist_not)):
    print(f'Area of {group_label}: {density.sum()*eval_step:.3f}')

# Pointwise min / max of the two densities give the overlap and the union;
# their ratio is the intersection-over-union of the two distributions.
overlap = eval_step * np.minimum(dist_ok, dist_not).sum()
union = eval_step * np.maximum(dist_ok, dist_not).sum()
iou = overlap / union

for density in (dist_ok, dist_not):
    plt.plot(eval_range, density)

print(f'Overlap: {overlap:.3f}', f'Union: {union:.3f}', f'IoU: {iou:.3f}', sep='\n')

# Analysis on all-selections

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from queries import query_by_id

In [None]:
# Read the selections export into `df_original` and preview its first rows.
selections_csv = 'all-selections.csv'
df_original = pd.read_csv(selections_csv)
df_original.head()

Extract only the useful columns:
 - `'name'`
 - `'winery_name'`
 - `'type'`
 - `'matched_id'`

And rename `'matched_id'` to `'correct_id'`.

This is because `'matched_id'` is the only reliable match column: `'matched_name'` and `'matched_winery_name'` are incorrect, so I have to rebuild them by querying the Elasticsearch index by id.

In [None]:
# Keep only the columns needed downstream and rename 'matched_id'.
# Plain [[...]] indexing is used instead of DataFrame.get(): .get() returns
# None when a requested column is missing, which would only surface one step
# later as a confusing AttributeError on .rename(); [[...]] raises a KeyError
# that names the missing column.
useful_columns = ['name', 'winery_name', 'type', 'matched_id']
df = df_original[useful_columns].rename(columns={'matched_id': 'correct_id'})
df.head()

In [None]:
# Number of ids to look up; the built-in len() is the idiomatic way to call __len__.
len(df['correct_id'])

In [None]:
# Call query_by_id once per 'correct_id' value and store whatever it returns
# in a new 'responses' column.
# NOTE(review): presumably every call queries the Elasticsearch index, so this
# cell may be slow on the full frame — confirm.
df['responses'] = df['correct_id'].apply(query_by_id)

In [None]:
# Call query_by_id on every id in turn and discard the result; only the running
# index is printed, ending in '\r' instead of a newline.
# NOTE(review): looks like a probe to find the id on which query_by_id fails — confirm.
ids_to_probe = df['correct_id']
for i, wine_id in enumerate(ids_to_probe):
    print(i, end='\r')
    query_by_id(wine_id)

In [None]:
# First ten ids as a plain Python list (slice first, then convert).
list(df['correct_id'].iloc[:10])

In [None]:
# Pass the first ten ids to query_by_id as one list and show what comes back.
first_ids = list(df['correct_id'].iloc[:10])
response = query_by_id(first_ids)
response