In [None]:
from queries import *
import json

%load_ext autoreload
%autoreload 2

In [None]:
# Run the query before opening the output file: opening with 'w' truncates it, so a
# query that raises would otherwise leave an empty explanation.json behind.
hits = query_explain('barbaresco gallina', 'ceretto', 'RED')
# hits = query_explain('barbaresco', 'ceretto', 'RED')
with open('explanation.json', 'w') as f:
    json.dump(hits, f, indent=2)

In [None]:
# Run the query before opening the output file: opening with 'w' truncates it, so a
# query that raises would otherwise leave an empty bbb.json behind.
bbb = query_es_clean('barbaresco gallina', 'ceretto', 'RED')
with open('bbb.json', 'w') as f:
    json.dump(bbb, f, indent=2)

# Analysis of v3 Dataframes

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Concatenate all datasets, and perform cleanup. The cleanup steps are:

 - add client column
 - remove null scores (for some reason, only laite has them)
 - convert `'ok'` column from `str` to `bool` (for some reason, only for vinessa)

In [None]:
# Accepted spellings for string-typed 'ok' cells (compared after strip + lower).
_TRUE_STRINGS = {'true', '1', 'yes'}
_FALSE_STRINGS = {'false', '0', 'no', ''}


def _parse_ok(value):
    """Convert one raw 'ok' cell to a bool.

    Strings are interpreted by their content: `bool('False')` is `True`, so a plain
    `bool()` call would flag every non-empty string as OK. Unknown strings raise a
    ValueError instead of being silently mislabelled. Non-string values keep the
    plain `bool()` conversion.
    """
    if isinstance(value, str):
        token = value.strip().lower()
        if token in _TRUE_STRINGS:
            return True
        if token in _FALSE_STRINGS:
            return False
        raise ValueError(f"Unrecognised 'ok' value: {value!r}")
    return bool(value)


dfs = []

for element in os.listdir('./'):
    element_path = os.path.join('./', element)
    # Only plain sub-directories are treated as client folders; anything containing a dot is skipped.
    if not os.path.isdir(element_path) or element=='__pycache__' or '.' in element:
        continue
    df = pd.read_csv(os.path.join(element_path, 'v3-selection.csv'))

    # cleanup
    # .copy() so the column assignments below write to an independent frame, not a slice.
    df = df.loc[~df['score'].isna()].copy()
    df['client'] = element
    df['ok'] = df['ok'].map(_parse_ok)
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)
df

In [None]:
# Split the rows on the 'ok' flag; both subsets are reused by later cells.
ok_mask = df['ok'] == True
df_ok = df.loc[ok_mask]
df_not = df.loc[df['ok'] == False]

# Side-by-side summary statistics of the score for both subsets.
score_summary = pd.DataFrame({
    'OK': df_ok['score'].describe(),
    'NOT OK': df_not['score'].describe(),
})
display(score_summary)

fig, ax = plt.subplots()
bins = np.arange(0, 105, 2)

# Density-normalised histograms first (OK, then NOT OK) ...
for subset in (df_ok, df_not):
    subset['score'].hist(ax=ax, bins=bins, density=True, alpha=0.5)

# ... then the KDE curves, coloured to match the histogram they belong to.
for subset, colour in ((df_ok, 'C0'), (df_not, 'C1')):
    subset['score'].plot.kde(ax=ax, color=colour)

# Clip the x axis to the histogram range.
ax.set_xlim(bins[0], bins[-1])

plt.show()

In [None]:
from scipy.stats import gaussian_kde

# Evaluation grid: the observed score range widened by `padding` on both sides.
padding = 10
score_low = df['score'].min() - padding
score_high = df['score'].max() + padding
eval_range = np.linspace(score_low, score_high, 1001)
eval_step = eval_range[1] - eval_range[0]


def _area(density):
    """Approximate the area under a curve sampled on `eval_range` (sum of samples times step)."""
    return density.sum()*eval_step


# One Gaussian KDE per subset, both sampled on the same grid.
kernel_ok = gaussian_kde(df_ok['score'])
kernel_not = gaussian_kde(df_not['score'])
dist_ok = kernel_ok(eval_range)
dist_not = kernel_not(eval_range)

print(f'Area of OK: {_area(dist_ok):.3f}')
print(f'Area of NOT: {_area(dist_not):.3f}')

# Pointwise minimum / maximum of the two densities give the overlap and the union.
overlap = _area(np.minimum(dist_ok, dist_not))
union = _area(np.maximum(dist_ok, dist_not))
iou = overlap/union

for density in (dist_ok, dist_not):
    plt.plot(eval_range, density)

print(f'Overlap: {overlap:.3f}')
print(f'Union: {union:.3f}')
print(f'IoU: {iou:.3f}')

# Analysis on all-selections.csv

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from queries import query_by_id

## Prepare data

Read `all-selections.csv`

In [None]:
# Load the selections CSV from the working directory and preview its first rows.
df_original = pd.read_csv('all-selections.csv')
df_original.head()

Extract only the useful columns:
 - `'name'`
 - `'winery_name'`
 - `'type'`
 - `'matched_id'`

Additional actions:
 - rename `'matched_id'` to `'correct_id'`.

This is because the only correct match is `'matched_id'`, and the columns `'matched_name'`, `'matched_winery_name'` are incorrect. I have to build them again by querying the ElasticSearch index by id.


In [None]:
# Select with [[...]] rather than .get(): a missing column then raises a KeyError that
# names it, instead of .get() returning None and failing on the .rename() call.
df = (
    df_original[['name', 'winery_name', 'type', 'matched_id']]
    .rename(columns={'matched_id': 'correct_id'})
)
df.head()

Get the correct values for `'name'`, `'winery_name'` and `'type'`:
 - Query the ElasticSearch index by id
 - generate dictionary from id to names
 - add new columns to dataframe

In [None]:
# Fetch the index records for every correct id and show the first one as a sample.
response = query_by_id(list(df['correct_id']))
display(response[0])

# Index the returned records by their 'id' field for direct lookup.
id_mapping = {}
for record in response:
    id_mapping[record['id']] = record

# Add one 'correct_<field>' column per field, looked up through the id mapping.
for field in ('name', 'winery_name', 'type'):
    df['correct_' + field] = df['correct_id'].map(
        lambda wine_id, field=field: id_mapping[wine_id][field]
    )

df.head()

In [None]:
# Positions of the rows whose looked-up 'correct_type' differs from the original 'type'.
np.where(df['correct_type'] != df['type'])[0]

## Build vocabulary of all words

TODO: get the document frequencies of all the words in the 2 vocabularies

In [None]:
def _unique_words(column):
    """Return the sorted distinct whitespace-separated tokens found in a column of strings."""
    words = set()
    for tokens in column.map(str.split):
        words.update(tokens)
    return sorted(words)


vocabulary_name = _unique_words(df['name'])
vocabulary_winery_name = _unique_words(df['winery_name'])

# Vocabulary sizes: wine names first, winery names second.
print(len(vocabulary_name))
print(len(vocabulary_winery_name))

## Refactoring queries

TODO: modularize the queries, write docstrings, and move them into `queries.py`

In [None]:
import requests

def condition_name(name: str):
    """Build the `multi_match` clause that searches the `name` field.

    Args:
        name: Text to search for.

    Returns:
        A dict with a single `multi_match` entry using `best_fields`, the `or`
        operator and `AUTO` fuzziness.
    """
    clause = {
        "query": name,
        "fields": ["name"],
        "type": "best_fields",
        "operator": "or",
        "fuzziness": "AUTO",
    }
    return {"multi_match": clause}


def condition_winery_name(winery_name: str):
    """Build the `multi_match` clause that searches the `winery_name` field.

    Args:
        winery_name: Text to search for.

    Returns:
        A dict with a single `multi_match` entry on `winery_name^2`, using
        `best_fields`, the `or` operator and a fuzziness of `"1"`.
    """
    clause = {
        "query": winery_name,
        "fields": ["winery_name^2"],
        "type": "best_fields",
        "operator": "or",
        "fuzziness": "1",
    }
    return {"multi_match": clause}


def condition_wine_type(wine_type: str):
    """Build a `constant_score` clause matching the wine type exactly.

    Args:
        wine_type: Wine type; it is upper-cased before being compared with
            `type.keyword`.

    Returns:
        A dict with a single `constant_score` entry wrapping a `term` filter,
        with `boost` set to 0.
    """
    term_filter = {"term": {"type.keyword": wine_type.upper()}}
    return {"constant_score": {"filter": term_filter, 'boost': 0}}


def query_es_2(conditions: list, num_results: int = 1):
    """POST a `bool`/`must` search to the wines index and return the decoded JSON.

    Args:
        conditions: Clauses placed under `query.bool.must`.
        num_results: Value sent as the `size` of the search.

    Returns:
        The decoded JSON body, or None if the request, the HTTP status check or
        the JSON decoding raised. The error is printed in that case.
    """
    url = 'https://es.vinoteqa.com/wines/_search'
    headers = {'Content-Type': 'application/json'}
    query = {
        "query": {
            "bool": {
                "must": conditions
            }
        },
        "size": num_results
    }

    try:
        response = requests.post(url, headers=headers, json=query)
        # Treat HTTP error statuses as failures: otherwise an error body would be
        # returned as if it were a search result.
        response.raise_for_status()
        return response.json()
    except Exception as e:
        print('ERROR', e)
        return None


def query_es_clean_2(conditions, match_rank: int = 1):
    """Return the source document and score of the hit at position `match_rank`.

    Args:
        conditions: Clauses forwarded to `query_es_2`.
        match_rank: 1-based position of the wanted hit; also used as the number
            of results requested.

    Returns:
        A `(source, score)` tuple, or `(None, 0)` when the query returned None or
        produced fewer than `match_rank` hits.
    """
    payload = query_es_2(conditions, match_rank)
    if payload is None:
        return None, 0

    results = payload['hits']['hits']
    position = match_rank - 1
    if len(results) <= position:
        return None, 0

    chosen = results[position]
    return chosen['_source'], chosen['_score']


def query_explain_2(conditions: list, matched_id: str = None):
    """Request the score explanation of one document for the given conditions.

    Args:
        conditions: Clauses placed under `query.bool.must`.
        matched_id: Id of the document to explain. When None, the `id` of the
            top hit returned by `query_es_clean_2` is used.

    Returns:
        The decoded JSON body of the `_explain` request, or None (after printing
        'No match') when no id was given and the search found nothing.
    """
    if matched_id is None:
        top_hit, _ = query_es_clean_2(conditions)
        if top_hit is None:
            print('No match')
            return
        matched_id = top_hit['id']

    # perform explanation query
    endpoint = 'https://es.vinoteqa.com/wines/_explain/' + matched_id
    payload = {"query": {"bool": {"must": conditions}}}
    reply = requests.post(
        endpoint,
        headers={'Content-Type': 'application/json'},
        json=payload,
    )
    return reply.json()

## Get document frequencies of each word in the vocabulary

In [None]:
def find_node(explanation, name: str):
    """Depth-first search of an explanation tree for a node with a given description.

    Args:
        explanation: A node dict with `description`, `value` and `details` keys
            (`details` being the list of child nodes), or None.
        name: The `description` to look for.

    Returns:
        The `value` of the first matching node in pre-order, or None when
        `explanation` is None or no child search yields a non-None value.
    """
    if explanation is None:
        return None

    if explanation['description'] == name:
        return explanation['value']

    # Lazy generator: children are searched in order and the search stops at the
    # first one that yields a non-None value.
    child_results = (find_node(child, name) for child in explanation['details'])
    return next((value for value in child_results if value is not None), None)


def get_explanation(word: str, index: str = 'name'):
    """Fetch the explanation tree for a single word searched in one field.

    Args:
        word: Term to search for.
        index: Field to search, either 'name' or 'winery_name'.

    Returns:
        The `explanation` entry of the `query_explain_2` result, or None when that
        call returned None.

    Raises:
        ValueError: If `index` is neither 'name' nor 'winery_name'.
    """
    if index not in ('name', 'winery_name'):
        raise ValueError("The index must be either 'name' or 'winery_name'")

    if index == 'name':
        condition = condition_name(word)
    else:
        condition = condition_winery_name(word)

    result = query_explain_2(condition)
    return None if result is None else result['explanation']


def get_document_occurrences(word: str, index: str = 'name'):
    """Return the 'n, number of documents containing term' value for a word.

    Args:
        word: Term to look up.
        index: Field to search, forwarded to `get_explanation`.

    Returns:
        The value of the first explanation node with that description, or 0 when
        there is no explanation or no such node. Both "not found" cases now yield
        0 rather than a mix of 0 and None.
    """
    explanation = get_explanation(word, index)
    if explanation is None:
        return 0
    count = find_node(explanation, 'n, number of documents containing term')
    return 0 if count is None else count


def get_total_documents(word: str, index: str = 'name'):
    """Return the 'N, total number of documents with field' value for a word's explanation.

    Args:
        word: Term whose explanation is inspected.
        index: Field to search, forwarded to `get_explanation`.

    Returns:
        The value of the first explanation node with that description, or 0 when
        there is no explanation or no such node. Both "not found" cases now yield
        0 rather than a mix of 0 and None.
    """
    explanation = get_explanation(word, index)
    if explanation is None:
        return 0
    total = find_node(explanation, 'N, total number of documents with field')
    return 0 if total is None else total


In [None]:
# Peek at the first (alphabetically smallest) entry of the sorted name vocabulary.
vocabulary_name[0]

In [None]:
import time
from utils.list_manipulation import split_list_2
from utils.time_utils import compute_eta, format_seconds

def get_occurrences(vocabulary: list, index: str = 'name'):
    """Look up the document count of every word, one query at a time.

    A progress line (position, time per step, elapsed time, ETA) is rewritten in
    place with a carriage return on each iteration. The ETA and duration formatting
    are delegated to `compute_eta` and `format_seconds` from `utils.time_utils`.

    Args:
        vocabulary: Words to look up.
        index: Field to query, forwarded to `get_document_occurrences`. Defaults
            to 'name', the field that was previously hard-coded.

    Returns:
        A dict mapping each word to the value returned by
        `get_document_occurrences`.
    """
    occurrences = {}
    total = len(vocabulary)

    t0 = time.time()
    t1 = t0
    eta = None
    for i, word in enumerate(vocabulary):
        # ETA: t - t1 is the duration of the previous iteration (0 on the first one).
        t = time.time()
        eta = compute_eta(t-t1, total-i, eta)
        print(f'{i} of {total}, {word}'.ljust(50) +
              f"T/step: {t-t1:.2f}s, Elapsed: {format_seconds(t-t0)}, ETA: {format_seconds(eta)}.",
              end='\r')
        occurrences[word] = get_document_occurrences(word, index)
        t1 = t
    return occurrences


import os
from multiprocessing import Pool

def parallel_download(vocabulary, num_jobs=None):
    """Run `get_occurrences` over chunks of the vocabulary in a process pool.

    Args:
        vocabulary: Words to look up.
        num_jobs: Number of worker processes and of chunks. Defaults to
            `os.cpu_count()`, or 1 when that is unavailable.

    Returns:
        A single dict merging the per-chunk dicts returned by `get_occurrences`.
    """
    if num_jobs is None:
        # os.cpu_count() may return None; fall back to a single worker.
        num_jobs = os.cpu_count() or 1

    # assumes split_list_2 returns an iterable of sub-lists of `vocabulary` — TODO confirm
    chunks = split_list_2(vocabulary, num_jobs)

    # The context manager shuts the pool down; map() has already collected every result.
    with Pool(num_jobs) as pool:
        partial_results = pool.map(get_occurrences, chunks)

    merged = {}
    for partial in partial_results:
        merged.update(partial)
    return merged

In [None]:
# Sequential download: one lookup per word of the name vocabulary.
word_frequency_name = get_occurrences(vocabulary_name)

In [None]:
# Same download spread over worker processes; rebinds the name set by the sequential run.
word_frequency_name = parallel_download(vocabulary_name)

In [None]:
# Display the collected result.
word_frequency_name