# Analysis on all-selections.csv

In [None]:
import numpy as np
import pandas as pd
from utils.queries import *

%load_ext autoreload
%autoreload 2

## Prepare data

Read `all-selections.csv`

In [None]:
# Load the raw selections export and preview its first five rows.
df_original = pd.read_csv(filepath_or_buffer='all-selections.csv')
df_original.head(n=5)

Extract only the useful columns:
 - `'name'`
 - `'winery_name'`
 - `'type'`
 - `'matched_id'`

Additional actions:
 - rename `'matched_id'` to `'correct_id'`.

Only `'matched_id'` is a reliable match: the `'matched_name'` and `'matched_winery_name'` columns are incorrect, so they have to be rebuilt by querying the Elasticsearch index by id.


In [None]:
# Keep only the columns needed for the analysis and rename the match id.
# Plain `[...]` indexing is used instead of `DataFrame.get`: `get` returns None
# when a column is missing, which would make the following `rename` fail with
# an unrelated AttributeError instead of a clear KeyError naming the column.
df = (
    df_original[['name', 'winery_name', 'type', 'matched_id']]
    .rename(columns={'matched_id': 'correct_id'})
)
df.head()

Get the correct values for `'name'`, `'winery_name'` and `'type'`:
 - Query the ElasticSearch index by id
 - generate dictionary from id to names
 - add new columns to dataframe

In [None]:
# Look up every `correct_id` and show the first returned record as a sample.
response = query_by_id(list(df['correct_id']))
display(response[0])

# Index the returned records by their id for direct lookup.
id_mapping = {record['id']: record for record in response}

# Add one `correct_<field>` column per field, read from the record matching `correct_id`.
for field in ('name', 'winery_name', 'type'):
    df[f'correct_{field}'] = df['correct_id'].map(
        lambda record_id, field=field: id_mapping[record_id][field]
    )

df.head()

In [None]:
# Positions of the rows whose `type` differs from the `correct_type` looked up by id.
type_mismatch = df['correct_type'] != df['type']
np.where(type_mismatch)[0]

## Build vocabulary of all words

And apply additional preprocessing

TODO: incorporate this additional preprocessing in the onboarding pipeline

In [None]:
import re
import unicodedata

def strip_accents(text: str):
    """Return `text` reduced to plain ASCII.

    The text is NFKD-normalised first, then every character that cannot be
    encoded as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode()

def remove_special_characters(text: str):
    """Replace every non-word character in `text` with a single space.

    Args:
        text: The string to clean.

    Returns:
        A string of the same length in which each character matched by the
        regex class `\\W` has been replaced by ' '.
    """
    # Raw string literal: '\W' in a normal string is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning from Python 3.12 on).
    return re.sub(r'\W', ' ', text)

def preprocessing(input_string):
    """Normalise a raw string.

    Steps, in order: replace special characters with spaces, strip accents,
    then lower-case the result.
    """
    without_specials = remove_special_characters(input_string)
    ascii_only = strip_accents(without_specials)
    return ascii_only.lower()

def tokenize(preprocessed_text):
    """Split a string into a list of tokens.

    A token starts with a run of word characters and/or dots and is extended
    by further non-whitespace characters as long as no word boundary lies
    directly before them.
    """
    token_pattern = r'[\w.]+(?:\B\S)*'
    tokens = re.findall(token_pattern, preprocessed_text)
    return tokens

In [None]:
def build_vocabulary(texts):
    """Return the sorted list of distinct tokens found in a Series of strings.

    Args:
        texts: pandas Series holding one text per row.

    Returns:
        Sorted list of the unique tokens produced by `tokenize` over all rows.
        Missing values (NaN) are skipped, because `tokenize` only accepts strings.
    """
    return sorted({word for doc in texts.dropna().map(tokenize) for word in doc})


# NOTE(review): `tokenize` is applied to the raw columns; `preprocessing` is defined
# in this notebook but is not applied here - confirm that this is intended.
vocabulary_name = build_vocabulary(df['name'])
vocabulary_winery_name = build_vocabulary(df['winery_name'])

print(len(vocabulary_name))
print(len(vocabulary_winery_name))

vocabulary_name

## Get document frequencies of each word in the vocabulary

In [None]:
from utils.word_occurrences import get_all_occurrences_db

# Process every word of the `name` vocabulary.
# NOTE(review): presumably the per-word results are stored in 'word_frequency_name.db'
# - confirm against utils.word_occurrences.
get_all_occurrences_db(vocabulary_name, 'word_frequency_name.db')

In [None]:
from utils.word_occurrences import get_all_occurrences_db

# Same call as for the `name` vocabulary, here for the `winery_name` vocabulary.
# NOTE(review): presumably the per-word results are stored in
# 'word_frequency_winery_name.db' - confirm against utils.word_occurrences.
get_all_occurrences_db(vocabulary_winery_name, 'word_frequency_winery_name.db')