# **Wine Tasting**

In [1]:
import pandas as pd
import os

In [2]:
# Load the cleaned wine-tasting reviews.
# The CSV is not UTF-8 encoded, so an explicit encoding is passed to read it correctly.
# NOTE(review): 'unicode_escape' also interprets backslash escape sequences found in the
# data -- confirm this is intended rather than a plain single-byte codec such as 'latin-1'.

df = pd.read_csv(
    './wine-tasting-clean/wine_tasting_clean.csv',
    encoding='unicode_escape',
)

## **Data Manipulation**

We will extract the top-rated winery, variety, and title (by mean points) for each of the 15 countries with the most reviews.

In [3]:
# Restrict the data to the 15 countries with the most reviews (most-reviewed first),
# so that sparsely reviewed countries do not act as outliers in the rankings below.

# One row per country with its number of reviews.
review_counts = df.groupby('country').size().reset_index(name='count')

# Order by review count, largest first, and keep the first 15 countries.
top_countries = review_counts.sort_values('count', ascending=False).head(15)
countries_to_retain = top_countries['country'].to_list()

df = df.loc[df['country'].isin(countries_to_retain)]

In [4]:
tmp = df[['country', 'title', 'winery', 'variety', 'points']]


def _best_per_country(reviews, category):
    """Return, for every country, the `category` value with the highest mean points.

    Parameters
    ----------
    reviews : pd.DataFrame
        Frame containing at least the columns 'country', `category` and 'points'.
    category : str
        Column to rank within each country ('title', 'winery' or 'variety').

    Returns
    -------
    pd.DataFrame
        Columns 'country', `category` and '<category>_points', holding the
        rank-1 entry of each country.
    """
    # Mean points for every (country, category value) pair.
    mean_points = (
        reviews[['country', category, 'points']]
        .groupby(['country', category])
        .mean()
    )

    # Rank the means inside each country, best first; method='first' breaks ties
    # by order of appearance, so exactly one entry per country receives rank 1.
    rank_in_country = (
        mean_points
        .groupby('country')
        ['points']
        .rank(method='first', ascending=False)
    )

    points_column = f'{category}_points'
    return (
        mean_points[rank_in_country == 1]
        .reset_index()
        .rename(columns={'points': points_column})
        [['country', category, points_column]]
    )


# One dataframe of top-ranked entries per category, keyed by the category name.
dataframe = {
    category: _best_per_country(tmp, category)
    for category in ('title', 'winery', 'variety')
}


In [5]:
# Join the per-category dataframes into one dataframe (keyed by country) and export it.

# Create the output directory if it does not exist yet. An already-existing directory
# is fine; any other failure (e.g. missing permissions) is raised instead of being
# silently swallowed by a bare `except`.
os.makedirs('./clean-data', exist_ok=True)

(
    dataframe['winery'].set_index('country')
    .join(
        dataframe['variety'].set_index('country')
        )
    .join(
        dataframe['title'].set_index('country')
        )
    .reset_index()
).to_csv('./clean-data/wine_top_results.csv', index=False)