# **Wine Tasting**

In [2]:
import pandas as pd
import numpy as np
import re
import os

In [3]:
# file import
# reads the CSV into a DataFrame; the path is relative to the working directory

dataset = pd.read_csv('./data/wine-tasting.csv')

## **Data Manipulation**

Extracting regions from titles


In [4]:
# extracting regions from title column

# regex pattern that extracts the required substring
# greedy on purpose: within a line it matches from the first "(" to the last ")",
# so nested or repeated bracket groups are kept together for filter_region
pattern = re.compile(r'\(.+\)')

def get_region(x):
    """
    Return the bracketed part of a title, brackets included.

    Intended to be passed to ``Series.apply`` on the title column.

    Parameters
    ----------
    x : str
        A title such as ``"Foo 2013 Bianco (Etna)"``.

    Returns
    -------
    str
        The matched substring, e.g. ``"(Etna)"``. An empty string is returned
        when nothing matches or when ``x`` is not a string (for example a
        missing value), so the result can always be handed to the next
        string-cleaning step.
    """

    # a missing title is not a string and would make the regex raise TypeError
    if not isinstance(x, str):
        return ''

    return ''.join(pattern.findall(x))

def filter_region(string):
    """
    Reduce the bracketed substring of a title to a bare region name.

    The input is expected to look like ``(...)`` (or to be empty). The leading
    "(" is dropped first; what happens next depends on how many further "("
    characters remain:

    * none, or more than two: only the final character is removed
    * one or two, and the text does not end in "))": the last bracket group
      is returned without its first and last character
    * exactly one, ending in "))": only the final character is removed,
      e.g. ``(foo (bar))`` -> ``foo (bar)``
    * exactly two, ending in "))": everything from the first remaining "("
      onwards is returned without its first and last character
    """

    # drop the opening "(" captured by the regex
    inner = string[1:]
    opens = inner.count('(')

    # "foo...bar)" or too many groups to pick from: strip the closing ")" only
    if opens == 0 or opens > 2:
        return inner[:-1]

    # not a nested ending: keep the contents of the last "(...)" group
    if not inner.endswith('))'):
        return inner[inner.rfind('('):][1:-1]

    # "foo...(bar))": strip just the outer closing ")"
    if opens == 1:
        return inner[:-1]

    # two groups and a nested ending: start at the first remaining "("
    return inner[inner.find('('):][1:-1]

# build the region column in three steps:
#   1. get_region    -> bracketed substring of the title
#   2. filter_region -> substring cleaned down to the region text
#   3. empty strings -> NaN
dataset['region'] = (
    dataset['title']
    .apply(get_region)
    .apply(filter_region)
    .apply(lambda name: name if len(name) != 0 else np.nan)
)

# dataset['region']

### **Dropping columns and rows**

In [5]:
# dropping columns: removes these four columns from the dataset in place

dataset.drop(
    columns=['designation', 'region_1', 'region_2', 'taster_twitter_handle'],
    inplace=True,
)

In [6]:
# a row is removed when any of the listed columns is null;
# the shape is printed before and after so the loss is visible

print(f"Before: {dataset.shape}")

dataset.dropna(
    axis=0,
    how='any',
    subset=[
        'country',
        'points',
        'price',
        'province',
        'taster_name',
        'title',
        'variety',
        'winery',
        'region',
    ],
    inplace=True,
)

print(f"After:  {dataset.shape}")

Before: (129971, 11)
After:  (89022, 11)


### **Filters**

In [7]:
# constraints function

def constrainted(df: pd.DataFrame, column: str, _query: str = None) -> pd.Series:
    """
    Build a boolean row mask based on how often each value of `column` occurs.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to be filtered.
    column : str
        Column whose values are counted.
    _query : str, optional
        Comparison applied to the per-value row count, written as the
        right-hand side of a ``DataFrame.query`` expression, e.g. ``'>= 6'``.
        When None, no constraint is applied and every row is kept.

    Returns
    -------
    pd.Series
        Boolean series aligned with ``df.index``; True where the row's value
        in `column` satisfies the count condition.
    """

    # without a condition the original string concatenation would raise TypeError
    if _query is None:
        return pd.Series(True, index=df.index, name=column)

    # number of rows per distinct value, as a one-column frame named 'count'
    counts = df.groupby(column).size().to_frame('count')

    # values whose row count passes the condition
    allowed = counts.query('count ' + _query).index

    return df[column].isin(allowed)

def constraint_check(df: pd.DataFrame, column: str, lower_bound: int) -> tuple[bool, int]:
    """
    Report the size of the smallest group in `column` and whether it meets a bound.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.
    column : str
        Column whose distinct values define the groups.
    lower_bound : int
        Minimum number of rows every group is expected to have.

    Returns
    -------
    tuple
        ``(smallest >= lower_bound, smallest)`` where ``smallest`` is the row
        count of the least frequent value in `column`.
    """
    group_sizes = df.groupby(column).size()
    smallest = group_sizes.min()

    return (smallest >= lower_bound, smallest)

In [8]:
# defining constraints
# column name -> condition on the number of rows per distinct value;
# the conditions are applied in the order written here

constraints = dict(
    winery='>= 6',
    variety='>= 3',
    region='>= 6',
    taster_name='>= 100',
)

In [9]:
# apply the constraints one after another, each on the rows that survived
# the previous one
# NOTE(review): this is a single pass, so rows removed by a later constraint
# can push an earlier column back below its threshold; confirm whether the
# filtering should be repeated until no more rows are removed

for name, rule in constraints.items():
    dataset = dataset.loc[constrainted(dataset, name, rule)]

# print the size of the smallest group in each column
# (only the count is shown; the comparison against 10 is discarded)

for name in ('winery', 'variety', 'region', 'province', 'taster_name', 'country'):
    _, smallest_group = constraint_check(dataset, name, 10)
    print(name, ':', smallest_group)

# rechecking how many rows we are left with

print(dataset.shape)

winery : 1
variety : 1
region : 3
province : 6
taster_name : 112
country : 6
(68969, 11)


In [10]:
# creating the final dataset: a copy of the filtered data without the
# description column (dataset itself keeps it for the word extraction)

wine_tasting_clean = dataset.drop(columns=['description'])

## **Extracting common words**

Extracting common words that the reviewers are using in their reviews.

In [11]:
def get_words(x):
    """
    Split a review text into lower-cased words with basic punctuation removed.

    The text is lower-cased, every "'s" is removed, and the characters
    , . ! ; : " are deleted before splitting on whitespace.

    Parameters
    ----------
    x : str
        Review text.

    Returns
    -------
    list of str
        The words in order of appearance. Empty for an empty text or for a
        non-string input such as a missing value.
    """

    # a missing description is not a string and has no .lower()
    if not isinstance(x, str):
        return []

    text = x.lower().replace("'s", '')

    for mark in ',.!;:"':
        text = text.replace(mark, '')

    # split() without an argument collapses runs of whitespace, so repeated
    # spaces no longer yield empty-string tokens that get counted as a word
    return text.split()

In [12]:
# get_words turns every description into a list of words, so .apply() gives
# one list per row:
#
#   [text_a, text_b] -> [[word, word, ...], [word, ...]]
#
# a dataframe built from those lists has one row per description and one
# column per word position; shorter rows are left with missing cells
temp = pd.DataFrame(dataset['description'].apply(get_words).tolist())

# flatten the table row by row into a single series, then drop the missing
# cells (they are filled with np.nan first so that dropna() removes them)
temp = (
    pd.Series(temp.to_numpy().ravel())
    .fillna(value=np.nan)
    .dropna()
)

In [13]:
# word frequency table:
#   - put the series into a one-column dataframe
#   - group by the word and take the number of rows in each group
#   - turn the result back into columns ('column' = word, 'count' = frequency)
#   - order by frequency, most common first

words = (
    pd.DataFrame(temp, columns=['column'])
    .groupby('column')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

## **Exporting files**

In [14]:
# make sure the output directory exists; exist_ok=True keeps re-runs working
# while real failures (e.g. missing permissions) are raised here instead of
# being swallowed and resurfacing later as a confusing to_csv error
os.makedirs('./clean-data', exist_ok=True)

wine_tasting_clean.to_csv('./clean-data/wine_tasting_clean.csv', index=False)

words.to_csv('./clean-data/words.csv', index=False)