In [1]:
import pandas as pd
import functools
import operator
import collections
import numpy as np
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LinearRegression

In [2]:
# Load the recipe spreadsheet: the literal 'n / a' is read as a missing value and
# header=1 takes the column names from the second row of the sheet.
# NOTE(review): absolute local path — this cell only runs where that file exists.
df = pd.read_excel('E:/eska/201013_Bierrezeptliste_English.xlsx',  na_values=['n / a'], header=1)
df.head()

Unnamed: 0,Surname,Brief description,Beer type,Deflection volume [L],Alc [% vol],Bitter value [IBU],Color [EBC],Aroma,Carbonation [g / l],Amount [g],...,Rest time [min].5,Resting temp. [° C].5,Start temp. [° C].5,method.5,Volume [L],Total cooking time [min],Post-polymerization time [min],Evaporation rate [% / h],Loss of spent grains [L],Unnamed: 155
0,815,,American pale ale,20.0,6.8,39.0,21,strong,5.0,5000,...,,,,,22.5,60.0,5.0,10.0,0.5,
1,SomWei,Traditional light wheat beer - fruity summer w...,Light wheat beer,30.0,4.7,11.0,9,strong,6.0,3500,...,,,,,41.0,90.0,10.0,15.0,1.5,
2,Abbey beer,Dark strong beer with a high alcohol content a...,Bière de Garde,28.0,6.5,19.0,37,very low,5.0,3500,...,,,,,32.7,80.0,10.0,10.0,1.0,https://brauerei.mueggelland.de/rezeptkalkulat...
3,Old Bavarian dark wheat beer,Dark wheat beer with malt and roasted aromas,Dark wheat beer,19.0,5.3,13.0,44,very low,6.0,2400,...,,,,,24.7,90.0,15.0,10.0,2.0,https://brauerei.mueggelland.de/rezeptkalkulat...
4,Asian tiger,"Asian Rice Lager, summer beer with an Asian to...",Bohemian Pilsener,27.0,6.1,33.0,9,moderate,5.0,3000,...,,,,,32.4,90.0,5.0,10.0,0.5,


In [3]:
## Input data has Surname and Brief Description which are text data which need to be converted to vectors
## Categorical columns = Beer Type and Aroma
## Continuous columns = Remaining input columns

## Handling Text data
'''
Text input needs to be processed to identify tokens/words and some basic preprocessing needs to be done
to get rid of stopwords (the/is/an and other common words that dont add much value), remove numbers and junk text.
Then we need to embed these words/tokens into word vectors.
'''

'\nText input needs to be processed to identify tokens/words and some basic preprocessing needs to be done\nto get rid of stopwords (the/is/an and other common words that dont add much value), remove numbers and junk text.\nThen we need to embed these words/tokens into word vectors.\n'

In [4]:
def sentence_preprocessor(sentence):
    '''
    Tokenize a sentence and reduce it to lower-case alphabetic content words.

    Args:
        sentence (str): Input string to be tokenized after preprocessing.

    Returns:
        list[str]: Tokens in their original order, lower-cased, with hyphenated
        words split into their parts, punctuation stripped, and non-alphabetic
        tokens and English stop words removed. Empty when nothing survives
        (including an empty or falsy input).
    '''

    if not sentence:
        return []

    # split into words and convert to lower case
    tokens = (w.lower() for w in word_tokenize(sentence))

    # convert (full-bodied => full bodied)
    # Flattened with a comprehension rather than reduce(operator.add, ...):
    # reduce without an initial value raises TypeError when there are no tokens
    # at all, and repeated list concatenation is quadratic on long texts.
    tokens = [part for w in tokens for part in w.split('-')]

    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = (w.translate(table) for w in tokens)

    # keep alphabetic tokens that are not stop words
    stop_words = set(stopwords.words('english'))
    return [w for w in stripped if w.isalpha() and w not in stop_words]

In [5]:
# Concatenate every non-missing description into one text blob.
# str.join is linear in the total length and yields '' for an empty column,
# whereas reduce-based concatenation is quadratic and raises on empty input.
full_txt = ' '.join(df['Brief description'].dropna())
full_txt

'Traditional light wheat beer - fruity summer wheat Dark strong beer with a high alcohol content and a finely balanced bitterness and round body Dark wheat beer with malt and roasted aromas Asian Rice Lager, summer beer with an Asian touch and American hops The beer is similar to the triple but is lighter, less malty and slightly bitter in the aftertaste Classic blonde ale "Home brewing for everyone", VHS Freiburg i. Br Extra bitter red lager Cook 3 vanilla sticks for 15 minutes; Use 3 packets (8g each) of vanilla sugar instead of grape sugar and simmer for 15 minutes The summer beer with corn; A wonderfully light summer beer with corn, the ladies love it. Slightly sweet and very tasty thanks to the corn content. Also ideal for cyclists Traditional Irish red ale. Strong red color with a light malt aroma and medium-strong hull melons Guinness like stout Dunkler Bock, fermented with the K-97 A Doppelbock with a fine malty residual sweetness and high alcohol content, tasty and to be enjoy

In [6]:
words = sentence_preprocessor(full_txt)
words[:100]

['traditional',
 'light',
 'wheat',
 'beer',
 'fruity',
 'summer',
 'wheat',
 'dark',
 'strong',
 'beer',
 'high',
 'alcohol',
 'content',
 'finely',
 'balanced',
 'bitterness',
 'round',
 'body',
 'dark',
 'wheat',
 'beer',
 'malt',
 'roasted',
 'aromas',
 'asian',
 'rice',
 'lager',
 'summer',
 'beer',
 'asian',
 'touch',
 'american',
 'hops',
 'beer',
 'similar',
 'triple',
 'lighter',
 'less',
 'malty',
 'slightly',
 'bitter',
 'aftertaste',
 'classic',
 'blonde',
 'ale',
 'home',
 'brewing',
 'everyone',
 'vhs',
 'freiburg',
 'br',
 'extra',
 'bitter',
 'red',
 'lager',
 'cook',
 'vanilla',
 'sticks',
 'minutes',
 'use',
 'packets',
 'vanilla',
 'sugar',
 'instead',
 'grape',
 'sugar',
 'simmer',
 'minutes',
 'summer',
 'beer',
 'corn',
 'wonderfully',
 'light',
 'summer',
 'beer',
 'corn',
 'ladies',
 'love',
 'slightly',
 'sweet',
 'tasty',
 'thanks',
 'corn',
 'content',
 'also',
 'ideal',
 'cyclists',
 'traditional',
 'irish',
 'red',
 'ale',
 'strong',
 'red',
 'color',
 'lig

In [7]:
'''
GLOVE WORD EMBEDDINGS
We are using 100 dimensional Glove word vector embeddings readily available on our data.
Each token we derive from our data will be converted to their equivalent word vector representation
which preserves information about the meaning of these text input.
'''

'\nGLOVE WORD EMBEDDINGS\nWe are using 100 dimensional Glove word vector embeddings readily available on our data.\nEach token we derive from our data will be converted to their equivalent word vector representation\nwhich preserves information about the meaning of these text input.\n'

In [8]:
# NOTE(review): absolute local path — point this at your own copy of the GloVe file.
glove_embeddings_path = 'C:/Users/abish/Google Drive/glove.6B.100d.txt'
WORD_VEC_DIM = 100  # length of the zero vector registered for 'PAD' below
# Maps each token in the embeddings file to its coefficient vector.
embeddings_index = {}
with open(glove_embeddings_path, encoding="utf8") as f:
    for line in f:
        # Split once: the first field is the token, the remainder is its coefficients.
        word, coefs = line.split(maxsplit=1)
        # Parse the space-separated coefficients into a float array (dtype code "f").
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

# 'PAD' is the filler token inserted by leftpad; it embeds to an all-zero vector.
embeddings_index['PAD'] = np.zeros(WORD_VEC_DIM)

In [9]:
# Words in description that are not available for encoding
## Words that are not in the GloVe embeddings are given a random vector of length 100 as their word vector.
## It is safe because the number of such uncommon words is very few
words_set = set(words)
words_set - embeddings_index.keys() 

{'berghammer',
 'brausol',
 'dunkler',
 'fuggles',
 'grainfather',
 'grönwohlder',
 'hanghofer',
 'lakvattnet',
 'mjölksyra',
 'märzen',
 'tettnang',
 'trub',
 'zoigl'}

In [10]:
s = df['Surname'].tolist()
s

[815,
 ' SomWei',
 'Abbey beer',
 'Old Bavarian dark wheat beer',
 'Asian tiger',
 'Australian sparceling ale',
 'Bavarian wheat double buck',
 'Belgian Strong Golden Ale',
 'Blonde flavor',
 'Bock top-fermented',
 'California Common',
 'Citra Red',
 'Coffee Vanilla Stout',
 'Cornberry',
 'Dag Müller Irish Red Ale (Real IRA)',
 "Dag's Irish stout",
 'Ladies pale ale',
 'Darg Bogg',
 'Dark Wheat',
 'The stubborn ',
 'The Wild XIII',
 'Pressure white',
 'Dark ',
 'Dark raspberry wheat',
 'Oak buck',
 'One-armed violinist',
 'A kind of Zoigl or cellar beer',
 'El bosque de pinas',
 'English Pumkin Ale',
 "Falconer's Flight to Spring",
 'Fine drink lavender ale',
 'Wanderlust to Bamberg',
 'Fresh cellar beer',
 'Fruity Hazy IPA',
 'Gewekin copper',
 'Ginger Goose',
 'glüXpils',
 'Golding Pils',
 'Gönndalf',
 'Götzner brewery monastery beer',
 'Oat smoke',
 'Halloween Beer',
 'Hemp brew',
 'Heather quad',
 'Light amarillo',
 'Light spring wheat',
 'Light with a kick',
 'Light rye beer No. 2

In [11]:
# Stringify every entry before joining: the column mixes text with bare numbers
# (e.g. 815). Unlike the reduce form, this also returns a str for a one-element
# list and '' for an empty one.
ft = ' '.join(map(str, s))
w = sentence_preprocessor(ft)
w[:100]

['somwei',
 'abbey',
 'beer',
 'old',
 'bavarian',
 'dark',
 'wheat',
 'beer',
 'asian',
 'tiger',
 'australian',
 'sparceling',
 'ale',
 'bavarian',
 'wheat',
 'double',
 'buck',
 'belgian',
 'strong',
 'golden',
 'ale',
 'blonde',
 'flavor',
 'bock',
 'top',
 'fermented',
 'california',
 'common',
 'citra',
 'red',
 'coffee',
 'vanilla',
 'stout',
 'cornberry',
 'dag',
 'müller',
 'irish',
 'red',
 'ale',
 'real',
 'ira',
 'dag',
 'irish',
 'stout',
 'ladies',
 'pale',
 'ale',
 'darg',
 'bogg',
 'dark',
 'wheat',
 'stubborn',
 'wild',
 'xiii',
 'pressure',
 'white',
 'dark',
 'dark',
 'raspberry',
 'wheat',
 'oak',
 'buck',
 'one',
 'armed',
 'violinist',
 'kind',
 'zoigl',
 'cellar',
 'beer',
 'el',
 'bosque',
 'de',
 'pinas',
 'english',
 'pumkin',
 'ale',
 'falconer',
 'flight',
 'spring',
 'fine',
 'drink',
 'lavender',
 'ale',
 'wanderlust',
 'bamberg',
 'fresh',
 'cellar',
 'beer',
 'fruity',
 'hazy',
 'ipa',
 'gewekin',
 'copper',
 'ginger',
 'goose',
 'glüxpils',
 'golding',


In [12]:
# Words in Surname that are not available for encoding
words_set = set(w)
words_set - embeddings_index.keys() 

{'biab',
 'bogg',
 'cornberry',
 'darg',
 'gewekin',
 'glüxpils',
 'gönndalf',
 'götzner',
 'hjuvik',
 'holzgünzer',
 'pumkin',
 'sendlinger',
 'somwei',
 'sparceling',
 'topinale',
 'wheatra',
 'wieß',
 'zoigl'}

In [13]:
len(list(words_set))

174

In [14]:
## Maximum length sentence in column BRIEF DESCRIPTION
BRIEF_DESCRIPTION_VECTOR_SIZE = max(map(lambda sentence: len(sentence_preprocessor(sentence)),
                                        df['Brief description'].dropna()))
BRIEF_DESCRIPTION_VECTOR_SIZE

53

In [15]:
## Maximum length sentence in column SURNAME
SURNAME_VECTOR_SIZE = max(map(lambda x: len(sentence_preprocessor(str(x))), df['Surname'].dropna()))
SURNAME_VECTOR_SIZE

7

In [16]:
## Since the sentences in Brief Description and Surname are of varying lengths it needs to be adjusted to uniform length
## We resort to left padding with zeros to get equal-length vectors. The maximum number of tokens in any row is 53 and so
## we pad 0s to account for 53 tokens in all of the sentences in Brief Description

In [17]:
def leftpad(token_list, vec_size):
    '''
    Force a token list to exactly ``vec_size`` entries.

    Shorter lists are padded on the left with the 'PAD' token. Longer lists are
    cut down to their first ``vec_size`` tokens, so that every row embeds to
    the same width even when a new input is longer than anything seen so far.

    Args:
        token_list (list[str]): Tokens to pad or truncate.
        vec_size (int): Required number of tokens in the result.

    Returns:
        list[str]: A new list of length ``vec_size`` (0 if ``vec_size`` is negative).
    '''
    vec_size = max(vec_size, 0)
    kept = list(token_list[:vec_size])
    return ['PAD'] * (vec_size - len(kept)) + kept

In [18]:
# Random stand-in vectors for tokens missing from embeddings_index, keyed by
# token so that a given unknown word maps to the same vector every time it is
# embedded (across rows, and between training and prediction inputs).
_oov_embeddings = {}


def embed_tokens(token_list):
    '''
    Look up the word vector of every token and concatenate them into one vector.

    Tokens absent from ``embeddings_index`` receive a random vector of length
    ``WORD_VEC_DIM``; that vector is drawn once per distinct token and reused.

    Args:
        token_list (list[str]): Tokens to embed, in order.

    Returns:
        np.ndarray: 1-D array holding the tokens' vectors back to back.
    '''
    vectors = []
    for token in token_list:
        vector = embeddings_index.get(token)
        if vector is None:
            # Draw lazily: only unknown tokens consume random numbers.
            if token not in _oov_embeddings:
                _oov_embeddings[token] = np.random.rand(WORD_VEC_DIM)
            vector = _oov_embeddings[token]
        vectors.append(vector)
    return np.array(vectors).flatten()

In [19]:
ip = 'Traditional light wheat beer - fruity summer wheat'
token_list = sentence_preprocessor(ip)
token_list = leftpad(token_list, BRIEF_DESCRIPTION_VECTOR_SIZE)
embeddings = embed_tokens(token_list)

token_list

['PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'traditional',
 'light',
 'wheat',
 'beer',
 'fruity',
 'summer',
 'wheat']

In [20]:
embeddings

array([0.        , 0.        , 0.        , ..., 0.32326001, 0.20395   ,
       0.2386    ])

In [21]:
## Now we extract the continuous, categorical and word vector parts of the dataframe separately
## Continuous dataframe needs to be normalized to get same mean and standard deviation, which makes it easy for model to learn
## Categorical dataframe needs to be one-hot encoded

In [22]:
# Positional columns 3-8 hold the numeric recipe inputs plus 'Aroma';
# 'Aroma' is categorical, so it is dropped from the continuous block.
continuous_df = df.iloc[:, 3:9].drop(labels='Aroma', axis=1)
# Record the raw per-column mean and standard deviation before normalising;
# they are reused later to scale new inputs.
continuous_df_statistics = (continuous_df.mean(axis=0), continuous_df.std(axis=0))
# Standardise each column: subtract its mean and divide by its standard deviation.
continuous_df = continuous_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)

continuous_df.head()

Unnamed: 0,Deflection volume [L],Alc [% vol],Bitter value [IBU],Color [EBC],Carbonation [g / l]
0,-0.169744,1.005912,0.820885,-0.263192,0.031284
1,-0.136802,-0.830181,-1.260957,-0.65798,1.246528
2,-0.14339,0.743613,-0.666145,0.263192,0.031284
3,-0.173038,-0.305583,-1.112254,0.493485,1.246528
4,-0.146684,0.393881,0.374776,-0.65798,0.031284


In [23]:
categorical_df = pd.get_dummies(df[['Aroma', 'Beer type']])
categorical_df.head()

Unnamed: 0,Aroma_low,Aroma_moderate,Aroma_strong,Aroma_very low,Aroma_very strong,Beer type_American IPA,Beer type_American amber ale,Beer type_American brown ale,Beer type_American pale ale,Beer type_American stout,...,"Beer type_Spice, herb or vegetable beer",Beer type_Strong Scotch Ale,Beer type_Sweet stout,Beer type_Traditional buck,Beer type_Vienna warehouse,Beer type_Wheat buck,Beer type_Wheat dark,Beer type_Witbier,Beer type_pale Ale,Beer type_season
0,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
surname_word_vec_df = df['Surname'].replace(np.nan, '', regex=True) \
                                   .apply(str) \
                                   .apply(sentence_preprocessor) \
                                   .apply(lambda x: leftpad(x, SURNAME_VECTOR_SIZE)) \
                                   .apply(embed_tokens) 
surname_word_vec_df.head(2).tolist()

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 

In [25]:
description_word_vec_df = df['Brief description'].replace(np.nan, '', regex=True) \
                                                 .apply(sentence_preprocessor) \
                                                 .apply(lambda x: leftpad(x, BRIEF_DESCRIPTION_VECTOR_SIZE)) \
                                                 .apply(embed_tokens) 
description_word_vec_df.head(2).tolist()

[array([0., 0., 0., ..., 0., 0., 0.]),
 array([0.        , 0.        , 0.        , ..., 0.32326001, 0.20395   ,
        0.2386    ])]

In [26]:
arr1 = np.stack(description_word_vec_df)
arr2 = np.stack(surname_word_vec_df)
arr3 = np.array(categorical_df)
arr4 = np.array(continuous_df)

arr1.shape, arr2.shape, arr3.shape, arr4.shape

((101, 5300), (101, 700), (101, 58), (101, 5))

In [27]:
input_matrix = np.hstack((arr1, arr2, arr3, arr4))
input_matrix.shape

(101, 6063)

In [28]:
input_statistics_df = pd.concat([*continuous_df_statistics], axis=1)
input_statistics_df.columns = ['Mean', 'std']
input_statistics_df.head()

Unnamed: 0,Mean,std
Deflection volume [L],71.527723,303.561557
Alc [% vol],5.649505,1.143733
Bitter value [IBU],27.959406,13.449626
Color [EBC],29.0,30.396052
Carbonation [g / l],4.974257,0.82288


In [29]:
## Output data contains the following columns
## Let us identify how many categorical and continuous columns we have
## We drop some columns that are just empty throughout

In [30]:
target_df = df.iloc[:, 9:-1] \
              .drop(labels=['Amount [g].23', 'variety.23', 'Degree of fermentation [%].2', 'Fermentation temp. [° C].2'], axis=1)
target_df

Unnamed: 0,Amount [g],Proportion of [%],variety,Amount [g].1,Proportion of [%].1,variety.1,Amount [g].2,Proportion of [%].2,variety.2,Amount [g].3,...,Bulk [g].5,Rest time [min].5,Resting temp. [° C].5,Start temp. [° C].5,method.5,Volume [L],Total cooking time [min],Post-polymerization time [min],Evaporation rate [% / h],Loss of spent grains [L]
0,5000,91,Mairs Otter Malt,50.0,1.0,Amber Malt,220.0,4.0,Caramel malt amber,230.0,...,,,,,,22.5,60.0,5.0,10.0,0.5
1,3500,52,Wheat malt light,2900.0,43.0,Viennese malt,200.0,3.0,oatmeal,100.0,...,,,,,,41.0,90.0,10.0,15.0,1.5
2,3500,50,Pilsner malt,2500.0,36.0,Monastery malt,500.0,7.0,Caramel malt light,500.0,...,,,,,,32.7,80.0,10.0,10.0,1.0
3,2400,44,Wheat malt dark,2000.0,37.0,Munich malt type I.,650.0,12.0,Caramel malt dark type I.,400.0,...,,,,,,24.7,90.0,15.0,10.0,2.0
4,3000,50,rice,2000.0,33.0,Pilsner malt,1000.0,17.0,Munich malt type I.,,...,,,,,,32.4,90.0,5.0,10.0,0.5
5,4500,90,Pale ale malt,400.0,8.0,Wheat malt light,100.0,2.0,Brown malt,,...,,,,,,22.8,60.0,5.0,10.0,0.5
6,3600,61,Wheat malt light,850.0,14.0,Munich malt type II,800.0,14.0,Pilsner malt,600.0,...,,,,,,26.5,90.0,5.0,12.5,1.5
7,6000,89,Pilsner Malt Bohemian,450.0,7.0,Pilsner malt extra light,300.0,4.0,Aromatic malt,,...,,,,,,26.7,70.0,10.0,10.0,1.0
8,3500,78,Pilsner malt,500.0,11.0,Wheat malt light,500.0,11.0,Caramel malt light,,...,,,,,,30.4,90.0,10.0,14.0,2.0
9,5200,100,Viennese malt,,,,,,,,...,,,,,,22.5,60.0,5.0,10.0,0.5


In [31]:
target_df.select_dtypes(include=['object']).columns

Index(['variety', 'variety.1', 'variety.2', 'variety.3', 'variety.4',
       'variety.5', 'variety.6', 'variety.7', 'variety.8', 'use', 'variety.9',
       'use.1', 'variety.10', 'use.2', 'variety.11', 'use.3', 'variety.12',
       'shape', 'use.4', 'variety.13', 'shape.1', 'use.5', 'variety.14',
       'shape.2', 'use.6', 'Cooking time [min].2', 'variety.15', 'shape.3',
       'use.7', 'Cooking time [min].3', 'variety.16', 'shape.4', 'use.8',
       'Cooking time [min].4', 'variety.17', 'shape.5', 'use.9',
       'Cooking time [min].5', 'variety.18', 'shape.6', 'use.10',
       'Cooking time [min].6', 'variety.19', 'shape.7', 'use.11',
       'Cooking time [min].7', 'variety.20', 'shape.8', 'use.12',
       'Cooking time [min].8', 'variety.21', 'variety.22', 'Water-malt [: 1]',
       'method', 'method.1', 'method.2', 'method.3', 'method.4', 'method.5'],
      dtype='object')

In [32]:
## These columns are of object datatype in pandas which means they are categorical. 
## Continuous columns should have float values
## However, some columns like Cooking time and Water malt seem to hold predominantly float values but
## are wrongly seen to be in categorical data because they have a few spurious entries
## Cooking time has mostly numbers but some fields like '7 days' which needs to be converted to equivalent number
## Similarly Water Malt column has mostly floats but there is one spurious entry saying '4th'
## We clean such spurious entries to get them as continuous columns

In [33]:
def cooking_time_rectifier(v):
    '''
    Coerce a cooking-time cell to a number of minutes.

    Args:
        v: A number, a numeric string, or a duration in days such as '7 days'.

    Returns:
        float: ``float(v)`` when that works; otherwise, for text ending in
        'day' or 'days' (case and surrounding whitespace ignored), the leading
        number multiplied by 60 * 24. NaN when the value cannot be interpreted.
    '''
    try:
        return float(v)
    except (TypeError, ValueError, OverflowError):
        # Not directly numeric; fall through to the "<n> days" form.
        pass

    text = str(v).strip().lower()
    for suffix in ('days', 'day'):
        if text.endswith(suffix):
            try:
                return float(text[:-len(suffix)].strip()) * 60 * 24
            except ValueError:
                # e.g. 'a few days': no usable number in front of the unit.
                break
    return float('nan')

# Clean the 'Cooking time [min]' columns suffixed .2 through .8 — the ones that
# showed up with object dtype — so they become plain float columns.
for i in range(2, 9):
    colname = 'Cooking time [min].' + str(i)
    # Entries the rectifier cannot interpret end up as NaN after the float cast.
    target_df[colname] = target_df[colname].apply(cooking_time_rectifier).astype(float)

In [34]:
def float_converter(v):
    '''
    Parse a value as a float, tolerating trailing junk characters.

    When ``float(v)`` fails, characters are dropped from the end of ``str(v)``
    one at a time until the remainder parses (e.g. '4th' -> 4.0).

    Args:
        v: A number or string to convert.

    Returns:
        float: The parsed value; 0.0 for a falsy input or when no prefix of
        the text parses as a number.
    '''
    if not v:
        return 0.0
    try:
        return float(v)
    except (TypeError, ValueError, OverflowError):
        # Narrow except: KeyboardInterrupt/SystemExit must not be swallowed.
        pass

    # Iterative prefix search instead of one recursive call per stripped character.
    text = str(v)[:-1]
    while text:
        try:
            return float(text)
        except ValueError:
            text = text[:-1]
    return 0.0

target_df['Water-malt [: 1]'] = target_df['Water-malt [: 1]'].apply(float_converter).astype(float)

In [35]:
target_df.select_dtypes(include=['object']).columns

Index(['variety', 'variety.1', 'variety.2', 'variety.3', 'variety.4',
       'variety.5', 'variety.6', 'variety.7', 'variety.8', 'use', 'variety.9',
       'use.1', 'variety.10', 'use.2', 'variety.11', 'use.3', 'variety.12',
       'shape', 'use.4', 'variety.13', 'shape.1', 'use.5', 'variety.14',
       'shape.2', 'use.6', 'variety.15', 'shape.3', 'use.7', 'variety.16',
       'shape.4', 'use.8', 'variety.17', 'shape.5', 'use.9', 'variety.18',
       'shape.6', 'use.10', 'variety.19', 'shape.7', 'use.11', 'variety.20',
       'shape.8', 'use.12', 'variety.21', 'variety.22', 'method', 'method.1',
       'method.2', 'method.3', 'method.4', 'method.5'],
      dtype='object')

In [36]:
## Now we have all categorical columns separate and we once again one-hot encode these columns
## All of the continuous columns will be normalized to 0 mean, 1 std dev just like we did with input data
## It is also safe to convert NaNs to 0 in these continuous columns

In [37]:
categorical_output_df = pd.get_dummies(target_df.select_dtypes(include=['object']))
categorical_output_df.head()

Unnamed: 0,variety_BEST Red X,variety_Bohemian floor malt,variety_Corn flakes,variety_Jerusalem artichoke,variety_Mairs Otter Malt,variety_Maris Otter Malts,variety_Munich malt type I.,variety_Munich malt type II,variety_Oak smoke wheat malt,variety_Pale ale malt,...,method.2_Partial mash cooking,method.2_infusion,method.3_Decoction,method.3_Heat,method.3_Infusion partial mash,method.3_infusion,method.4_Decoction,method.4_Heat,method.4_infusion,method.5_Heat
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [38]:
# Numeric target columns only.
continuous_output_df = target_df.select_dtypes(exclude=['object'])
# Fill missing values with 0. fillna is the direct tool for this; replace() with
# regex=True expects a string pattern, not NaN.
continuous_output_df = continuous_output_df.fillna(0)
# Record the raw mean/std per column so predictions can be mapped back to original units.
continuous_output_df_statistics = (continuous_output_df.mean(axis=0), continuous_output_df.std(axis=0))

# Standardise each column: subtract its mean and divide by its standard deviation.
continuous_output_df = continuous_output_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
continuous_output_df.head()

Unnamed: 0,Amount [g],Proportion of [%],Amount [g].1,Proportion of [%].1,Amount [g].2,Proportion of [%].2,Amount [g].3,Proportion of [%].3,Amount [g].4,Proportion of [%].4,...,Temp. [° C].4,Bulk [g].5,Rest time [min].5,Resting temp. [° C].5,Start temp. [° C].5,Volume [L],Total cooking time [min],Post-polymerization time [min],Evaporation rate [% / h],Loss of spent grains [L]
0,-0.126861,1.28291,-0.354834,-1.287215,-0.356567,-0.500904,0.077875,0.186721,-0.302633,-0.359912,...,-0.141421,-0.099504,-0.099504,-0.099504,-0.099504,-0.343819,-0.876496,-0.585989,-0.169373,-0.784181
1,-0.254762,-0.438269,0.11556,1.19477,-0.368273,-0.598382,-0.365747,-0.478884,-0.302633,-0.359912,...,-0.141421,-0.099504,-0.099504,-0.099504,-0.099504,-0.120281,1.090751,0.254706,0.257227,0.842149
2,-0.254762,-0.526535,0.04954,0.781106,-0.192688,-0.208469,0.999244,0.852326,-0.302633,-0.359912,...,-0.141421,-0.099504,-0.099504,-0.099504,-0.099504,-0.220571,0.435002,0.254706,-0.169373,0.028984
3,-0.348556,-0.791332,-0.032986,0.8402,-0.104895,0.278923,0.657996,0.852326,-0.302633,-0.359912,...,-0.141421,-0.099504,-0.099504,-0.099504,-0.099504,-0.317236,1.090751,1.095401,-0.169373,1.655314
4,-0.297395,-0.526535,-0.032986,0.603821,0.099954,0.766315,-0.706994,-0.700752,-0.302633,-0.359912,...,-0.141421,-0.099504,-0.099504,-0.099504,-0.099504,-0.224196,1.090751,-0.585989,-0.169373,-0.784181


In [39]:
target_statistics_df = pd.concat([*continuous_output_df_statistics], axis=1)
target_statistics_df.columns = ['Mean', 'std']
target_statistics_df.head()

Unnamed: 0,Mean,std
Amount [g],6487.80198,11727.833247
Proportion of [%],61.930693,22.658887
Amount [g].1,2199.851485,6058.752778
Proportion of [%].1,22.782178,16.921941
Amount [g].2,829.220792,1708.572974


In [40]:
output_arr1 = np.array(categorical_output_df)
output_arr2 = np.array(continuous_output_df)
output_arr1.shape, output_arr2.shape

((101, 483), (101, 91))

In [41]:
output_matrix = np.hstack((output_arr1, output_arr2))
output_matrix.shape

(101, 574)

In [42]:
## Now we have an input matrix which has data of 6063 dimensions
## And their target is another vector of 574 dimensions
## So our problem now is one of Regression to identify the function f that predicts f(X) to be as close as possible to Y
## Any regression method can be applied to this data

In [43]:
input_matrix.shape, output_matrix.shape

((101, 6063), (101, 574))

In [44]:
reg = LinearRegression().fit(input_matrix, output_matrix)
reg.score(input_matrix, output_matrix)

1.0

In [45]:
## The below mentioned parameters describe the linear model that we have learned as f()
## reg.coef_, reg.intercept_

In [46]:
## Test prediction on Training data samples
## We take the first two input rows and try to predict the model's output for the same

In [47]:
pred = reg.predict(input_matrix[:2])
pred.shape

(2, 574)

In [48]:
## These predictions need to be rescaled since the model has worked on normalized data
## we use the statistics to get back the original scale of data
## and also do some filtering operations to convert very very small values to zero
pred

array([[-1.71737624e-16,  2.73218947e-17, -1.90928003e-16, ...,
        -5.85989426e-01, -1.69372717e-01, -7.84180774e-01],
       [ 3.07046055e-16, -1.82145965e-17, -3.72965547e-16, ...,
         2.54705631e-01,  2.57226894e-01,  8.42148962e-01]])

In [49]:
continuous_output_df.columns

Index(['Amount [g]', 'Proportion of [%]', 'Amount [g].1',
       'Proportion of [%].1', 'Amount [g].2', 'Proportion of [%].2',
       'Amount [g].3', 'Proportion of [%].3', 'Amount [g].4',
       'Proportion of [%].4', 'Amount [g].5', 'Proportion of [%].5',
       'Amount [g].6', 'Proportion of [%].6', 'Amount [g].7',
       'Proportion of [%].7', 'Amount [g].8', 'Time [min]', 'Amount [g].9',
       'Time [min].1', 'Amount [g].10', 'Time [min].2', 'Amount [g].11',
       'Time [min].3', 'Amount [g].12', 'Cooking time [min]', 'Amount [g].13',
       'Cooking time [min].1', 'Amount [g].14', 'Cooking time [min].2',
       'Amount [g].15', 'Cooking time [min].3', 'Amount [g].16',
       'Cooking time [min].4', 'Amount [g].17', 'Cooking time [min].5',
       'Amount [g].18', 'Cooking time [min].6', 'Amount [g].19',
       'Cooking time [min].7', 'Amount [g].20', 'Cooking time [min].8',
       'Amount [g].21', 'Degree of fermentation [%]',
       'Fermentation temp. [° C]', 'Amount [g].22',


In [50]:
def rescale_predictions(df):
    '''
    Map model predictions back to the original units of the target columns.

    Each column named in ``continuous_output_df.columns`` is un-standardised
    with its 'Mean' / 'std' row from ``target_statistics_df``. Afterwards every
    value whose magnitude is below 2**-10 is set to 0, in all columns.

    Args:
        df (pd.DataFrame): Predictions with one column per target. It is not
            modified; the work is done on a copy.

    Returns:
        pd.DataFrame: The rescaled, zero-snapped predictions.
    '''
    # Work on a copy so calling this twice on the same frame cannot rescale twice.
    rescaled = df.copy()
    for col in continuous_output_df.columns:
        mu, sd = target_statistics_df.loc[col]
        rescaled[col] = mu + (rescaled[col] * sd)

    # mask() only touches cells where the condition is True, so NaN values
    # (for which the comparison is False) pass through unchanged.
    return rescaled.mask(rescaled.abs() < 2 ** (-10), 0)

In [51]:
colnames = list(categorical_output_df.columns) + list(continuous_output_df.columns)

In [52]:
pred_df = pd.DataFrame(pred, columns=colnames)
pred_df = rescale_predictions(pred_df)
pred_df

Unnamed: 0,variety_BEST Red X,variety_Bohemian floor malt,variety_Corn flakes,variety_Jerusalem artichoke,variety_Mairs Otter Malt,variety_Maris Otter Malts,variety_Munich malt type I.,variety_Munich malt type II,variety_Oak smoke wheat malt,variety_Pale ale malt,...,Temp. [° C].4,Bulk [g].5,Rest time [min].5,Resting temp. [° C].5,Start temp. [° C].5,Volume [L],Total cooking time [min],Post-polymerization time [min],Evaporation rate [% / h],Loss of spent grains [L]
0,0,0,0,0,1.0,0,0,0,0,0,...,0,0,0,0,0,22.5,60.0,5.0,10.0,0.5
1,0,0,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,41.0,90.0,10.0,15.0,1.5


In [53]:
pred_df.loc[0]

variety_BEST Red X                         0.0
variety_Bohemian floor malt                0.0
variety_Corn flakes                        0.0
variety_Jerusalem artichoke                0.0
variety_Mairs Otter Malt                   1.0
variety_Maris Otter Malts                  0.0
variety_Munich malt type I.                0.0
variety_Munich malt type II                0.0
variety_Oak smoke wheat malt               0.0
variety_Pale ale malt                      0.0
variety_Pilsen malt                        0.0
variety_Pilsner Malt Bohemian              0.0
variety_Pilsner malt                       0.0
variety_Potatoes                           0.0
variety_Smoked malt                        0.0
variety_Stout malt                         0.0
variety_Viennese malt                      0.0
variety_Wheat malt dark                    0.0
variety_Wheat malt light                   0.0
variety_barley                             0.0
variety_rice                               0.0
variety.1_Amb

In [54]:
## Here we see that the model almost accurately predicts the values for the first 2 rows of training data.
## There's a very good chance for overfitting in Linear Regression
## Things todo
'''
1. Run Cross-validation after splitting into train and test set and measure performance on test set
2. Add L1, L2 penalties for Linear regression to control overfitting
3. Try out nonlinear methods like SVM Kernel Regression to see if a more complicated function would be able
   to better explain our data
'''

'\n1. Run Cross-validation after splitting into train and test set and measure performance on test set\n2. Add L1, L2 penalties for Linear regression to control overfitting\n3. Try out nonlinear methods like SVM Kernel Regression to see if a more complicated function would be able\n   to better explain our data\n'

In [92]:
def scale_input_data(df):
    '''
    Encode raw recipe inputs into the feature matrix layout used for training.

    The blocks are stacked in the same order as the training matrix:
    description word vectors, surname word vectors, one-hot categoricals, then
    standardised continuous columns.

    Args:
        df (pd.DataFrame): Rows to encode. Must contain 'Surname',
            'Brief description', 'Aroma', 'Beer type' and every column named
            in the index of ``input_statistics_df``.

    Returns:
        np.ndarray: 2-D array with one row per row of ``df``.
    '''

    surname_df = df['Surname'].fillna('') \
                              .apply(str) \
                              .apply(sentence_preprocessor) \
                              .apply(lambda x: leftpad(x, SURNAME_VECTOR_SIZE)) \
                              .apply(embed_tokens)

    desc_df = df['Brief description'].fillna('') \
                                     .apply(sentence_preprocessor) \
                                     .apply(lambda x: leftpad(x, BRIEF_DESCRIPTION_VECTOR_SIZE)) \
                                     .apply(embed_tokens)

    # Select the continuous columns by name, in the order of the stored
    # training statistics, and standardise them with those statistics.
    cont_cols = list(input_statistics_df.index)
    cont_df = (df[cont_cols] - input_statistics_df['Mean']) / input_statistics_df['std']

    # One-hot encode the argument itself (previously this read the unrelated
    # global `ip`), then force exactly the training columns in training order:
    # categories absent from this batch become 0, and categories never seen in
    # training are dropped instead of widening the matrix.
    cat_df = pd.get_dummies(df[['Aroma', 'Beer type']]) \
               .reindex(columns=categorical_df.columns, fill_value=0) \
               .astype(float)

    arr1 = np.stack(desc_df)
    arr2 = np.stack(surname_df)
    arr3 = np.array(cat_df)
    arr4 = np.array(cont_df)

    return np.hstack((arr1, arr2, arr3, arr4))

In [99]:
test_input = collections.OrderedDict({'Surname': ['surname1', 'surname2'], 
                                      'Brief description': ['brief_desc1', 'brief_desc2'],
                                      'Beer type': ['American pale ale', 'Light wheat beer'],
                                      'Deflection volume [L]': [20.0, 30.0],
                                      'Alc [% vol]': [6.0, 3.0],
                                      'Bitter value [IBU]': [33.0, 10.0],
                                      'Color [EBC]': [15, 12],
                                      'Aroma': ['strong', 'strong'],
                                      'Carbonation [g / l]': [3.0, 6.0]})
test_df = pd.DataFrame.from_dict(test_input)
test_df

Unnamed: 0,Surname,Brief description,Beer type,Deflection volume [L],Alc [% vol],Bitter value [IBU],Color [EBC],Aroma,Carbonation [g / l]
0,surname1,brief_desc1,American pale ale,20.0,6.0,33.0,15,strong,3.0
1,surname2,brief_desc2,Light wheat beer,30.0,3.0,10.0,12,strong,6.0


In [101]:
test_ip_matrix = scale_input_data(test_df)
test_ip_matrix.shape

(2, 6063)

In [102]:
pred = reg.predict(test_ip_matrix)
pred.shape

(2, 574)

In [103]:
pred_df = pd.DataFrame(pred, columns=colnames)
pred_df = rescale_predictions(pred_df)
pred_df

Unnamed: 0,variety_BEST Red X,variety_Bohemian floor malt,variety_Corn flakes,variety_Jerusalem artichoke,variety_Mairs Otter Malt,variety_Maris Otter Malts,variety_Munich malt type I.,variety_Munich malt type II,variety_Oak smoke wheat malt,variety_Pale ale malt,...,Temp. [° C].4,Bulk [g].5,Rest time [min].5,Resting temp. [° C].5,Start temp. [° C].5,Volume [L],Total cooking time [min],Post-polymerization time [min],Evaporation rate [% / h],Loss of spent grains [L]
0,0.0,0.006913,0.002307,0.00232,0.907669,0.011199,-0.033233,-0.014379,0.01115,-0.018935,...,0.688553,-124.956815,-0.374836,-1.949054,-1.799378,19.265675,56.579196,5.598546,9.331651,0.442078
1,0.002553,-0.018823,0.0,0.0,0.330665,-0.021677,-0.070289,-0.010586,0.021293,-0.089863,...,2.876304,459.841422,1.379546,7.173694,6.621716,30.097945,66.23776,8.280296,11.868687,0.954899
