In [1]:
# the following will make jupyter display 'wider' (less side-to-side scrolling required)
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import pandas as pd 
import matplotlib.pyplot as plt
import os

### Utility functions

In [3]:
# There are lots of empty entries in the 'Bean_Type' column.
# They don't hurt anything, but can be confusing in any output.
def blank_to_unspecified(value):
    value = value.strip()
    if value == '':
        value = 'unspecified'
    return value

In [4]:
# This function is a hack. I'm not positive, but I'm pretty sure
# that Neural Networks must have a binary Target field.
# So this is my way of identifying any ratings which fall
# within the desired range.
#
# return True if rating is >= 400
# else return False
#
# Note that when I tried using actual bool return types (i.e. True & False)
# I got some wierd errors. This probably needs improvement. It would also
# be cool if we could make this more configurable, so the user could
# create different neural nets that look for different ranges of 'Rating'
def bin_ratings(rating):
    if rating == 5.75: return 1
    if rating == 5.50: return 1
    if rating == 5.25: return 1
    if rating == 5.00: return 1
    
    if rating == 4.75: return 1
    if rating == 4.50: return 1
    if rating == 4.25: return 1
    if rating == 4.00: return 1
    
    if rating == 3.75: return 0
    if rating == 3.50: return 0
    if rating == 3.25: return 0
    if rating == 3.00: return 0

    if rating == 2.75: return 0
    if rating == 2.50: return 0
    if rating == 2.25: return 0
    if rating == 2.00: return 0
    
    if rating == 1.75: return 0
    if rating == 1.50: return 0
    if rating == 1.25: return 0
    if rating == 1.00: return 0
    
    #print( f"error: rating={rating} type={type(rating)}" )
    return "2"


#     value = int(float(rating) * 100)
#     if(value >= 400):
#         return "1"
#     else:
#         return "0"

In [5]:
# This function performs the fit_transform step using whichever
# scaler object is passed in (e.g. StandardScaler or MinMaxScaler)
def do_scale(scaler, X_train, X_test):
    # Scale the data
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.fit_transform(X_test)
    return X_train_scaled, X_test_scaled

In [6]:
# Determine which values to replace if counts are less than 'threshold'
def reduce_count_vals(df, colname, threshold):
    counts = df[colname].value_counts()
    replace_list = list(counts[counts < threshold].index)

    # Replace in dataframe
    for item in replace_list:
       df[colname] = df[colname].replace(item,"Other")
    
    # Check to make sure binning was successful
    #df[colname].value_counts()

In [7]:
# This function automatically finds the columns
# which are objects (since they are most likely 'strings')
# does the following:
# - gets the column names
# - instantiates a OneHotEncoder object
# - creates a new dataframe with just the column names from above
# - encodes just those features
# - merges the new one-hot-encoded columns back into the original dataframe
# - drops the original column names from the dataframe
# - createa a pd.Series from the 'Target' column
# - drops the target column
# - creates the X features
# - returns the munged dataframe (df), the features (X), and the target (y)
def do_one_hot(df, y_col):
    # Create a list of columns that are 'object' type
    obj_cat = df.dtypes[df.dtypes == "object"].index.tolist()
    # Create a OneHotEncoder instance
    enc = OneHotEncoder(sparse=False)
    # Fit and transform the OneHotEncoder using the categorical variable list
    encode_df = pd.DataFrame(enc.fit_transform(df[obj_cat]))
    # Add the encoded variable names to the DataFrame
    encode_df.columns = enc.get_feature_names(obj_cat)
    
    # Merge one-hot encoded features and drop the originals
    df = df.merge(encode_df, left_index=True, right_index=True)
    df = df.drop(obj_cat,1)
    
    # Split our preprocessed data into our features and target arrays
    y = df[y_col].values
    df.drop(columns=[y_col], inplace=True)
    X = df.values
    print(X)
    print(f"merged df.shape()={df.shape}")
    
    return df, X, y

In [8]:
# This function automatically creates the Neural Network
# input, hidden, and output layers based on parameters
# stored in the 'layers' dictionary
def build_model(inputs, layers):
    nn = tf.keras.models.Sequential()
    first = True
    for layer in layers:
        if first:
            first = False
            nn.add(tf.keras.layers.Dense(units=layer['units'], activation=layer['act'], input_dim=inputs))
        else:
            nn.add(tf.keras.layers.Dense(units=layer['units'], activation=layer['act']))

    print(nn.summary())
    nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return nn

In [9]:
# This function will automatically perform splitting and training
# of the the Neural Net model. It returns 'fit_model' in case
# it's needed for later steps in the flow
#
# FIXME: There's a bug in here somewhere. I can't use the 'checkpoints_dir'
# parameter below, because the '{epoch:02d}.hdf5' stuff dies if the line
# of code looks like this:
#        checkpoint_path = f"{checkpoints_dir}/weights_2.{epoch:02d}.hdf5"
# something about the first pair of curly braces vs. the second pair
# inside the f-string. So I just hard-coded the dir to always be 'checkpoints'

def train_nn_model(model, X_train, y_train, n_epochs, checkpoint_dir):
    # Create a callback that saves the model's weights every epoch
    from tensorflow.keras.callbacks import ModelCheckpoint
    
    # Define the checkpoint path and filenames
    os.makedirs("checkpoints",exist_ok=True)
    #checkpoint_file = f"weights.{epoch:02d}.hdf5"
    #checkpoint_path = f"checkpoints_opt/weights.{epoch:02d}.hdf5"
    checkpoint_path = "checkpoints/weights_2.{epoch:02d}.hdf5"
    
    cp_callback_opt = ModelCheckpoint(
        filepath=checkpoint_path,
        verbose=1,
        save_weights_only=True,
        period=5)

    # Normally we use 'save_freq', but it behaves strangely, and I could not get
    # it to save every 5 epochs. The 'period' param is now deprecated, but it works.
        #save_freq='epoch')
        
    fit_model = model.fit(X_train, y_train, epochs=n_epochs, callbacks=[cp_callback_opt])
    return fit_model

In [10]:
# This is no longer used, since I now drop 'Company' and 'Bean_Origin_or_Bar_Name',
def bin_names(name,map,counter):
    if name in map:
        return map[name]
    else:
        counter[0] = counter[0] + 1
        map[name] = counter[0]
        return map[name]

In [11]:
# This is a hack to reduce the number of entries in the 'Cocoa_Percent' column.
def bin_percentiles(percentile):
    percentile = int(float(percentile.replace('%', '')))
    if(percentile >= 95): return "95"
    if(percentile >= 90): return "90"
    if(percentile >= 85): return "85"
    if(percentile >= 80): return "80"
    if(percentile >= 75): return "75"
    if(percentile >= 70): return "70"
    if(percentile >= 65): return "65"
    if(percentile >= 60): return "60"
    if(percentile >= 55): return "55"
    return "50_or_below"

In [12]:
# The scatter plots are mildly interesting, but any 'size' other than
# the default of '1' makes it too messy
def do_scatter_plots(df):
    #fig, axes = plt.subplots(nrows=8, ncols=8, figsize=(24,24))
    colnames = df.columns
    ncols = len(colnames)
    for xx in range(0,ncols):
        for yy in range(xx+1,ncols):
            #if xx == yy:
            #    continue
            titlestring = f"x={colnames[xx]} vs. y={colnames[yy]}"
            #df.plot.scatter(ax=axes[xx,yy], x=colnames[xx], y=colnames[yy], title=titlestring, c=df['IS_SUCCESSFUL'], s=(df['ASK_AMT']*20), colormap='winter')
            df.plot.scatter(
                figsize=(6,6),
                x=colnames[xx], 
                y=colnames[yy], 
                title=titlestring,
                xlabel=colnames[xx],
                ylabel=colnames[yy],
                c=df['Rating'], 
                colormap='winter'
            )
                #s=(df['Rating']), 



In [13]:
# For some reason, I kept getting errors in one-hot encoding unless I 
# forced 'Review Date' to be interpreted as a string (even though it was
# a string after the read_csv() step - FIXME)
def to_string(value):
    other = f"_{value}_"
    return other

# Extract, Transform, Load stuff (but the Load into a DB piece is missing)

In [14]:
def do_ETL():
    df = pd.read_csv("Resources/flavors_of_cacao.csv")

    # replace ' ' with '_' in column names
    colnames = [
            'Company',
            'Bean_Origin_or_Bar_Name',
            'REF',
            'Review_Date',
            'Cocoa_Percent',
            'Company_Location',
            'Rating',
            'Bean_Type',
            'Broad_Bean_Origin'
    ]
    df.columns = colnames

    df.dropna(axis='rows', how='any', inplace=True)

    company_names = {}
    company_counter = [0]
    origin_names = {}
    origin_counter = [0]

    #df['Company']           = df['Company'].apply(bin_names, args=(company_names, company_counter))
    #df['BeanBarName']       = df['BeanBarName'].apply(bin_names, args=(origin_names, origin_counter))
    df.drop(columns=[
        'Company',
        'Bean_Origin_or_Bar_Name',
        'REF',
        'Company_Location'
    ], inplace=True)

    df['Review_Date']       = df['Review_Date'].apply(to_string)
    reduce_count_vals(df, 'Review_Date', 100)

    df['Cocoa_Percent']     = df['Cocoa_Percent'].apply(bin_percentiles)

    #reduce_count_vals(df, 'Company_Location', 38)

    df['Rating']            = df['Rating'].apply(bin_ratings)

    df['Bean_Type']         = df['Bean_Type'].apply(blank_to_unspecified)
    reduce_count_vals(df, 'Bean_Type', 10)

    reduce_count_vals(df, 'Broad_Bean_Origin', 40)
    
    return df

# Create and Train the Neural Network stuff

In [15]:
def do_NN(df):
    df, X, y = do_one_hot(df, 'Rating')

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
    
    print(f"X_train={X_train}")
    print(f"y_train={y_train}")

    #X_train_scaled, X_test_scaled = do_scale(StandardScaler(), X_train, X_test)
    X_train_scaled, X_test_scaled = do_scale(MinMaxScaler(), X_train, X_test)

    nins = df.shape[1]
    print(f"nins={nins}")
    nn_model = build_model(inputs=nins, layers=[
        #{'units': 512, 'act': 'relu'},
        {'units': 256, 'act': 'relu'},
        {'units': 128, 'act': 'relu'},
        {'units': 64, 'act': 'relu'},
        {'units': 32, 'act': 'relu'},
        {'units': 16, 'act': 'relu'},
        {'units': 8, 'act': 'relu'},
        {'units': 4, 'act': 'relu'},
        {'units': 1, 'act': 'sigmoid'},

    ])
    trained_model = train_nn_model(nn_model, X_train_scaled, y_train, n_epochs=100, checkpoint_dir="checkpoints")

    model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

    nn_model.save("ChocoloateBarRatings_opt1.h5")


# Do the actual work

In [16]:
df = do_ETL()
do_NN(df)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]
merged df.shape()=(1791, 42)
X_train=[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
y_train=[0 0 0 ... 0 0 0]
nins=42
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               11008     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
___



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: saving model to checkpoints\weights_2.05.hdf5
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: saving model to checkpoints\weights_2.10.hdf5
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: saving model to checkpoints\weights_2.15.hdf5
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

Epoch 00020: saving model to checkpoints\weights_2.20.hdf5
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

Epoch 00025: saving model to checkpoints\weights_2.25.hdf5
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: saving model to checkpoints\weights_2.30.hdf5
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100

Epoch 00035: saving model to checkpoints\weights_2.35.hdf5
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

Epoch 00040: saving model to checkpoints\weights_2.40.hdf5
Epoch 41/

Epoch 75/100

Epoch 00075: saving model to checkpoints\weights_2.75.hdf5
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100

Epoch 00080: saving model to checkpoints\weights_2.80.hdf5
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100

Epoch 00085: saving model to checkpoints\weights_2.85.hdf5
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100

Epoch 00090: saving model to checkpoints\weights_2.90.hdf5
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100

Epoch 00095: saving model to checkpoints\weights_2.95.hdf5
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100

Epoch 00100: saving model to checkpoints\weights_2.100.hdf5
14/14 - 0s - loss: 0.7793 - accuracy: 0.9286
Loss: 0.7792798280715942, Accuracy: 0.9285714030265808


### How many 'Bean Types' are there?

In [17]:
df = pd.read_csv("Resources/flavors_of_cacao.csv")
num_beans = len(df['Bean_Type'].value_counts())
print(f"\nThere are {num_beans} different 'Bean_Type' entries, even though only Criollo, Forastero, and Trinitario species exist\n")
df['Bean_Type'].value_counts()


There are 41 different 'Bean_Type' entries, even though only Criollo, Forastero, and Trinitario species exist



                            887
Trinitario                  419
Criollo                     153
Forastero                    87
Forastero (Nacional)         52
Blend                        41
Criollo, Trinitario          39
Forastero (Arriba)           37
Criollo (Porcelana)          10
Trinitario, Criollo           9
Forastero (Parazinho)         8
Forastero (Arriba) ASS        6
Nacional (Arriba)             3
Matina                        3
EET                           3
Beniano                       3
Criollo (Ocumare 61)          2
Trinitario, Forastero         2
Trinitario (85% Criollo)      2
Forastero (Catongo)           2
Criollo, Forastero            2
Amazon, ICS                   2
Criollo (Amarru)              2
Amazon mix                    2
Nacional                      2
Criollo (Ocumare)             1
Criollo (Ocumare 67)          1
Trinitario, TCGA              1
Trinitario (Amelonado)        1
Trinitario, Nacional          1
Forastero (Amelonado)         1
Foraster

### How many 'Broad_Bean_Origins' are there?

In [18]:
df = pd.read_csv("Resources/flavors_of_cacao.csv")
num_origins = len(df['Broad_Bean_Origin'].value_counts())
print(f"\nThere are {num_origins} different 'Broad_Bean_Origin' entries\n")
df['Broad_Bean_Origin'].value_counts()


There are 100 different 'Broad_Bean_Origin' entries



Venezuela                214
Ecuador                  193
Peru                     165
Madagascar               145
Dominican Republic       141
                        ... 
Peru, Belize               1
Peru, Mad., Dom. Rep.      1
PNG, Vanuatu, Mad          1
Trinidad, Ecuador          1
Venezuela, Carribean       1
Name: Broad_Bean_Origin, Length: 100, dtype: int64

### Scatter plots on original, unmodified data

In [19]:
# FIXME: this used to work yesterday, before all my data-cleaning hacks. Do not use now.
#df = pd.read_csv("Resources/flavors_of_cacao.csv")
#do_scatter_plots(df)