In [1]:
# Program configuration
env = None
sample_size = None
# Set env, if env = test, will only be run locally and display the result
env = "prod"
env = "test"

# Nb model to generate
nb_models = 10
# Nb value to filter, if less than, we remove those
nb_min_value_before = 50

normalize_training = False
max_to_add = 1000

# Number of value on which to train, if null, train on all value
sample_size = None
test_size = 1000
nb_min_value_after = 200

forbidden_regexp="[0-9]+"

# If false, value might persist, good to rerun, bad for memory
garbage_collector = True

In [2]:
# On production environment, just use garbage collection
import os
import gc
import time
from datetime import datetime

if garbage_collector or env == "prod":
    os.environ['GC'] = 'true'
else:
    os.environ['GC'] = 'false'
    
def del_object(object):
    if os.environ['GC'] == 'true':
        del(object)
        gc.collect()

def pt(toPrint):
    print('{0} - {1}'.format(datetime.now(), toPrint))


In [3]:
# Read training data + test data
import pandas as pd

def read_data(env, sample_size):
    pt("start - read_data")
    df_data = pd.read_json("../input/train.json")
    df_test = pd.read_json("../input/test.json")
    # df_sample_input = pd.read_csv("../input/sample_submission.csv")
    if env == "prod":
        # set that to some default value
        df_test['cuisine'] = "todo"
        if sample_size is not None and sample_size > 0:
            df_data = df_data.sample(sample_size)
        else:
            df_data = df_data
    else:
        if sample_size is not None and sample_size > 0:
            df_data = df_data.sample(sample_size)
        df_test = df_data.sample(test_size)

        # Removing all df_test from df_data to ensure not train with test data
        df_common = df_data.merge(df_test,on=['id'])
        df_data = df_data[(~df_data.id.isin(df_common.id))]
    pt("end   - read_data")
    return df_data, df_test

In [4]:
import re

def process_value(myValue, regex):
    success = True
    result = myValue.lower()
    
    result = regex.sub('', result)
    # Remove trimming s
    if result.endswith('s'):
        result = result[:-1]
    
    success = len(result) > 2
    
    result = "res_" + result
    
    return result, success

def preprocess_dataframe(df1, df2, 
                         column_name = 'ingredients',
                         split_by = '\s+'):
    """
    Given 2 dataframe, extract the column ""
    """
    pt("start - preprocess_dataframe")
    count1 = len(df1)
    count2 = len(df2)
    
    total_df = df1.append(df2, ignore_index=True)
    
    d_list = []
    
    output = []
    
    # REGULAR EXPRESSION FOR CHARACGTER TO REMOVE IN PROCESS VALUE
    regex = re.compile('[^a-zA-Z]')
    
    for index, row in total_df.iterrows():
        for value_field in row[column_name]:
            # split by space
            # values = re.split(split_by, value_field)
            # Reverse split by non alphabetical character
            values = re.split(r"[^a-zA-Z]", value_field)
            
            current_values = []
            for value in values:
                
                # Remove all weird characters.
                processed_value, processed_success = process_value(value, regex)
                
                # Only add it if it was not already done
                if processed_value not in current_values and processed_success:
                    # build a dictionnary of all values
                    if processed_value not in output:
                        output.append(processed_value)

                    current_values.append(processed_value)
                    d_list.append({'id':row['id'], 
                                   'value':processed_value})

    total_df = total_df.append(d_list, ignore_index=True)
    total_df = total_df.groupby('id')['value'].value_counts()
    total_df = total_df.unstack(level=-1).fillna(0)
    
    # Then, we need to merge df_1 and df_2 with their id
    df1 = df1.merge(total_df, left_on='id', right_on='id', how='inner')
    df2 = df2.merge(total_df, left_on='id', right_on='id', how='inner')
    
    del(total_df)

    # We do not need the ingredients column now, so, we can remove it
    df1 = df1.drop(columns=column_name)
    df2 = df2.drop(columns=column_name)
    pt("end   - preprocess_dataframe")
    return df1, df2

In [5]:
def remove_occurence(df1, df2, min_value=0, exclude_columns=['cuisine', 'id']):
    """Given 2 dataframe, remove all occurence that happen less than x times"""
    pt("start - remove_occurence")
    total_df = df1.append(df2, ignore_index=True)
    # Removing the ignored columns
    total_df = total_df.drop(columns=exclude_columns)
    all_columns = list(total_df.columns.values)
    column_to_remove = []
    for column in all_columns:
        total = total_df[column].sum()
        #print(column + " {0}".format(total))
        if total < min_value:
            #print("remove: {0} for {1}".format(total, column))
            column_to_remove.append(column)
    #df1 = df1.drop(columns=column_to_remove)
    #df2 = df2.drop(columns=column_to_remove)
    print("{0} columns left. Removed {1} out of {2} columns.".format(len(all_columns)-len(column_to_remove),
                                                                    len(column_to_remove), len(all_columns)))
    pt("end   - remove_occurence")
    return df1, df2

In [6]:
def normalize_input(df_data, max_to_add=1000, column_name='cuisine'):
    pt("start - normalize_input")
    # Multiply the training data set for food where there is not that much data
    df_count = df_data.groupby(column_name)[column_name]
    max_recipe_count_per_cuisine = df_count.count().max()
    # Loop over all cuisine, if while < minimal, add this dataframe, then add a sample of those, to get exactly the same number
    print("Max value per cuisine = {0}".format(max_recipe_count_per_cuisine))

    for cuisine in df_data.cuisine.unique():
        # nb for cuisine
        df_cuisine = df_data.loc[df_data[column_name] == cuisine]
        recipe_count = df_cuisine.shape[0]
        nb_recipe_to_add = max_recipe_count_per_cuisine - recipe_count
        print("Got {0} value for {1}, need to add {2}".format(recipe_count, cuisine, nb_recipe_to_add))
        tmp_df = None
        nb_recipe_to_add = min(max_to_add, nb_recipe_to_add)
        if nb_recipe_to_add != 0:
            while nb_recipe_to_add != 0:
                if nb_recipe_to_add >= recipe_count:
                    # Add the full dataframe
                    if tmp_df is None:
                        tmp_df = df_cuisine
                    else:
                        tmp_df = tmp_df.append(df_cuisine, ignore_index=True)
                    nb_recipe_to_add -= recipe_count
                else:
                    # Only add a sample of it
                    if tmp_df is None:
                        tmp_df = df_cuisine
                    else:
                        tmp_df = tmp_df.append(df_cuisine.sample(nb_recipe_to_add), ignore_index=True)
                    nb_recipe_to_add = 0
                # Add tmp df to df_data
            df_data = df_data.append(tmp_df, ignore_index=True)
            print("Append a dataframe of {0} values".format(tmp_df.shape[0]))
            del(tmp_df)
    pt("end   - normalize_input")
    return df_data

In [7]:
df_data, df_test = read_data(env, sample_size)
display(df_data.sample(5))
df_data, df_test = preprocess_dataframe(df_data, df_test)
display(df_data.sample(5))

2018-09-15 22:21:21.677008 - start - read_data
2018-09-15 22:21:23.929559 - end   - read_data


Unnamed: 0,cuisine,id,ingredients
10675,italian,20755,"[chili flakes, salt, vegetables, shredded mozz..."
13157,french,19273,"[dried thyme, green onions, ground turkey, chi..."
20060,italian,849,"[boneless skinless chicken breasts, cayenne pe..."
13403,brazilian,11785,"[chicken wings, flour, salt, lime, vegetable o..."
28769,thai,5133,"[whitefish, gluten, extra virgin coconut oil, ..."


2018-09-15 22:21:23.947977 - start - preprocess_dataframe


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


2018-09-15 22:21:48.585818 - end   - preprocess_dataframe


Unnamed: 0,cuisine,id,res_abalone,res_abbamele,res_absinthe,res_abura,res_acai,res_accent,res_accompaniment,res_achiote,...,res_yum,res_yuzu,res_yuzukosho,res_zatarain,res_zero,res_zest,res_zesty,res_zinfandel,res_ziti,res_zucchini
31940,indian,46912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38427,italian,7185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7919,greek,18808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33753,italian,40089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12499,southern_us,24145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
df_data, df_test = remove_occurence(df_data, df_test, nb_min_value_before)
if normalize_training:
    df_data = normalize_input(df_data, max_to_add)
    df_data, df_test = remove_occurence(df_data, df_test, nb_min_value_after)

2018-09-15 22:21:48.741062 - start - remove_occurence
785 columns left. Removed 1955 out of 2740 columns.
2018-09-15 22:21:50.316914 - end   - remove_occurence


In [9]:
# Generate our training/validation datasets
from sklearn import model_selection
import sklearn

# Name of the result column
result_cols = ['cuisine']
result_excl_cols = ['cuisine_']

input_cols = [
    'res_'
]
input_excl_cols = []
# Removing input_cols = ['store', 'item',
# dom, cw, 

# Train on everything

# Get the final values
def get_values(df, cols=[], excl_cols = []):
    columns = df.columns.values
    # Remove all columns that are not inside the list
    cols_to_drop = []
    for column in columns:
        find = False
        ignore = False
        for excl_col in excl_cols:
            if column.startswith(excl_col):
                ignore = True
        if ignore is False:
            for col in cols:
                if column.startswith(col):
                    find = True
        if not find:
            cols_to_drop.append(column)
    print("dropping columns")
    df = df.drop(columns=cols_to_drop)
    print("end dropping columns")
    new_order = sorted(df.columns.values)
    # Same order for both training and testing set
    df = df[new_order]
    return df.values

df_data_shuffle = sklearn.utils.shuffle(df_data)

X_train = get_values(df_data, input_cols, input_excl_cols)
X_test = get_values(df_test, input_cols, input_excl_cols)

Y_train = get_values(df_data, result_cols, result_excl_cols).ravel()

del_object(df_data)
# In test env, we calculate it for the test only
if env == "test":
    Y_test = get_values(df_test, result_cols, result_excl_cols).ravel()
    

dropping columns
end dropping columns
dropping columns
end dropping columns
dropping columns
end dropping columns
dropping columns
end dropping columns


In [10]:
# Normalize the data


X_all = [x + y for x, y in zip(X_train, X_test)]

from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler() 

# Don't cheat - fit only on training data
# Def adding x_train + X_test + X_validation to fit all of them
scaler.fit(X_all)  

X_train = scaler.transform(X_train) 
X_test = scaler.transform(X_test) 

In [11]:
# Custom function to calculate the SMAPE
def get_score(Y_validation, Y_validation_predict):
    nb_success = 0
    for i in range(0, len(Y_validation)):
        if Y_validation[i] == Y_validation_predict[i]:
            nb_success += 1
    return nb_success / len(Y_validation) * 100

In [12]:
# Import algorithm
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

import lightgbm as lgbm

models = []

#models.append(('LogisticRegression', LogisticRegression()))
#models.append(('KNeighborsClassifier', KNeighborsClassifier()))
#models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
#models.append(('GaussianNB', GaussianNB()))
#models.append(('SVC', SVC()))

for i in range(5, 5 + nb_models):
    #models.append(('MLPClassifier_adamrelu_{0}'.format(i), MLPClassifier(hidden_layer_sizes=(i,), 
    #                                                            activation='relu', 
    #                                                            solver='adam',
    #                                                            alpha=0.001, 
    #                                                            batch_size='auto',
    #learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    #random_state=i, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    #early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)))
    
    # Try multiple solutions
    hidden_layers = (50, )
    
    models.append(('MLPClassifier_adamrelu_earlystopping_{0}'.format(i), MLPClassifier(hidden_layer_sizes=hidden_layers, 
                                                                activation='logistic', 
                                                                solver='adam',
                                                                alpha=0.001, 
                                                                batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=i, tol=0.00001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)))
#models = []
#models.append(('lgbm', lgbm.sklearn.LGBMRegressor()))
# High value until first model get solved
best_model = "UNKNOWN"

res = []
# Testing all models, one by one
for name, model in models:
    print("Executing for model {0}".format(name))
    time_start = datetime.now()

    # Training the model
    model.fit(X_train, Y_train)
    
    print("Finish fit for {0}".format(name))

    Y_test_result = model.predict(X_test)
    res.append(Y_test_result)
    if env == "test":
        # We can calculate the avg error
        score = get_score(Y_test, Y_test_result)
        print("Model {0} got score of {1}, time: {2}".format(name, score, datetime.now() - time_start))
    else:
        # Let's write an output file, with the name of the model
        print("Writing output file {0}.csv for model {0}".format(name))
        
        df_test['cuisine'] = Y_test_result
        result_df = df_test[['id', 'cuisine']]
        result_df['cuisine'] = Y_test_result
        
        result_df.to_csv("{0}.csv".format(name), index=False)
    del(model)

Executing for model MLPClassifier_adamrelu_earlystopping_5
Finish fit for MLPClassifier_adamrelu_earlystopping_5
Model MLPClassifier_adamrelu_earlystopping_5 got score of 76.3, time: 0:00:15.962253
Executing for model MLPClassifier_adamrelu_earlystopping_6
Finish fit for MLPClassifier_adamrelu_earlystopping_6
Model MLPClassifier_adamrelu_earlystopping_6 got score of 76.2, time: 0:00:15.036856
Executing for model MLPClassifier_adamrelu_earlystopping_7
Finish fit for MLPClassifier_adamrelu_earlystopping_7
Model MLPClassifier_adamrelu_earlystopping_7 got score of 75.8, time: 0:00:19.375624
Executing for model MLPClassifier_adamrelu_earlystopping_8




Finish fit for MLPClassifier_adamrelu_earlystopping_8
Model MLPClassifier_adamrelu_earlystopping_8 got score of 77.60000000000001, time: 0:00:14.427346
Executing for model MLPClassifier_adamrelu_earlystopping_9
Finish fit for MLPClassifier_adamrelu_earlystopping_9
Model MLPClassifier_adamrelu_earlystopping_9 got score of 77.4, time: 0:00:16.440599
Executing for model MLPClassifier_adamrelu_earlystopping_10
Finish fit for MLPClassifier_adamrelu_earlystopping_10
Model MLPClassifier_adamrelu_earlystopping_10 got score of 76.4, time: 0:00:12.214482
Executing for model MLPClassifier_adamrelu_earlystopping_11
Finish fit for MLPClassifier_adamrelu_earlystopping_11
Model MLPClassifier_adamrelu_earlystopping_11 got score of 75.0, time: 0:00:14.852795
Executing for model MLPClassifier_adamrelu_earlystopping_12
Finish fit for MLPClassifier_adamrelu_earlystopping_12
Model MLPClassifier_adamrelu_earlystopping_12 got score of 75.1, time: 0:00:13.991575
Executing for model MLPClassifier_adamrelu_earl

In [13]:
# For all result in res, if test, display the result, if not, write it to a file
final_res = []
nb_variable = len(res[0])
for variable in range(0, nb_variable):
    final_res.append(0.0)
    dict_cuisine = {}
    for i in range(0, len(res)):
        cuisine_found = res[i][variable]
        if cuisine_found in dict_cuisine:
            dict_cuisine[cuisine_found] += 1
        else:
            dict_cuisine[cuisine_found] = 1
    # Now, we need to find the most common one for all the values inside dict_cuisine
    current_value = 0
    current_cuisine = ""
    for cuisine in dict_cuisine:
        if dict_cuisine[cuisine] > current_value:
            current_cuisine = cuisine
            current_value = dict_cuisine[cuisine]
    
    final_res[variable] = current_cuisine

if env == "test":
    # We can calculate the avg error
    score = get_score(Y_test, final_res)
    print("avg model got score of {0}".format(score))
else:
    print("Writing output file merged.csv".format(name))

    df_test['cuisine'] = final_res
    result_df = df_test[['id', 'cuisine']]
    result_df['cuisine'] = final_res

    result_df.to_csv("merged.csv".format(name), index=False)

avg model got score of 78.7
