# Machine Learning Section
## Generates the ingredient-similarity dataset using cosine similarity and stores the results as pickle files.
The source CSV files should already be stored in the `rawdata` folder. If they are not, this notebook runs `data_download.ipynb`, which can take several minutes.

In [10]:
import csv
import pandas as pd
import unidecode
from bitarray import bitarray
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path

### File Locations

In [11]:
# Folder, relative to the current working directory, that holds the source CSV files.
import_location = "rawdata"
# Folder, relative to the current working directory, where the pickled outputs are written.
save_location = "processed"

# This cell will generate the recommendation matrix using Cosine Similarity.
## It first checks to see if the source data is located in /rawdata and, if not, calls data_download.ipynb to download it. THIS TAKES SEVERAL MINUTES.

## Once the data is available, it loads the CSV files into memory and transforms the tables. Cosine similarity is then computed on the resulting data.

In [12]:
def check_files():
    try:
        location = Path.cwd() / import_location / 'alcoholic_full.csv'
        open(location, mode='r', encoding='utf-8')
        location = Path.cwd() / import_location / 'ingredients.csv'
        open(location, mode='r', encoding='utf-8')
    except FileNotFoundError as err:
        %run ./data_download.ipynb


#  Load alcoholic drink recipe CSV into dict
check_files() # makes sure the files exist and downloads them if not.
location = Path.cwd() / import_location / 'alcoholic_full.csv'
# newline='' leaves newline handling to the csv module, so line breaks that
# sit inside quoted fields are parsed as part of the field.
with open(location, mode='r', encoding='utf-8', newline='') as infile:
    reader = csv.reader(infile)
    array_headers = []  # column 1 of every data row, in file order
    alcoholic_recipe_list = []  # columns 17-31 of every data row, in file order
    alcoholic_recipe_dict = {}  # column 1 -> columns 17-31
    next(reader, None)  # skip the header row

    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; there is nothing to index.
            continue
        # Column 1 is used as the recipe key (presumably the drink name) and
        # columns 17-31 are consumed later as that recipe's ingredient names.
        # NOTE(review): rows that share a key overwrite each other in the dict
        # but are all kept in the two lists - confirm keys are unique.
        array_headers.append(row[1])
        alcoholic_recipe_dict[row[1]] = row[17:32]
        alcoholic_recipe_list.append(row[17:32])

#  Load ingredient CSV into list
location = Path.cwd() / import_location / 'ingredients.csv'
# newline='' leaves newline handling to the csv module, so line breaks that
# sit inside quoted fields are parsed as part of the field.
with open(location, mode='r', encoding='utf-8', newline='') as infile:
    reader = csv.reader(infile)
    next(reader, None)  # skip the header row
    # Keep only the first column, lower-cased, trimmed and transliterated by
    # unidecode; recipe ingredients are normalised the same way before lookup.
    ingredientlist = [
        unidecode.unidecode(row[0].lower().strip())
        for row in reader
        if row  # csv.reader yields [] for a blank line
    ]

#  Create new dict with ingredients
# One flag per recipe row. The length is taken from alcoholic_recipe_list
# rather than alcoholic_recipe_dict: flags are addressed by row position, and
# the dict holds fewer entries than there are rows whenever two rows share a
# key, which would make the arrays too short to index.
recipe_count = len(alcoholic_recipe_list)
alcoholic_drink_dict = {}
for ingredient in ingredientlist:
    emptyarray = bitarray(recipe_count)
    emptyarray.setall(0)  # bitarray(n) is not zero-initialised, so clear every bit
    alcoholic_drink_dict[ingredient] = emptyarray

#  Set flags for each recipe per ingredient.
for position, fields in enumerate(alcoholic_recipe_list):
    for raw_name in fields:
        # The literal value 'Bailey' is mapped by hand; every other value is
        # trimmed, lower-cased and transliterated before the lookup.
        name = (
            'baileys irish cream'
            if raw_name == 'Bailey'
            else unidecode.unidecode(raw_name.strip().lower())
        )
        if not name:
            # Empty ingredient slot - nothing to flag.
            continue
        # The bitarray is mutated in place, so no write-back to the dict is
        # needed. A name missing from the dict raises KeyError.
        alcoholic_drink_dict[name][position] = 1

#  Build prune list for unused ingredients
# Collected up front because entries cannot be deleted from the dict while it
# is being iterated.
unused_ingredients = [
    name
    for name, flags in alcoholic_drink_dict.items()
    if flags.count(1) == 0
]

#  Prune unused ingredients
for name in unused_ingredients:
    alcoholic_drink_dict.pop(name)

#  Build dataframe of recipes by ingredient
# One column per recipe header and one row per remaining ingredient; each cell
# is that ingredient's flag for the recipe at the same position.
data = {header: [] for header in array_headers}
indexes = list(alcoholic_drink_dict.keys())
for name in indexes:
    flags = alcoholic_drink_dict[name]
    for position, header in enumerate(array_headers):
        data[header].append(flags[position])
df = pd.DataFrame(data, index=indexes)

# Pairwise cosine similarity between the rows of df, labelled on both axes
# with df's index so entries can be looked up by ingredient name.
ingredient_matrix = cosine_similarity(df)
df_sims = pd.DataFrame(ingredient_matrix, index=df.index, columns=df.index)

# Save the resulting data for later use.
output_dir = Path.cwd() / save_location
# Create the output folder on first run; open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError.
output_dir.mkdir(parents=True, exist_ok=True)
outputs = {
    'cosine_matrix.pickle': df_sims,  # ingredient-by-ingredient similarity frame
    'ingredient_dict.pickle': df,  # ingredient-by-recipe flag frame
    'recipe_dict.pickle': alcoholic_recipe_dict,  # recipe key -> ingredient fields
}
for filename, payload in outputs.items():
    location = output_dir / filename
    with open(location, 'wb') as modelFile:
        pickle.dump(payload, modelFile)