In [1]:
import psycopg2
import pandas as pd
import numpy as np
import psycopg2.extras as extras
from ast import literal_eval
from psycopg2.extensions import register_adapter, AsIs

# Teach psycopg2 how to send numpy int64 values: AsIs emits the value's own
# string form, which is already a valid SQL integer literal. Uses the public
# names imported from psycopg2.extensions above instead of reaching into the
# private psycopg2._psycopg module.
register_adapter(np.int64, AsIs)

# Show up to 100 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 100

## Create Tables

In [3]:
# Bound up front so the name exists even when connect() itself fails; the
# clean-up cell at the end of the notebook tests `connection` for truthiness.
connection = None

try:
    # Connect to database
    # NOTE(review): the "##" values are redacted placeholders; supply real
    # settings from the environment or a prompt rather than committing them.
    connection = psycopg2.connect(
        host="##",
        database="##",
        user="##",
        password="##"
    )

    cursor = connection.cursor()

    # Drop dependent (join / child) tables before the tables they reference,
    # then recreate the whole schema from scratch.
    create_table_query = '''
        DROP TABLE IF EXISTS recipeIngredientJoin;
        DROP TABLE IF EXISTS cleanIngredients;
        DROP TABLE IF EXISTS rawIngredients;
        DROP TABLE IF EXISTS recipeTagJoin;
        DROP TABLE IF EXISTS tags;
        DROP TABLE IF EXISTS interactions;
        DROP TABLE IF EXISTS users;
        DROP TABLE IF EXISTS recipes;

        CREATE TABLE IF NOT EXISTS recipes (
          recipeID INTEGER PRIMARY KEY,
          name VARCHAR(255),
          minutes INTEGER,
          contributorID INTEGER,
          submittedDate DATE,
          tagsList TEXT [],
          nutritionList FLOAT [],
          numSteps INTEGER,
          stepsList TEXT [],
          description TEXT,
          ingredientNameList TEXT [],
          numIngredients INTEGER,
          i FLOAT,
          nameTokenList INTEGER [],
          ingredientTokenList TEXT,
          stepTokenList INTEGER [],
          techniqueList INTEGER [],
          calorieLevel FLOAT,
          ingredientIDList INTEGER [],
          isComplete BOOLEAN,
          calories FLOAT,
          fatPDV FLOAT,
          sugarPDV FLOAT,
          sodiumPDV FLOAT,
          proteinPDV FLOAT,
          saturatedFatPDV FLOAT,
          carbsPDV FLOAT
        );

        CREATE TABLE IF NOT EXISTS cleanIngredients (
          cleanIngredientID INTEGER PRIMARY KEY,
          name VARCHAR(255)
        );

        CREATE TABLE IF NOT EXISTS rawIngredients (
          rawIngredientID INTEGER PRIMARY KEY,
          cleanIngredientID INTEGER,
          rawName VARCHAR(255),
          rawNameLength INTEGER,
          processedName VARCHAR(255),
          processedNameLength INTEGER
        );

        CREATE TABLE IF NOT EXISTS recipeIngredientJoin (
           recipeID INTEGER NOT NULL,
           cleanIngredientID INTEGER NOT NULL,
           PRIMARY KEY (recipeID, cleanIngredientID),
           FOREIGN KEY (recipeID) REFERENCES recipes (recipeID),
           FOREIGN KEY (cleanIngredientID) REFERENCES cleanIngredients (cleanIngredientID)
        );

        CREATE TABLE IF NOT EXISTS tags (
          tagID INTEGER PRIMARY KEY,
          tag VARCHAR(255),
          numRecipes INTEGER
        );

        CREATE TABLE IF NOT EXISTS recipeTagJoin (
           recipeID INTEGER NOT NULL,
           tagID INTEGER NOT NULL,
           PRIMARY KEY (recipeID, tagID),
           FOREIGN KEY (recipeID) REFERENCES recipes (recipeID),
           FOREIGN KEY (tagID) REFERENCES tags (tagID)
        );

        CREATE TABLE IF NOT EXISTS interactions (
          interactionID INTEGER PRIMARY KEY,
          recipeID INTEGER,
          userID INTEGER,
          date DATE,
          rating INTEGER,
          review TEXT,
          FOREIGN KEY (recipeID) REFERENCES recipes (recipeID)
        );

        CREATE TABLE IF NOT EXISTS users (
          userID INTEGER PRIMARY KEY,
          techniquesList INTEGER [],
          itemsList INTEGER [],
          numItems INTEGER,
          ratingsList FLOAT [],
          numRatings INTEGER
        );

    '''

    cursor.execute(create_table_query)
    connection.commit()
    print("Table created successfully in PostgreSQL ")

except (Exception, psycopg2.Error) as error :
    print ("Error while connecting to PostgreSQL", error)
    # Discard any half-applied DDL so the session is not left inside an
    # aborted transaction.
    if connection is not None:
        connection.rollback()
    # Re-raise: every later cell needs a working connection and a fresh
    # schema, so continuing would only fail later with a less useful error.
    raise

Table created successfully in PostgreSQL 


## Insert Data into Tables

In [4]:
def _to_sql_value(value):
    """Map pandas/numpy missing-value markers to None; pass everything else through."""
    if value is pd.NaT or value is pd.NA:
        return None
    # NaN is the only float that is not equal to itself. The isinstance guard
    # keeps list-valued cells (array columns) away from the comparison.
    if isinstance(value, (float, np.floating)) and value != value:
        return None
    return value


def insert_data(conn, df, table):
    """
    Bulk-insert every row of a DataFrame into a table.

    Using psycopg2.extras.execute_values() to insert the dataframe
    Code from: https://naysan.ca/2020/05/09/pandas-to-postgresql-using-psycopg2-bulk-insert-performance-benchmark/

    Parameters
    ----------
    conn : database connection providing cursor(), commit() and rollback().
    df : pandas.DataFrame whose column names are the target column names.
    table : str, name of the table to insert into.

    Returns
    -------
    None on success. On failure the error is printed, the transaction is
    rolled back and 1 is returned (the exception is not re-raised).

    Notes
    -----
    Missing cells (NaN, NaT, pd.NA) are sent as None, i.e. SQL NULL, so gaps
    in the source CSVs do not reach the database as a float NaN.
    `table` and the column names are interpolated into the statement unquoted,
    so they must come from trusted code, never from user input.
    """
    # Create a list of tuples from the dataframe values, with gaps as None
    tuples = [tuple(_to_sql_value(cell) for cell in row) for row in df.to_numpy()]

    # Comma-separated dataframe columns
    cols = ','.join(list(df.columns))

    # SQL query to execute; the doubled %% survives this formatting pass as
    # the single %s placeholder that execute_values() fills with the rows.
    query  = "INSERT INTO %s(%s) VALUES %%s" % (table, cols)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
    except Exception as error:
        # psycopg2.DatabaseError derives from Exception, so this one clause
        # covers everything the original two-member tuple did.
        print("Error: %s" % error)
        conn.rollback()
        return 1
    finally:
        # Runs on both the success and the failure path.
        cursor.close()

    print("Inserted data into {} table.".format(table))

In [5]:
# Clean ingredients
cleanIngredients = pd.read_csv("Clean Recipe Data/cleanIngredients.csv")

# Use `connection`, the handle opened in the table-creation cell; no cell in
# this notebook defines a `conn`, so the old call fails on a fresh kernel.
insert_data(connection, cleanIngredients, "cleaningredients")

Inserted data into cleaningredients table.


In [6]:
# Raw ingredients
rawIngredients = pd.read_csv("Clean Recipe Data/rawIngredients.csv")

# Use `connection`, the handle opened in the table-creation cell; no cell in
# this notebook defines a `conn`, so the old call fails on a fresh kernel.
insert_data(connection, rawIngredients, "rawingredients")

Inserted data into rawingredients table.


In [7]:
# Tags
tags = pd.read_csv("Clean Recipe Data/tags.csv")

# Use `connection`, the handle opened in the table-creation cell; no cell in
# this notebook defines a `conn`, so the old call fails on a fresh kernel.
insert_data(connection, tags, "tags")

Inserted data into tags table.


In [8]:
# Recipes
recipes = pd.read_csv("Clean Recipe Data/recipes.csv", parse_dates=["submittedDate"])

# Columns stored in the CSV as the text of a Python list; each is parsed back
# into a real list before the insert.
list_columns = [
    "tagsList", "nutritionList", "stepsList", "ingredientNameList",
    "nameTokenList", "stepTokenList", "techniqueList", "ingredientIDList",
]
print("Converting columns to list:")
for col in list_columns:
    print(col)
    # A missing cell becomes "[]" so it parses to an empty list;
    # literal_eval only accepts Python literals, it does not run code.
    recipes[col] = recipes[col].fillna("[]").apply(lambda x: literal_eval(str(x)))

# Use `connection`, the handle opened in the table-creation cell; no cell in
# this notebook defines a `conn`, so the old call fails on a fresh kernel.
insert_data(connection, recipes, "recipes")

Converting columns to list:
tagsList
nutritionList
stepsList
ingredientNameList
nameTokenList
stepTokenList
techniqueList
ingredientIDList
Inserted data into recipes table.


In [9]:
# Recipe Ingredient Join
# Exact duplicate rows are dropped before the insert; the table's primary key
# is the (recipeID, cleanIngredientID) pair.
recipeIngredientJoin = pd.read_csv("Clean Recipe Data/recipeIngredientJoin.csv").drop_duplicates()

# Use `connection`, the handle opened in the table-creation cell; no cell in
# this notebook defines a `conn`, so the old call fails on a fresh kernel.
insert_data(connection, recipeIngredientJoin, "recipeIngredientJoin")

Inserted data into recipeIngredientJoin table.


In [10]:
# Recipe Tags Join
# Exact duplicate rows are dropped before the insert; the table's primary key
# is the (recipeID, tagID) pair.
recipeTagJoin = pd.read_csv("Clean Recipe Data/recipeTagJoin.csv").drop_duplicates()

# Use `connection`, the handle opened in the table-creation cell; no cell in
# this notebook defines a `conn`, so the old call fails on a fresh kernel.
insert_data(connection, recipeTagJoin, "recipeTagJoin")

Inserted data into recipeTagJoin table.


In [11]:
# Interactions
interactions = pd.read_csv("Clean Recipe Data/interactions.csv")

# Use `connection`, the handle opened in the table-creation cell; no cell in
# this notebook defines a `conn`, so the old call fails on a fresh kernel.
insert_data(connection, interactions, "interactions")

Inserted data into interactions table.


In [12]:
# users
users = pd.read_csv("Clean Recipe Data/users.csv")

# Columns stored in the CSV as the text of a Python list; each is parsed back
# into a real list before the insert.
list_columns_users = ["techniquesList", "itemsList", "ratingsList"]
print("Converting columns to list:")
for col in list_columns_users:
    print(col)
    # A missing cell becomes "[]" so it parses to an empty list;
    # literal_eval only accepts Python literals, it does not run code.
    users[col] = users[col].fillna("[]").apply(lambda x: literal_eval(str(x)))

# Use `connection`, the handle opened in the table-creation cell; no cell in
# this notebook defines a `conn`, so the old call fails on a fresh kernel.
insert_data(connection, users, "users")

Converting columns to list:
techniquesList
itemsList
ratingsList
Inserted data into users table.


In [13]:
# Tear-down: release the cursor first, then the connection it belongs to.
if connection:
    cursor.close()
    connection.close()
    print("PostgreSQL connection is closed")

PostgreSQL connection is closed
