# 1. Acquiring Data
### Data 1: FlavorDB
#### ref: "Food Pairing and Data Science," Vincent Choo, link=https://vchoo.github.io/

In [32]:
# for basic data science
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

# for downloading files off the internet
import urllib.request
import json
import time

# for network graphs
from colour import Color
from matplotlib.collections import LineCollection
import networkx as nx
import mpld3

In [33]:
# JSON files are at addresses of this form
def flavordb_entity_url(x):
    """
    Build the address of the FlavorDB JSON document for entity id `x`.

    `x` is converted with str() and appended as the `id` query parameter.
    """
    return f"https://cosylab.iiitd.edu.in/flavordb2/entities_json?id={x!s}"


# translates the JSON file at the specified web address into a dictionary
def get_flavordb_entity(x):
    """
    Download the FlavorDB JSON document for entity id `x` and parse it.

    Returns whatever json.loads() produces for the response body.
    Errors raised by urllib.request.urlopen() (for example
    urllib.error.HTTPError for a missing page) are not caught here and
    propagate to the caller.
    """
    # source: https://stackoverflow.com/questions/12965203/how-to-get-json-from-webpage-into-python-script
    with urllib.request.urlopen(flavordb_entity_url(x)) as url:
        # The former trailing `return None` could never run: the `with` body
        # either returns here or raises.
        return json.loads(url.read().decode())

In [34]:
# the names of the "columns" in the raw JSON objects
def flavordb_entity_cols():
    """
    Return the keys read from each raw FlavorDB JSON object, in column order.

    A new list is built on every call.
    """
    identity_fields = ['entity_id', 'entity_alias_readable', 'entity_alias_synonyms']
    descriptive_fields = ['natural_source_name', 'category_readable', 'molecules']
    return identity_fields + descriptive_fields


# what we want to rename the JSON object "columns" to
def flavordb_df_cols():
    """
    Return the DataFrame column names used for the food-entity table.

    A new list is built on every call.
    """
    identity_cols = ['entity id', 'alias', 'synonyms']
    descriptive_cols = ['scientific name', 'category', 'molecules']
    return identity_cols + descriptive_cols


# "subcolumns" in the "molecules" column that we are interested in
def molecules_df_cols():
    """
    Return the DataFrame column names used for the molecules table.

    A new list is built on every call.
    """
    names = ('pubchem id', 'common name', 'flavor profile')
    return list(names)

In [35]:
def clean_flavordb_dataframes(flavor_df, molecules_df):
    """
    Helps ensure consistent intra-column typing and converts all strings to lowercase.

    Parameters:
        flavor_df: DataFrame with at least the columns 'entity id', 'alias',
            'scientific name', 'category' and 'synonyms'.
        molecules_df: DataFrame with at least the columns 'pubchem id' and
            'flavor profile'; each 'flavor profile' cell must be an iterable
            of strings.

    Returns:
        A two-element list [flavor_df_clean, molecules_df_clean] holding the
        first row per 'entity id' and per 'pubchem id' respectively.

    Note: the cleaned columns are assigned back onto the frames passed in,
    so the caller's objects are modified as well.
    """
    # Local import: only needed here, to parse stringified sets without eval().
    import ast

    # ensuring that these columns have type str
    for k in ['alias', 'scientific name', 'category']:
        flavor_df[k] = [
            elem.strip().lower() if isinstance(elem, str) else ''
            for elem in flavor_df[k]
        ]

    def map_to_synonyms_set(elem):
        """Convert one raw 'synonyms' cell into a set of strings."""
        if isinstance(elem, set):
            return elem  # already a set: keep it as-is
        if not isinstance(elem, str):
            return set()  # neither a string nor a set (e.g. None / NaN)

        text = elem.strip()
        if text == '' or text == 'set()':
            # 'set()' is how an empty set is rendered as text (e.g. in a CSV
            # written from a set-valued column); treat it as empty too.
            return set()

        if text[0] == '{' and text[-1] == '}':
            # The string looks like the repr of a set, e.g. "{'a', 'b'}".
            # literal_eval only accepts Python literals, so text taken from a
            # file or the web cannot execute code the way eval() could.
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return set()  # not a valid literal -> empty set
            # "{}" and "{'a': 1}" parse as dicts; normalise to a set.
            return parsed if isinstance(parsed, set) else set(parsed)

        # Otherwise, assume it's a comma-separated string
        return set(text.lower().split(', '))

    flavor_df['synonyms'] = [
        map_to_synonyms_set(elem)
        for elem in flavor_df['synonyms']
    ]

    molecules_df['flavor profile'] = [
        set([x.strip().lower() for x in elem])
        for elem in molecules_df['flavor profile']
    ]

    return [
        flavor_df.groupby('entity id').first().reset_index(),
        molecules_df.groupby('pubchem id').first().reset_index()
    ]

In [36]:
# generate dataframes from some of the JSON objects
def get_flavordb_dataframes(start, end):
    """
    Download JSON data, converts it to DataFrames, and cleans them.

    For every loop index i in range(start, end) the entity with id i + 1 is
    requested.

    Returns:
        [flavordb_df, molecules_df, missing] where the two DataFrames have
        been passed through clean_flavordb_dataframes() and `missing` lists
        the loop indices whose request answered with HTTP 404.

    Raises:
        RuntimeError: if a request fails with an HTTP status other than 404
            (the original HTTPError is attached as the cause).
    """
    # make intermediate values to make dataframes from
    flavordb_data = []
    molecules_dict = {}
    missing = []  # loop indices of the missing JSON files during iteration

    flavordb_cols = flavordb_entity_cols()

    for i in range(start, end):
        entity_id = i + 1  # the id actually requested for loop index i

        # we use a try-except here because some of the JSON pages are missing
        try:
            # 1: Find the JSON file. Gets the entity as a JSON dict
            fdbe = get_flavordb_entity(entity_id)
        except urllib.error.HTTPError as e:
            if e.code == 404:  # if the JSON file is missing
                # NOTE(review): this records the loop index i, not the
                # requested id i + 1; kept as-is so existing consumers of
                # `missing` see the same values. Confirm which is intended.
                missing.append(i)
                continue
            # Previously this message was built from an undefined name `x`,
            # so the intended RuntimeError surfaced as a NameError instead.
            raise RuntimeError(
                'Error while fetching JSON object from ' + flavordb_entity_url(entity_id)
            ) from e

        # get only the relevant fields (columns) of the dict
        flavordb_series = [fdbe[k] for k in flavordb_cols[:-1]]
        flavordb_series.append(  # the last column becomes a set of pubchem ids
            set([m['pubchem_id'] for m in fdbe['molecules']])
        )
        flavordb_data.append(flavordb_series)

        # update the molecules table with the data in the 'molecules' field;
        # the first occurrence of each pubchem id wins
        for m in fdbe['molecules']:
            if m['pubchem_id'] not in molecules_dict:
                molecules_dict[m['pubchem_id']] = [
                    m['common_name'],
                    set(m['flavor_profile'].split('@'))
                ]

    # generate the dataframes
    flavordb_df = pd.DataFrame(
        flavordb_data,
        columns=flavordb_df_cols()
    )
    molecules_df = pd.DataFrame(
        [
            [k, v[0], v[1]]
            for k, v in molecules_dict.items()
        ],
        columns=molecules_df_cols()
    )

    # clean up the dataframe columns
    flavordb_df, molecules_df = clean_flavordb_dataframes(flavordb_df, molecules_df)

    return [flavordb_df, molecules_df, missing]

In [37]:
def update_flavordb_dataframes(df0, df1, ranges,
                               flavordb_path='flavordb.csv',
                               molecules_path='molecules.csv'):
    """
    Download every (start, end) range, append the results to the given
    frames, and save both frames as CSV files.

    Logs progress to the console while processing ranges.

    Parameters:
        df0: existing food-entity DataFrame to extend.
        df1: existing molecules DataFrame to extend.
        ranges: sequence of (start, end) pairs, each passed to
            get_flavordb_dataframes().
        flavordb_path: where the food-entity CSV is written.
        molecules_path: where the molecules CSV is written.

    Returns:
        (flavordb_df, molecules_df, missing) with the accumulated data.

    If a download raises, the exception propagates unchanged, but whatever
    was collected up to that point is still written to the CSV files and the
    elapsed time is still printed.
    """
    # Collect the pieces and concatenate once at the end instead of
    # re-copying the growing frames on every iteration.
    flavordb_parts = [df0]
    molecules_parts = [df1]
    missing = []

    # Time the download process
    start = time.time()

    try:
        for idx, (a, b) in enumerate(ranges):
            print(f"Processing range {a}-{b} ({idx + 1}/{len(ranges)})...")
            df0_new, df1_new, missing_new = get_flavordb_dataframes(a, b)

            flavordb_parts.append(df0_new)
            molecules_parts.append(df1_new)
            missing.extend(missing_new)
    finally:
        # Runs on success and on failure, so partial progress is never lost.
        flavordb_all = pd.concat(flavordb_parts, ignore_index=True)
        molecules_all = pd.concat(molecules_parts, ignore_index=True)

        # Save the DataFrames as CSV files
        flavordb_all.to_csv(flavordb_path, index=False)
        molecules_all.to_csv(molecules_path, index=False)

        end = time.time()
        mins = (end - start) / 60.0
        print(f'Downloading took: {mins:.2f} minutes')

    return flavordb_all, molecules_all, missing

In [38]:
# Start from empty frames that already carry the expected column layout.
df0 = pd.DataFrame(columns=flavordb_df_cols())
df1 = pd.DataFrame(columns=molecules_df_cols())

# Twenty consecutive 50-wide windows: (0, 50), (50, 100), ..., (950, 1000).
ranges = [(lower, lower + 50) for lower in range(0, 1000, 50)]

# Download every window, then save both frames as CSV files. The call is the
# last expression of the cell, so its return value is displayed as output.
update_flavordb_dataframes(df0, df1, ranges)

Processing range 0-50 (1/20)...
Processing range 50-100 (2/20)...
Processing range 100-150 (3/20)...
Processing range 150-200 (4/20)...
Processing range 200-250 (5/20)...
Processing range 250-300 (6/20)...
Processing range 300-350 (7/20)...
Processing range 350-400 (8/20)...
Processing range 400-450 (9/20)...
Processing range 450-500 (10/20)...
Processing range 500-550 (11/20)...
Processing range 550-600 (12/20)...
Processing range 600-650 (13/20)...
Processing range 650-700 (14/20)...
Processing range 700-750 (15/20)...
Processing range 750-800 (16/20)...
Processing range 800-850 (17/20)...
Processing range 850-900 (18/20)...
Processing range 900-950 (19/20)...
Processing range 950-1000 (20/20)...
Downloading took: 37.35 minutes


(    entity id             alias  \
 0           1   bakery products   
 1           2             bread   
 2           3         rye bread   
 3           4     wheaten bread   
 4           5       white bread   
 ..        ...               ...   
 930       970   saskatoon berry   
 931       971    nanking cherry   
 932       972  japanese pumpkin   
 933       977        guinea hen   
 934       978         cucurbita   
 
                                               synonyms scientific name  \
 0                                    {bakery products}        poacceae   
 1                                              {bread}        poacceae   
 2                                          {rye bread}             rye   
 3                            {soda farls, soda scones}           wheat   
 4                                        {white bread}           wheat   
 ..                                                 ...             ...   
 930  {pacific serviceberry, western serv

In [39]:
# Load the datasets written by the download step
flavordb = pd.read_csv("flavordb.csv")
molecules = pd.read_csv("molecules.csv")

# Inspect the structure of each dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# on its own line; wrapping it in print() added a stray "None" to the output.
print("Flavordb Dataset:")
print(flavordb.head(), "\n")
flavordb.info()
print("\n")

print("Molecules Dataset:")
print(molecules.head(), "\n")
molecules.info()
print("\n")

# Summary statistics
print("Flavordb Numerical Summary:")
print(flavordb.describe(), "\n")

print("Molecules Numerical Summary:")
print(molecules.describe(), "\n")

# Categorical column summaries (e.g., unique values)
print("Flavordb Categorical Columns Summary:")
for col in ['alias', 'scientific name', 'category']:
    print(f"Column: {col}")
    print(f"Unique Values: {flavordb[col].nunique()}")
    print(f"Sample Values: {flavordb[col].unique()[:5]}\n")

print("Molecules Categorical Columns Summary:")
for col in ['common name', 'flavor profile']:
    print(f"Column: {col}")
    print(f"Unique Values: {molecules[col].nunique()}")
    print(f"Sample Values: {molecules[col].unique()[:5]}\n")

# Missing values, as a percentage of rows per column
print("Missing Values in Flavordb Dataset:")
print(flavordb.isnull().sum() / len(flavordb) * 100, "\n")

print("Missing Values in Molecules Dataset:")
print(molecules.isnull().sum() / len(molecules) * 100, "\n")

Flavordb Dataset:
   entity id            alias                       synonyms scientific name  \
0          1  bakery products            {'bakery products'}        poacceae   
1          2            bread                      {'bread'}        poacceae   
2          3        rye bread                  {'rye bread'}             rye   
3          4    wheaten bread  {'soda farls', 'soda scones'}           wheat   
4          5      white bread                {'white bread'}           wheat   

  category                                          molecules  
0   bakery          {27457, 7976, 31252, 26808, 22201, 26331}  
1   bakery  {1031, 1032, 644104, 527, 8723, 31260, 15394, ...  
2   bakery  {644104, 7824, 643731, 8468, 1049, 5372954, 80...  
3   bakery  {5365891, 6915, 1146, 12170, 8082, 31251, 7958...  
4   bakery  {7361, 7362, 10883, 994, 11173, 5365891, 11559...   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 6 columns):
 #   Column

### Data 2: thecocktaildb
#### src: https://github.com/lauriharpf/thecocktaildb-downloader & https://www.thecocktaildb.com/

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Target websites
urls = [
    "https://www.winefolly.com/wine-pairing/",
    "https://www.liquor.com/food-pairings-5095199",
]

pairings = []

for url in urls:
    # A timeout keeps one unresponsive site from hanging the cell, and
    # raise_for_status() stops an error page from being parsed as if it were
    # real content. A failing site is reported and skipped.
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract relevant pairing data (adjust selectors based on website structure)
    for item in soup.find_all("div", class_="pairing-item"):
        heading = item.find("h3")
        paragraph = item.find("p")
        if heading is None or paragraph is None:
            # find() returns None when the tag is absent; skip incomplete items
            # instead of failing with AttributeError on `.text`.
            continue
        food = heading.text.strip()
        beverage = paragraph.text.strip()
        # NOTE(review): "Wine" is recorded for every URL, including the
        # liquor.com page -- confirm that this is intended.
        pairings.append({"Food Item": food, "Beverage Type": "Wine", "Beverage Name": beverage})

# Convert to DataFrame & Save. Explicit columns keep the CSV header present
# even when nothing matched the selectors.
df = pd.DataFrame(pairings, columns=["Food Item", "Beverage Type", "Beverage Name"])
df.to_csv("food_beverage_pairings.csv", index=False)
if df.empty:
    print("Warning: no pairing items were found; the selectors may not match these pages.")
print("Scraping completed. Data saved as 'food_beverage_pairings.csv'")

Scraping completed. Data saved as 'food_beverage_pairings.csv'


In [5]:
import praw
import pandas as pd

import os

# Reddit API credentials are read from the environment so that no secret is
# stored in the notebook. A missing variable raises KeyError immediately.
# NOTE(review): the client secret that used to be written here in plain text
# should be treated as exposed and regenerated.
REDDIT_CLIENT_ID = os.environ["REDDIT_CLIENT_ID"]
REDDIT_CLIENT_SECRET = os.environ["REDDIT_CLIENT_SECRET"]
REDDIT_USERNAME = os.environ["REDDIT_USERNAME"]
REDDIT_PASSWORD = os.environ["REDDIT_PASSWORD"]
USER_AGENT = "FoodBeveragePairingBot/1.0"

# Create the Reddit API client
reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    username=REDDIT_USERNAME,
    password=REDDIT_PASSWORD,
    user_agent=USER_AGENT
)

# NOTE(review): in the recorded run this message was printed and the
# OAuthException only appeared afterwards, so creating the client does not
# by itself prove that the credentials are valid.
print("✅ Reddit API Authentication Successful!")

# Fetch top food & beverage pairing mentions from Reddit
subreddits = ["wine", "cocktails", "foodpairings"]
pairing_phrases = ["pairs well with", "goes great with"]  # checked in this order
pairings = []

for sub in subreddits:
    for submission in reddit.subreddit(sub).top(limit=50):  # up to 50 top submissions
        text = submission.title + " " + submission.selftext

        # Extract simple pairing mentions based on common phrases. Neither
        # phrase contains a ".", so a phrase that occurs in the text always
        # occurs inside one of its sentences; no separate whole-text check
        # is needed.
        for sentence in text.split("."):
            phrase = next((p for p in pairing_phrases if p in sentence), None)
            if phrase is None:
                continue
            parts = sentence.split(phrase)
            if len(parts) == 2:  # phrase occurs exactly once in the sentence
                food, beverage = parts[0].strip(), parts[1].strip()
                pairings.append({"Food Item": food, "Beverage Name": beverage})

# Convert data to DataFrame & Save. Explicit columns keep the CSV header
# present even when no sentence matched.
df = pd.DataFrame(pairings, columns=["Food Item", "Beverage Name"])
df.to_csv("reddit_food_beverage_pairings.csv", index=False)
print("✅ Reddit data saved as 'reddit_food_beverage_pairings.csv'")

‚úÖ Reddit API Authentication Successful!


OAuthException: invalid_grant error processing request

In [10]:
import praw
import pandas as pd
import re
from collections import defaultdict

import os

# Create a Reddit client without username/password (read-only use).
# Credentials come from the environment so that no secret is stored in the
# notebook; a missing variable raises KeyError immediately.
# NOTE(review): the client secret that used to be written here in plain text
# should be treated as exposed and regenerated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="FoodPairingBot/1.0"
)

print("✅ Reddit API Authentication Successful!")

# Food & beverage keywords to look for
food_terms = [
    "steak", "sushi", "cheese", "pizza", "burger", "pasta", "tacos", "salmon", "lobster", "chocolate",
    "barbecue", "ribs", "chicken", "mushrooms", "lasagna", "shrimp", "duck", "fajitas", "mac and cheese"
]
drink_terms = [
    "wine", "beer", "whiskey", "cocktail", "champagne", "sake", "martini", "margarita", "red wine",
    "white wine", "rum", "vodka", "gin", "cider", "tequila", "mocktail"
]

# Compile one whole-word pattern per term once, instead of rebuilding the
# regex for every post and comment. re.escape() keeps a term that contains
# regex metacharacters from being read as a pattern.
food_patterns = {term: re.compile(rf"\b{re.escape(term)}\b") for term in food_terms}
drink_patterns = {term: re.compile(rf"\b{re.escape(term)}\b") for term in drink_terms}


def find_terms(text, patterns):
    """Return the terms whose whole-word pattern occurs in `text`."""
    return [term for term, pattern in patterns.items() if pattern.search(text)]


subreddits = ["Pairing"]
pairings = defaultdict(int)  # (food, drink) -> number of threads mentioning both

for sub in subreddits:
    print(f"Fetching posts & comments from r/{sub}...")
    try:
        for submission in reddit.subreddit(sub).hot(limit=500):  # up to 500 posts from the "hot" listing
            text = (submission.title + " " + submission.selftext).lower()

            # Extract food & beverage mentions from the post itself
            found_foods = find_terms(text, food_patterns)
            found_drinks = find_terms(text, drink_patterns)

            # Extract from comments.
            # NOTE(review): with limit=0 PRAW is documented to drop the
            # unexpanded "MoreComments" placeholders rather than fetch them,
            # so this does not load every comment -- confirm.
            submission.comments.replace_more(limit=0)
            for comment in submission.comments.list():
                comment_text = comment.body.lower()
                found_foods += find_terms(comment_text, food_patterns)
                found_drinks += find_terms(comment_text, drink_patterns)

            # Create food & beverage pairs; each pair counts once per thread
            for food in set(found_foods):
                for drink in set(found_drinks):
                    pairings[(food, drink)] += 1  # Count co-occurrences

    except Exception as e:
        # Deliberate best-effort: one failing subreddit is reported and the
        # remaining ones are still processed.
        print(f"⚠️ Skipping r/{sub} due to error: {e}")

# Convert to DataFrame & Save
df = pd.DataFrame([(food, drink, count) for (food, drink), count in pairings.items()],
                  columns=["Food Item", "Beverage Type", "Pairing Strength"])
df.to_csv("reddit_food_beverage_pairings.csv", index=False)
print("✅ Reddit data saved as 'reddit_food_beverage_pairings.csv' with", len(df), "entries.")

‚úÖ Reddit API Authentication Successful!
Fetching posts & comments from r/Pairing...
‚úÖ Reddit data saved as 'reddit_food_beverage_pairings.csv' with 1 entries.


I think this approach could work. Next steps:
1. Expand the food_terms list to cover all foods in the FlavorDB dataset.
2. Expand the drink_terms list to cover all drinks in thecocktaildb.
3. Search more subreddits, choosing more active ones.