In [36]:
# Libraries.
import os
import numpy
import kagglehub

try:
  from pyspark.sql import SparkSession, DataFrame, functions as f
except:
  !pip install pyspark
  from pyspark.sql import SparkSession, DataFrame, functions as f

In [17]:
# Fetch the dataset from Kaggle; the returned value is used below as a local directory path.
dataset_handle = "openfoodfacts/world-food-facts"
path = kagglehub.dataset_download(dataset_handle)

# When that directory holds exactly one entry, point `path` straight at the entry.
downloaded_entries = os.listdir(path)
if len(downloaded_entries) == 1:
  (only_entry,) = downloaded_entries
  path = os.path.join(path, only_entry)

In [20]:
# Start (or reuse) a Spark session and load the downloaded data into a DataFrame.
spark = SparkSession.builder.getOrCreate()

# Tab-separated input with a header row; column types are inferred by Spark.
tsv_reader = spark.read.options(header = True, inferSchema = True, sep = "\t")
food = tsv_reader.csv(path)

In [24]:
# See the first 10 entries.
food.limit(10).show()

+-------+--------------------+--------------------+----------+--------------------+---------------+----------------------+--------------------+------------+--------+---------+--------------+--------------------+--------------------+----------+---------------+-------------+-------+------------+--------------------+-------------------------+------+-----------+---------+---------+--------------+------------------------+------+-----------+---------------+------+---------+----------------+-------------+--------------------+---------+------------+------+-----------+---------+---------------+-------------+-----------+--------------------+--------------+------------------+---------------------------+-------------------------+------------------------------+---------------------------------------+-------------------------------------+------------------------------------------+------------------+------------------+-------------+-------------+--------------------+--------------------+-------------

In [27]:
# Report the size of the dataset: number of rows first, then number of columns.
row_total = food.count()
print(f"There are a total of {row_total} observations in the dataset.")

column_total = len(food.columns)
print(f"There are {column_total} columns in the dataset.")

There are a total of 356027 observations in the dataset.
There are 163 columns in the dataset.


In [30]:
# List every column name on its own line, underneath a heading line.
column_lines = [f" - {name}" for name in food.columns]
print("\n".join(["Column names: "] + column_lines))

Column names: 
 - code
 - url
 - creator
 - created_t
 - created_datetime
 - last_modified_t
 - last_modified_datetime
 - product_name
 - generic_name
 - quantity
 - packaging
 - packaging_tags
 - brands
 - brands_tags
 - categories
 - categories_tags
 - categories_en
 - origins
 - origins_tags
 - manufacturing_places
 - manufacturing_places_tags
 - labels
 - labels_tags
 - labels_en
 - emb_codes
 - emb_codes_tags
 - first_packaging_code_geo
 - cities
 - cities_tags
 - purchase_places
 - stores
 - countries
 - countries_tags
 - countries_en
 - ingredients_text
 - allergens
 - allergens_en
 - traces
 - traces_tags
 - traces_en
 - serving_size
 - no_nutriments
 - additives_n
 - additives
 - additives_tags
 - additives_en
 - ingredients_from_palm_oil_n
 - ingredients_from_palm_oil
 - ingredients_from_palm_oil_tags
 - ingredients_that_may_be_from_palm_oil_n
 - ingredients_that_may_be_from_palm_oil
 - ingredients_that_may_be_from_palm_oil_tags
 - nutrition_grade_uk
 - nutrition_grade_fr
 - 

In [37]:
# The questions count from 1, while Python indexing counts from 0.
column_index = 104  # 105th column
row_index = 18      # 19th observation

# Name of the 105th column.
column_name = food.columns[column_index]
print(f"The 105th column is named {column_name}.")

# Spark data type recorded in the schema for the 105th column.
column_type = food.schema[column_index].dataType
print(f"The type of the observations of the 105th column is {column_type}.")

# Product name of the 19th observation: pull only the first 19 product names
# to the driver and read the last of them.
leading_rows = food.select('product_name').limit(19).collect()
product_name = leading_rows[row_index][0]
print(f"The product name of the 19th observation is {product_name}.")

The 105th column is named -glucose_100g.
The type of the observations of the 105th column is DoubleType().
The product name of the 19th observation is Lotus Organic Brown Jasmine Rice.
