In [6]:
import pandas as pd
import os
import openpyxl

def read_text_file(file_path):
    """Load a tab-delimited, Latin-1 encoded text file into a DataFrame of strings.

    The first line supplies the column names; every following line becomes one
    row. Rows with fewer fields than the header are right-padded with empty
    strings. Surrounding whitespace is trimmed from each line, but leading tabs
    are kept as empty leading fields so that values stay in their own columns.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file.

    Returns
    -------
    pandas.DataFrame
        One column per header field, all values as ``str``. An empty file
        yields an empty DataFrame.

    Raises
    ------
    ValueError
        If a data line holds more fields than the header declares.
    """
    with open(file_path, 'r', encoding='latin-1') as file:
        lines = file.readlines()

    # An empty file has no header line to index; return an empty frame instead.
    if not lines:
        return pd.DataFrame()

    headers = lines[0].strip().split('\t')
    width = len(headers)

    data = []
    for line_number, line in enumerate(lines[1:], start=2):
        trimmed = line.rstrip()
        content = trimmed.lstrip()
        # A plain strip() would also swallow leading tabs and shift the whole
        # row one column to the left for each of them. Every tab removed by
        # lstrip() delimited an empty leading field, so put those fields back.
        leading_empty = trimmed[:len(trimmed) - len(content)].count('\t')
        fields = [''] * leading_empty + content.split('\t')

        if len(fields) > width:
            raise ValueError(
                f"{file_path}: line {line_number} has {len(fields)} fields, "
                f"but the header declares {width}"
            )
        if len(fields) < width:
            fields += [''] * (width - len(fields))
        data.append(fields)

    return pd.DataFrame(data, columns=headers)

def convert_to_set(langual_codes, delimiter=' '):
    """Split a delimited string of codes into a set of individual codes.

    Each token is stripped of surrounding whitespace and empty tokens are
    discarded, so an empty string, repeated delimiters, or a ", "-separated
    list never produce '' or whitespace-padded entries in the result.

    Parameters
    ----------
    langual_codes : object
        The delimited code string. Any non-string value (for example None or
        a float NaN from a blank spreadsheet cell) is treated as "no codes".
    delimiter : str, optional
        Separator between codes; defaults to a single space.

    Returns
    -------
    set of str
        The distinct codes; an empty set when there are none. (Return np.nan
        here instead if missing values should stay NaN in the output.)
    """
    if not isinstance(langual_codes, str):
        return set()

    tokens = (piece.strip() for piece in langual_codes.split(delimiter))
    return {token for token in tokens if token}

def read_excel_file(excel_path, sheet_name):
    """Read a single worksheet of an Excel workbook into a DataFrame.

    Parameters
    ----------
    excel_path : str or path-like
        Location of the workbook.
    sheet_name : str or int
        Worksheet to load, passed straight through to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The contents of the requested worksheet.
    """
    # Use the workbook as a context manager so its file handle is closed once
    # the sheet has been parsed, rather than being left open.
    with pd.ExcelFile(excel_path) as workbook:
        return pd.read_excel(workbook, sheet_name=sheet_name)

In [2]:
# Preprocess the NEVO database: parse the LanguaL codes, normalise the column names, and export to Excel
def preprocess_nevo(
    text_file_path='data/NL RIVM-NEVO 2008-05-22.txt',
    output_path="data/processed/preprocess/NL_RIVM-NEVO_2008-05-22.xlsx",
):
    """Convert the NEVO tab-delimited export into a cleaned Excel workbook.

    Steps:
      1. Read the text file with ``read_text_file``.
      2. Turn the space-separated ``LANGUALCODES`` string of each row into a
         set via ``convert_to_set``.
      3. Rename the upper-case source columns to the CamelCase names
         ``FoodName``, ``FoodID``, ``Remarks``, ``LangualCodes`` and
         ``OriginalFoodName``.
      4. Write the result to ``output_path`` without the index.

    Parameters
    ----------
    text_file_path : str, optional
        Path of the tab-delimited NEVO file to read.
    output_path : str, optional
        Path of the ``.xlsx`` file to write; its parent directory is created
        when it does not exist yet.

    Returns
    -------
    None
    """
    df_nevo_langal = read_text_file(text_file_path)

    df_nevo_langal['LANGUALCODES'] = df_nevo_langal['LANGUALCODES'].apply(convert_to_set)

    # No source name doubles as a target name, so one rename call is
    # equivalent to renaming the columns one at a time.
    df_nevo_langal = df_nevo_langal.rename(columns={
        'ENGFDNAM': 'FoodName',
        'FOODID': 'FoodID',
        'REMARKS': 'Remarks',
        'LANGUALCODES': 'LangualCodes',
        'ORIGFDNM': 'OriginalFoodName',
    })

    # to_excel does not create missing folders; make sure the target exists so
    # the step also works on a fresh checkout.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_nevo_langal.to_excel(output_path, index=False)

preprocess_nevo()

In [9]:
# Preprocess the Frida database: parse the LanguaL codes, normalise the column names, and export to Excel
def preprocess_frida(
    excel_file_path='data/raw/Frida_5.1_November_2023.xlsx',
    output_path="data/processed/preprocess/Frida_preprocess_5.1_November_2023.xlsx",
):
    """Convert the 'Food' sheet of the Frida workbook into a cleaned Excel file.

    Steps:
      1. Read the ``Food`` worksheet with ``read_excel_file``.
      2. Turn the comma-separated ``LangualCode`` string of each row into a
         set via ``convert_to_set``.
      3. Rename the Danish food-name column to ``OriginalFoodName`` and
         ``LangualCode`` to ``LangualCodes``.
      4. Write the result to ``output_path`` without the index.

    Parameters
    ----------
    excel_file_path : str, optional
        Path of the Frida workbook to read.
    output_path : str, optional
        Path of the ``.xlsx`` file to write; its parent directory is created
        when it does not exist yet.

    Returns
    -------
    None
    """
    df_frida_langal = read_excel_file(excel_file_path, 'Food')
    df_frida_langal['LangualCode'] = df_frida_langal['LangualCode'].apply(
        lambda codes: convert_to_set(codes, delimiter=',')
    )

    # The previous key, 'FÃ¸devareNavn', is 'FødevareNavn' with its UTF-8 bytes
    # mis-decoded as Latin-1. rename() ignores keys that are not present, so on
    # a correctly decoded header the column was silently left untouched.
    # Presumably the workbook header is 'FødevareNavn' (verify against the
    # file); both spellings are mapped so either one ends up renamed.
    df_frida_langal = df_frida_langal.rename(columns={
        'FødevareNavn': 'OriginalFoodName',
        'FÃ¸devareNavn': 'OriginalFoodName',
        'LangualCode': 'LangualCodes',
    })

    # to_excel does not create missing folders; make sure the target exists so
    # the step also works on a fresh checkout.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_frida_langal.to_excel(output_path, index=False)

preprocess_frida()