In [31]:
# Suppress dataframe concat warning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import yaml
import os
import numpy as np

class DataFolder():
    '''
    Tabular view of a YOLO-format dataset folder (train / valid / test splits).

    On construction the label spreadsheet, the price spreadsheet, the
    ``data.yaml`` class list and the three split folders are loaded, in that
    order. Each split becomes a DataFrame with one row per label file.
    '''

    # Value used for BOTH price and fair price when an image has no usable
    # record in the price table, so that "diff" is a well-defined 0 instead of
    # depending on whichever image happened to be processed before it.
    FALLBACK_PRICE = 69

    def __init__(self, 
                 folder_path, 
                 labels_path="./標籤集合.xlsx", 
                 prices_path="./圖片的價格.xlsx"):
        '''
        folder_path: yolov7pytorch folder (must contain data.yaml, train/,
                     valid/ and test/, each split with a labels/ sub-folder)
        labels_path: .xlsx file with the columns "副菜", "飯", "容器", "主菜"
        prices_path: .xlsx file with the columns "filename", "price",
                     "fair price"
        '''
        self.folder_path = folder_path
        # Raw spreadsheets.
        self.labels = pd.DataFrame()
        self.prices = pd.DataFrame()
        # English class names per category, filled by load_labels().
        self.side_dishes = []
        self.rices = []
        self.containers = []
        self.main_dishes = []
        # Whatever data.yaml stores under "names"; indexed by class id.
        self.class_id_to_name = {}

        self.train_data = pd.DataFrame()
        self.valid_data = pd.DataFrame()
        self.test_data = pd.DataFrame()

        self.load_labels(labels_path)
        self.load_prices(prices_path)
        self.load_class_id_to_name(folder_path + "/data.yaml")
        self.load_train(folder_path + "/train")
        self.load_valid(folder_path + "/valid")
        self.load_test(folder_path + "/test")

    @staticmethod
    def _english_name(label):
        '''Return the part of a spreadsheet label before its first "(".'''
        return label.split('(')[0]

    def load_labels(self, path):
        '''
        Read the label spreadsheet and fill the four category name lists.

        Only .xlsx is supported; for any other extension a message is printed
        and nothing is loaded.
        '''
        if not path.endswith('.xlsx'):
            print('File format not supported')
            return
        self.labels = pd.read_excel(path)

        # Get unique raw labels per category
        side_dishes = self.labels["副菜"].dropna().unique().tolist()
        rices = self.labels["飯"].dropna().unique().tolist()
        containers = self.labels["容器"].dropna().unique().tolist()
        main_dishes = self.labels["主菜"].dropna().unique().tolist()

        # Make a dict of class name -> price. The price is the integer found
        # between the first ") " and the following "(" of the raw label.
        # Containers are not included.
        label_prices = {}
        for item in side_dishes + rices + main_dishes:
            label_prices[self._english_name(item)] = int(item.split(') ')[1].split('(')[0])
        print(label_prices)

        # Keep only the english labels.
        # NOTE: classes that occur in label files but not in this spreadsheet
        # still get a column in the loaded data; _load_data appends them after
        # the known ones.
        self.side_dishes = [self._english_name(item) for item in side_dishes]
        self.rices = [self._english_name(item) for item in rices]
        self.containers = [self._english_name(item) for item in containers]
        self.main_dishes = [self._english_name(item) for item in main_dishes]

    def load_prices(self, path):
        '''Read the price spreadsheet (.xlsx only; otherwise print a message).'''
        if path.endswith('.xlsx'):
            self.prices = pd.read_excel(path)
        else:
            print('File format not supported')

    def load_class_id_to_name(self, path):
        '''Load the class id -> class name lookup from a data.yaml file.'''
        self._load_yaml(path)

    def load_train(self, path):
        '''Load the training split found under `path`.'''
        self.train_data = self._load_data(path)

    def load_valid(self, path):
        '''Load the validation split found under `path`.'''
        self.valid_data = self._load_data(path)

    def load_test(self, path):
        '''Load the test split found under `path`.'''
        self.test_data = self._load_data(path)

    def _lookup_prices(self, image_name):
        '''
        Return (price, fair price) of the first price-table row whose
        "filename" equals image_name, or None if there is no such row or the
        price table lacks the needed columns.
        '''
        try:
            record = self.prices[self.prices["filename"] == image_name]
            return record["price"].values[0], record["fair price"].values[0]
        except (KeyError, IndexError):
            # KeyError: column missing (e.g. price table never loaded).
            # IndexError: no row for this image.
            return None

    def _load_data(self, path):
        '''
        Build one row per label file in `path`/labels.

        The row index is the file name up to "_jpg".

        dataframe columns:
            * one column per class name (side dishes, rices, containers,
              main dishes, then any class not listed in the label
              spreadsheet): sum of the area ratios of that class's boxes,
              0 if the class is absent. The ratio is box area / total
              container area, or box area / total box area when the image
              has no container box.
            * side_dishes_n: number of side dish boxes
            * price: price from the price table; replaced by the fair price
              when missing (NaN)
            * fair price: fair price from the price table
            * diff: price - fair price

        Images without a usable price record get FALLBACK_PRICE for both
        price and fair price, and their label file name is printed.
        '''
        col = self.side_dishes + self.rices + self.containers + self.main_dishes
        rows = []
        row_names = []
        containerless_n = 0
        priceless_n = 0

        # For each image label file
        for filename in os.listdir(path + "/labels"):
            if not filename.endswith(".txt"):
                print(f"{filename} is not a label file.")
                continue

            image_name = filename.split("_jpg")[0]

            df = pd.read_csv(path + "/labels/" + filename, 
                             sep=" ", 
                             header=None,
                             names=["class_id", "x_center", "y_center", "width", "height"])
            df["class_name"] = df["class_id"].apply(lambda x: self.class_id_to_name[x])
            df["area"] = self._cal_area(df["width"], df["height"])

            # If has container, calculate the ratio against the container area
            container_area = df.loc[df["class_name"].isin(self.containers), "area"].sum()
            if container_area != 0:
                df["area_ratio"] = df["area"] / container_area
            # Else calculate the ratio against the area of all boxes
            else:
                df["area_ratio"] = df["area"] / df["area"].sum()
                containerless_n += 1

            # In case of a bad name: fall back to a fixed, self-consistent pair
            record = self._lookup_prices(image_name)
            if record is None:
                print(filename)
                price = fair_price = self.FALLBACK_PRICE
            else:
                price, fair_price = record

            # If the price is not available, replace it with the fair price
            if pd.isna(price):
                price = fair_price
                priceless_n += 1

            # Make a new row: sum of the area ratio of each class
            row = {}
            for class_name, area_ratio in zip(df["class_name"], df["area_ratio"]):
                row[class_name] = row.get(class_name, 0) + area_ratio
            row["side_dishes_n"] = int(df["class_name"].isin(self.side_dishes).sum())
            row["price"] = price
            row["fair price"] = fair_price
            row["diff"] = price - fair_price

            rows.append(row)
            row_names.append(image_name)

        if rows:
            # Known class columns first, then every other key in order of
            # first appearance (unknown classes and the bookkeeping columns).
            known = set(col)
            extra_cols = []
            for row in rows:
                for key in row:
                    if key not in known:
                        known.add(key)
                        extra_cols.append(key)
            # Build the frame once instead of concatenating per file;
            # classes absent from an image are filled with 0.
            data = pd.DataFrame(rows, index=row_names, columns=col + extra_cols).fillna(0)
        else:
            data = pd.DataFrame(columns=col)

        print(f"There are {len(data)} image label files in the dataset {path}.")
        print(f"There are {containerless_n} no container in the dataset.")
        print(f"There are {priceless_n} no price in the dataset.")

        return data

    def _load_yaml(self, path):
        '''
        Read data.yaml, print its class count ("nc") and store its "names"
        entry as the class id lookup. A YAML parse error is printed and the
        lookup is left unchanged.
        '''
        with open(path, 'r', encoding="utf-8") as f:
            try:
                data = yaml.safe_load(f)
                print(f"There are {data['nc']} classes in the dataset from yaml.")
                self.class_id_to_name = data["names"]
            except yaml.YAMLError as exc:
                print(exc)

    def _cal_area(self, width, height):
        '''Return width * height (works for scalars and for pandas Series).'''
        return width * height

In [32]:
# Build the dataset from the "yolov7pytorch" export folder using the default
# label and price spreadsheets; this loads the train, valid and test splits
# and prints the parsed label prices plus per-split statistics.
datafolder = DataFolder("yolov7pytorch")

{'side dish': 10, 'white rice': 20, 'purple rice': 20, 'brown rice': 20, 'grilled mackerel': 30, 'salmon': 40, 'braised fish fillet': 30, 'steamed fish fillet': 30, 'sweet and spicy fried pork chops': 25, 'cuttlefish steak': 25, 'fried cod steak': 25, 'sweet and spicy chicken steak': 25, 'chicken steak': 25, 'fried pork chops': 25, 'braised pork ribs': 25, 'grilled chicken leg steak': 25, 'honey grilled chicken leg': 25, 'grilled chicken steak': 25, 'big fried chicken leg': 40, 'small fried chicken leg': 25, 'big grilled chicken leg': 40, 'small grilled chicken leg': 25, 'stir-fried minced pork': 25, 'pork with scrambled eggs': 25, 'sauteed pork': 25, 'garlic white meat': 25, 'braised pork': 25, 'curry chicken': 25, 'spicy chicken': 25, 'three cup chicken': 25, 'scallion chicken': 25, 'sausage': 25, 'kara chicken leg steak': 25, 'fried spanish mackerel': 25, 'fried chicken leg steak': 25, 'lion head': 25, 'big herbal chicken leg': 40, 'fried fish fillet': 30, 'braised fish': 30, 'small

In [33]:
# Apply linear regression
from sklearn.linear_model import LinearRegression

def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error of y_pred against y_true.

    The result is expressed in percent (multiplied by 100). Each error is
    taken relative to the corresponding value of y_true.
    """
    relative_error = (y_true - y_pred) / y_true
    absolute_error = np.abs(relative_error)
    return np.mean(absolute_error) * 100

model = LinearRegression()

# Columns that hold targets / bookkeeping values rather than model inputs.
NON_FEATURE_COLUMNS = ["price", "fair price", "diff"]
# Folder that receives the per-split prediction CSV files.
OUTPUT_DIR = "./output"

# Train with diff: the model learns (price - fair price) from the features.
train_features = datafolder.train_data.drop(NON_FEATURE_COLUMNS, axis=1)
model.fit(train_features, datafolder.train_data["diff"])
print("Train with diff--------------")

# DataFrame.to_csv does not create missing folders, so make sure it exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _evaluate_split(split_name, split_data):
    """Print the price MAPE for one split and export its predictions.

    split_name: "valid" or "test"; used in the printed message and in the
                CSV file name.
    split_data: DataFrame of that split (features plus NON_FEATURE_COLUMNS).

    Writes OUTPUT_DIR/1130_<split_name>_data.csv with the columns
    [filename, price, fair price, diff, y_hat(fair price+diff), rounded_5].
    """
    # Use exactly the training columns, in the training order; a class column
    # that this split never produced is filled with 0.
    features = split_data.drop(NON_FEATURE_COLUMNS, axis=1).reindex(
        columns=train_features.columns, fill_value=0)

    # The model predicts the diff; add back the fair price to get a price.
    y_hat = model.predict(features) + split_data["fair price"].to_numpy(dtype=float)
    print(f"MAPE with {split_name} data:", MAPE(split_data["price"], y_hat))

    output = pd.DataFrame(index=split_data.index)
    output["filename"] = split_data.index
    output["price"] = split_data["price"]
    output["fair price"] = split_data["fair price"]
    output["diff"] = split_data["diff"]
    output["y_hat(fair price+diff)"] = y_hat
    # Predicted price rounded to the nearest multiple of 5.
    output["rounded_5"] = output["y_hat(fair price+diff)"].apply(lambda x: round(x/5)*5)
    output.to_csv(f"{OUTPUT_DIR}/1130_{split_name}_data.csv", index=False)


# MAPE with valid_data, then with test_data
_evaluate_split("valid", datafolder.valid_data)
_evaluate_split("test", datafolder.test_data)


Train with diff--------------
MAPE with valid data: 7.593748510357239
MAPE with test data: 6.96816418399699
