In [126]:
# Suppress dataframe concat warning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import yaml
import os
import numpy as np

class DataFolder:
    '''
    Load a YOLO-format dataset folder into per-image feature tables.

    The folder is expected to contain ``data.yaml`` plus ``train``, ``valid``
    and ``test`` sub-folders, each with a ``labels`` directory of ``.txt``
    label files. Class names come from ``data.yaml``; label groups and prices
    come from two Excel sheets.

    After construction, ``train_data``, ``valid_data`` and ``test_data`` hold
    one row per label file (see ``_load_data`` for the column layout).
    '''

    # Value used for both "price" and "fair price" when an image has no row in
    # the price table. NOTE(review): 69 is carried over from the original
    # fallback; the rationale for this number is not visible here.
    DEFAULT_PRICE = 69

    # Fields of one line in a YOLO label file, in file order.
    LABEL_COLUMNS = ["class_id", "x_center", "y_center", "width", "height"]

    def __init__(self,
                 folder_path,
                 labels_path="./標籤集合.xlsx",
                 prices_path="./圖片的價格.xlsx"):
        '''
        folder_path: yolov7pytorch folder
        labels_path: Excel sheet listing the labels of each group
        prices_path: Excel sheet with "filename", "price" and "fair price"
        '''
        self.folder_path = folder_path
        self.labels = pd.DataFrame()
        self.prices = pd.DataFrame()
        self.side_dishes = []
        self.rices = []
        self.containers = []
        self.main_dishes = []
        self.class_id_to_name = {}

        self.train_data = pd.DataFrame()
        self.valid_data = pd.DataFrame()
        self.test_data = pd.DataFrame()

        self.load_labels(labels_path)
        self.load_prices(prices_path)
        self.load_class_id_to_name(folder_path + "/data.yaml")
        self.load_train(folder_path + "/train")
        self.load_valid(folder_path + "/valid")
        self.load_test(folder_path + "/test")

    def load_labels(self, path):
        '''
        Read the label sheet and fill the four label-group lists.

        Only ``.xlsx`` is supported; any other extension prints a message and
        leaves the current state untouched.
        '''
        if not path.endswith('.xlsx'):
            print('File format not supported')
            return
        self.labels = pd.read_excel(path)

        self.side_dishes = self._english_labels("副菜")
        self.rices = self._english_labels("飯")
        self.containers = self._english_labels("容器")
        self.main_dishes = self._english_labels("主菜")

    def _english_labels(self, column):
        '''
        Return the unique English label names found in one sheet column.

        Everything from the first "(" onwards is dropped and the remainder is
        stripped. Without the strip, an entry written as "name (...)" yields
        "name " with a trailing space, which can never match the class name
        used in the label files.
        '''
        raw = self.labels[column].dropna().unique().tolist()
        cleaned = [item.split('(')[0].strip() for item in raw]
        # Stripping may make two entries identical; keep the first occurrence.
        return list(dict.fromkeys(cleaned))

    def load_prices(self, path):
        '''Read the price sheet (``.xlsx`` only) into ``self.prices``.'''
        if path.endswith('.xlsx'):
            self.prices = pd.read_excel(path)
        else:
            print('File format not supported')

    def load_class_id_to_name(self, path):
        '''Load the class id -> class name mapping from a ``data.yaml`` file.'''
        self._load_yaml(path)

    def load_train(self, path):
        '''Build ``self.train_data`` from the label files under ``path``.'''
        self.train_data = self._load_data(path)

    def load_valid(self, path):
        '''Build ``self.valid_data`` from the label files under ``path``.'''
        self.valid_data = self._load_data(path)

    def load_test(self, path):
        '''Build ``self.test_data`` from the label files under ``path``.'''
        self.test_data = self._load_data(path)

    def _load_data(self, path):
        '''
        Build one feature row per label file found in ``path``/labels.

        dataframe columns:
            * one column per known label (side dishes, rices, containers,
              main dishes, in that order) followed by any other class name
              met in the label files. The value is the summed box area of
              that class divided by the summed container area, or by the
              summed area of all boxes when the image has no container.
            * side_dishes_n: number of side dish boxes
            * price: price from the price sheet, replaced by the fair price
              when missing
            * fair price: fair price from the price sheet

        The row index is the file name up to "_jpg". Missing values are 0.
        Images without an entry in the price sheet are reported and get
        ``DEFAULT_PRICE`` for both price columns.
        '''
        label_dir = os.path.join(path, "labels")
        rows = []
        image_names = []
        containerless_n = 0
        priceless_n = 0

        # For each image label file
        for filename in os.listdir(label_dir):
            if not filename.endswith(".txt"):
                print(f"{filename} is not a label file.")
                continue

            df = self._read_label_file(os.path.join(label_dir, filename))
            df["class_name"] = [self.class_id_to_name[class_id]
                                for class_id in df["class_id"]]
            df["area"] = self._cal_area(df["width"], df["height"])

            # Normalise by the container area when there is one,
            # otherwise by the area of everything that was detected.
            container_area = df.loc[df["class_name"].isin(self.containers), "area"].sum()
            if container_area != 0:
                df["area_ratio"] = df["area"] / container_area
            else:
                df["area_ratio"] = df["area"] / df["area"].sum()
                containerless_n += 1

            image_name = filename.split("_jpg")[0]
            found = self._lookup_prices(image_name)
            if found is None:
                # Unknown image: report it and fall back for BOTH targets so
                # no value leaks in from the previously processed file.
                print(filename)
                price = fair_price = self.DEFAULT_PRICE
            else:
                price, fair_price = found

            # If the price is not available, replace it with the fair price
            if pd.isna(price):
                price = fair_price
                priceless_n += 1

            # Sum of the area ratio of each class
            row = {}
            for class_name, ratio in zip(df["class_name"], df["area_ratio"]):
                row[class_name] = row.get(class_name, 0) + ratio
            row["side_dishes_n"] = int(df["class_name"].isin(self.side_dishes).sum())
            row["price"] = price
            row["fair price"] = fair_price

            rows.append(row)
            image_names.append(image_name)

        # Known labels first, then every other key in order of first
        # appearance; build the frame once instead of concatenating per file.
        known = self.side_dishes + self.rices + self.containers + self.main_dishes
        columns = list(dict.fromkeys(known + [key for row in rows for key in row]))
        data = pd.DataFrame(rows, index=image_names, columns=columns).fillna(0)

        print(f"There are {len(data)} image label files in the dataset {path}.")
        print(f"There are {containerless_n} no container in the dataset.")
        print(f"There are {priceless_n} no price in the dataset.")

        return data

    def _read_label_file(self, file_path):
        '''
        Read one space-separated label file into a dataframe.

        A file without any line gives an empty dataframe with the same
        columns instead of raising.
        '''
        try:
            return pd.read_csv(file_path,
                               sep=" ",
                               header=None,
                               names=self.LABEL_COLUMNS)
        except pd.errors.EmptyDataError:
            return pd.DataFrame(columns=self.LABEL_COLUMNS, dtype=float)

    def _lookup_prices(self, image_name):
        '''
        Return ``(price, fair_price)`` of the first price row whose
        "filename" equals ``image_name``, or None when there is no such row
        or the price table lacks the needed columns.
        '''
        try:
            match = self.prices[self.prices["filename"] == image_name]
            return match["price"].values[0], match["fair price"].values[0]
        except (KeyError, IndexError):
            return None

    def _load_yaml(self, path):
        '''
        Read ``nc`` and ``names`` from a YAML file and store ``names`` in
        ``self.class_id_to_name``. A YAML syntax error is printed and the
        current mapping is kept.
        '''
        with open(path, 'r', encoding='utf-8') as f:
            try:
                data = yaml.safe_load(f)
            except yaml.YAMLError as exc:
                print(exc)
                return
        print(f"There are {data['nc']} classes in the dataset from yaml.")
        self.class_id_to_name = data["names"]

    def _cal_area(self, width, height):
        '''Return width * height; works on scalars and on pandas Series.'''
        return width * height

In [127]:
# Apply linear regression
from sklearn.linear_model import LinearRegression

def MAPE(y_true, y_pred):
    '''
    Mean absolute percentage error of y_pred against y_true, in percent.

    Each error is divided by the matching true value before averaging.
    '''
    relative_error = (y_true - y_pred) / y_true
    absolute_error = np.abs(relative_error)
    return np.mean(absolute_error) * 100

def Pipeline(datafolder):
    '''
    Fit a linear regression on the train split and print the MAPE on the
    valid and test splits, first with "price" as target, then with
    "fair price".

    Both target columns are always removed from the features. The same model
    object is refitted for the second target.
    '''
    target_columns = ["price", "fair price"]
    runs = (
        ("price", "Train with price"),
        ("fair price", "Train with fair price"),
    )
    evaluations = (
        ("MAPE with valid data:", datafolder.valid_data),
        ("MAPE with test data:", datafolder.test_data),
    )

    model = LinearRegression()
    for target, banner in runs:
        features = datafolder.train_data.drop(target_columns, axis=1)
        model.fit(features, datafolder.train_data[target])
        print(banner)
        for label, split in evaluations:
            y_hat = model.predict(split.drop(target_columns, axis=1))
            print(label, MAPE(split[target], y_hat))

In [128]:
# Load labels, prices, class names and the train/valid/test tables from the
# "yolov7pytorch" folder (all of this happens inside DataFolder.__init__).
# NOTE(review): the recorded output below names "final project.v9i..." and
# "final project.v6i..." folders, so it looks like it was captured from an
# earlier version of this cell - confirm before relying on those numbers.
datafolder = DataFolder("yolov7pytorch")

There are 49 classes in the dataset from yaml.
There are 747 image label files in the dataset final project.v9i.yolov7pytorch/train.
There are 3 no container in the dataset.
There are 66 no price in the dataset.
There are 135 image label files in the dataset final project.v9i.yolov7pytorch/valid.
There are 4 no container in the dataset.
There are 11 no price in the dataset.
There are 62 image label files in the dataset final project.v9i.yolov7pytorch/test.
There are 0 no container in the dataset.
There are 4 no price in the dataset.
There are 43 classes in the dataset from yaml.
There are 417 image label files in the dataset final project.v6i.yolov7pytorch/train.
There are 12 no container in the dataset.
There are 66 no price in the dataset.
There are 76 image label files in the dataset final project.v6i.yolov7pytorch/valid.
There are 4 no container in the dataset.
There are 11 no price in the dataset.
There are 34 image label files in the dataset final project.v6i.yolov7pytorch/test.


In [129]:
# Fit the linear regression on the train table and print the MAPE on the
# valid and test tables, for the "price" and then the "fair price" target.
# NOTE(review): the recorded output below contains "path:" lines and prints
# "MAPE with valid data" twice per run, which the current Pipeline does not
# do - presumably it was captured from an earlier version; confirm.
Pipeline(datafolder=datafolder)

path: final project.v9i.yolov7pytorch
Train with price
MAPE with valid data: 11.2424969270028
MAPE with test data: 9.469748420855177
Train with fair price
MAPE with valid data: 9.867784844619152
MAPE with valid data: 8.814267335240283
path: final project.v6i.yolov7pytorch
Train with price
MAPE with valid data: 13.733312420509957
MAPE with test data: 12.591195012298332
Train with fair price
MAPE with valid data: 13.714737511716182
MAPE with valid data: 11.319105421358
path: final project.v1i.yolov7pytorch
Train with price
MAPE with valid data: 21.505468170777274
MAPE with test data: 19.218485666457102
Train with fair price
MAPE with valid data: 18.3242191638293
MAPE with valid data: 21.661201049128923


In [130]:
# Same experiment on the "decrease_label_version" folder.
datafolderv92 = DataFolder("final project.v9i.yolov7pytorch_decrease_label_version")
# Show the feature/target column names of the train table
# (display is provided by the notebook environment).
display(datafolderv92.train_data.columns)
# Header line for the results printed by Pipeline.
# NOTE(review): the printed label ends in "XXX" and is not the folder name
# loaded above - presumably a placeholder; confirm.
print("path:", "final project.v9i.yolov7pytorchXXX")
Pipeline(datafolder=datafolderv92)

There are 9 classes in the dataset from yaml.
There are 747 image label files in the dataset final project.v9i.yolov7pytorch_decrease_label_version/train.
There are 3 no container in the dataset.
There are 66 no price in the dataset.
There are 135 image label files in the dataset final project.v9i.yolov7pytorch_decrease_label_version/valid.
There are 4 no container in the dataset.
There are 11 no price in the dataset.
There are 62 image label files in the dataset final project.v9i.yolov7pytorch_decrease_label_version/test.
There are 0 no container in the dataset.
There are 4 no price in the dataset.


Index(['side dish', 'white rice', 'purple rice', 'brown rice', 'plate', 'box',
       'grilled mackerel', 'salmon', 'braised fish fillet',
       'steamed fish fillet', 'sweet and spicy fried pork chops',
       'cuttlefish steak', 'fried cod steak', 'sweet and spicy chicken steak',
       'chicken steak', 'fried pork chops', 'braised pork ribs',
       'grilled chicken leg steak', 'honey grilled chicken leg',
       'grilled chicken steak', 'big fried chicken leg',
       'small fried chicken leg', 'big grilled chicken leg',
       'small grilled chicken leg', 'stir-fried minced pork',
       'pork with scrambled eggs', 'sauteed pork ', 'garlic white meat',
       'braised pork', 'curry chicken', 'spicy chicken', 'three cup chicken',
       'scallion chicken', 'sausage', 'kara chicken leg steak',
       'fried spanish mackerel', 'fried chicken leg steak', 'lion head',
       'big herbal chicken leg', 'fried fish fillet', 'braised fish',
       'small herbal chicken leg', 'pig ear', 'f

path: final project.v9i.yolov7pytorchXXX
Train with price
MAPE with valid data: 10.048689249234325
MAPE with test data: 10.110272463783776
Train with fair price
MAPE with valid data: 9.35709567795009
MAPE with valid data: 9.160213777207467
