In [152]:
# Suppress dataframe concat warning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import yaml
import os
import numpy as np

class DataFolder():
    """Load a YOLOv7-format dataset folder into per-split feature tables.

    On construction the object reads the label spreadsheet, the price
    spreadsheet, ``data.yaml`` and the ``train`` / ``valid`` / ``test``
    sub-folders of ``folder_path``.  Each split becomes a DataFrame with one
    row per label file (see ``_load_data``).
    """

    # Price (and fair price) assigned to an image that has no usable row in
    # the price table, so the sample is kept with a ``diff`` of 0.
    DEFAULT_PRICE = 69

    def __init__(self,
                 folder_path,
                 labels_path="./標籤集合.xlsx",
                 prices_path="./圖片的價格.xlsx"):
        """Load every resource of the dataset.

        Args:
            folder_path: yolov7pytorch folder containing ``data.yaml`` and the
                ``train``, ``valid`` and ``test`` sub-folders.
            labels_path: Excel file listing the labels of every category.
            prices_path: Excel file with the price of every image.
        """
        self.folder_path = folder_path
        self.labels = pd.DataFrame()
        self.prices = pd.DataFrame()
        self.side_dishes = []
        self.rices = []
        self.containers = []
        self.main_dishes = []
        self.class_id_to_name = {}

        self.train_data = pd.DataFrame()
        self.valid_data = pd.DataFrame()
        self.test_data = pd.DataFrame()

        self.load_labels(labels_path)
        self.load_prices(prices_path)
        self.load_class_id_to_name(folder_path + "/data.yaml")
        self.load_train(folder_path + "/train")
        self.load_valid(folder_path + "/valid")
        self.load_test(folder_path + "/test")

    def load_labels(self, path):
        """Read the label spreadsheet and fill the four category lists.

        Only ``.xlsx`` files are read; any other extension prints a message
        and leaves the category lists untouched.
        """
        if path.endswith('.xlsx'):
            self.labels = pd.read_excel(path)
        else:
            print('File format not supported')
            return

        # Get unique labels
        self.side_dishes = self.labels["副菜"].dropna().unique().tolist()
        self.rices = self.labels["飯"].dropna().unique().tolist()
        self.containers = self.labels["容器"].dropna().unique().tolist()
        self.main_dishes = self.labels["主菜"].dropna().unique().tolist()
        # NOTE: labels missing from the spreadsheet (e.g. "sauteed pork") used
        # to be appended to the main dishes manually at this point.

        # Map each English class name to the price embedded in its raw label.
        # The parsing takes the text before the first "(" as the name and the
        # integer between ") " and the next "(" as the price.
        dic = {}
        for item in self.side_dishes + self.rices + self.main_dishes:
            dic[item.split('(')[0]] = int(item.split(') ')[1].split('(')[0])
        print(dic)

        # Leave english labels
        self.side_dishes = [item.split('(')[0] for item in self.side_dishes]
        self.rices = [item.split('(')[0] for item in self.rices]
        self.containers = [item.split('(')[0] for item in self.containers]
        self.main_dishes = [item.split('(')[0] for item in self.main_dishes]

    def load_prices(self, path):
        """Read the price spreadsheet (``.xlsx`` only) into ``self.prices``."""
        if path.endswith('.xlsx'):
            self.prices = pd.read_excel(path)
        else:
            print('File format not supported')

    def load_class_id_to_name(self, path):
        """Load the class-id to class-name mapping from ``data.yaml``."""
        self._load_yaml(path)

    def load_train(self, path):
        """Build the training table from ``path``."""
        self.train_data = self._load_data(path)

    def load_valid(self, path):
        """Build the validation table from ``path``."""
        self.valid_data = self._load_data(path)

    def load_test(self, path):
        """Build the test table from ``path``."""
        self.test_data = self._load_data(path)

    def _load_data(self, path):
        '''
        Build one feature row per label file found in ``path + "/labels"``.

        dataframe columns:
            * one column per known class name (side dishes, rices, containers,
              main dishes): sum of the area ratio of that class in the image,
              0 when the class is absent
            * side_dishes_n: number of side dishes
            * price: price of the image (fair price when the price is missing)
            * fair price: fair price of the image
            * diff: price - fair price

        Area ratios are relative to the total container area when the image
        has a container, otherwise relative to the total area of all boxes.
        The row index is the part of the file name before "_jpg".

        Images without a usable row in the price table get ``DEFAULT_PRICE``
        for both price and fair price, and their file name is printed.
        Label files without any annotation are skipped.
        '''
        # dict.fromkeys removes duplicated names while keeping their order.
        col = list(dict.fromkeys(
            self.side_dishes + self.rices + self.containers + self.main_dishes))
        labels_dir = os.path.join(path, "labels")
        label_columns = ["class_id", "x_center", "y_center", "width", "height"]
        rows = []
        index = []
        containerless_n = 0
        priceless_n = 0

        # For each image label file (sorted so the row order is reproducible)
        for filename in sorted(os.listdir(labels_dir)):
            if not filename.endswith(".txt"):
                print(f"{filename} is not a label file.")
                continue

            try:
                df = pd.read_csv(os.path.join(labels_dir, filename),
                                 sep=" ",
                                 header=None,
                                 names=label_columns)
            except pd.errors.EmptyDataError:
                # An image without annotations has an empty label file.
                print(f"{filename} has no annotations, skipped.")
                continue

            df["class_name"] = [self.class_id_to_name[class_id]
                                for class_id in df["class_id"]]
            df["area"] = self._cal_area(df["width"], df["height"])

            # If has container, calculate the ratio against the container area
            container_area = df.loc[df["class_name"].isin(self.containers), "area"].sum()
            if container_area != 0:
                df["area_ratio"] = df["area"] / container_area
            # Else calculate the ratio of all
            else:
                df["area_ratio"] = df["area"] / df["area"].sum()
                containerless_n += 1

            image_name = filename.split("_jpg")[0]
            try:
                matches = self.prices[self.prices["filename"] == image_name]
                price = matches["price"].values[0]
                fair_price = matches["fair price"].values[0]
            except (KeyError, IndexError):
                # No price row for this image (bad name) or no price table:
                # define BOTH values so nothing leaks from the previous image.
                print(filename)
                price = fair_price = self.DEFAULT_PRICE

            # If the price is not available, replace it with the fair price
            if pd.isna(price):
                price = fair_price
                priceless_n += 1

            # Make a new row: sum of the area ratio of each class
            row = {}
            for class_name, ratio in zip(df["class_name"], df["area_ratio"]):
                row[class_name] = row.get(class_name, 0) + ratio
            row["side_dishes_n"] = int(df["class_name"].isin(self.side_dishes).sum())
            row["price"] = price
            row["fair price"] = fair_price
            row["diff"] = price - fair_price

            rows.append(row)
            index.append(image_name)

        if rows:
            # Build the table once: known classes first, then any other column
            # in order of first appearance; absent classes are filled with 0.
            data = pd.DataFrame(rows, index=index)
            extra = [name for name in data.columns if name not in col]
            data = data.reindex(columns=col + extra).fillna(0)
        else:
            data = pd.DataFrame(columns=col)

        print(f"There are {len(data)} image label files in the dataset {path}.")
        print(f"There are {containerless_n} no container in the dataset.")
        print(f"There are {priceless_n} no price in the dataset.")

        return data

    def _load_yaml(self, path):
        """Read ``names`` from the dataset YAML into ``self.class_id_to_name``.

        A YAML syntax error is printed and leaves the mapping unchanged.
        """
        with open(path, 'r', encoding="utf-8") as f:
            try:
                data = yaml.safe_load(f)
                print(f"There are {data['nc']} classes in the dataset from yaml.")
                self.class_id_to_name = data["names"]
            except yaml.YAMLError as exc:
                print(exc)

    def _cal_area(self, width, height):
        """Return ``width * height`` (works on scalars and on Series)."""
        return width * height

In [153]:
# Apply linear regression
from sklearn.linear_model import LinearRegression

def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Computes ``mean(|(y_true - y_pred) / y_true|) * 100``.  There is no guard
    against zeros in ``y_true``.
    """
    relative_error = (y_true - y_pred) / y_true
    absolute_error = np.abs(relative_error)
    return np.mean(absolute_error) * 100

def Pipeline(datafolder):
    """Fit a linear regression on the training split and report MAPE.

    The same model object is fitted twice: first on the ``price`` column,
    then on the ``diff`` column.  After each fit the MAPE against the true
    price is printed for the validation and the test split; for the ``diff``
    fit the ``fair price`` column is added back to the prediction first.

    Args:
        datafolder: object exposing ``train_data``, ``valid_data`` and
            ``test_data`` tables that contain the ``price``, ``fair price``
            and ``diff`` columns.

    Returns:
        The model as left by the last fit, i.e. the one trained on ``diff``.
    """
    target_columns = ["price", "fair price", "diff"]

    def features(frame):
        # Everything except the three price-related columns is a feature.
        return frame.drop(target_columns, axis=1)

    train = datafolder.train_data
    eval_sets = (
        ("MAPE with valid data:", datafolder.valid_data),
        ("MAPE with test data:", datafolder.test_data),
    )
    model = LinearRegression()

    # Train with price
    model.fit(features(train), train["price"])
    print("Train with price")
    for label, frame in eval_sets:
        y_hat = model.predict(features(frame))
        print(label, MAPE(frame["price"], y_hat))

    # Train with diff
    model.fit(features(train), train["diff"])
    print("Train with diff")
    for label, frame in eval_sets:
        y_hat = model.predict(features(frame))
        # Add back fair price
        y_hat += frame["fair price"]
        print(label, MAPE(frame["price"], y_hat))

    return model

In [154]:
# Build the dataset; the constructor reads the label and price spreadsheets,
# data.yaml and the train / valid / test splits of the "yolov7pytorch" folder.
datafolder = DataFolder("yolov7pytorch")

{'side dish': 10, 'white rice': 20, 'purple rice': 20, 'brown rice': 20, 'grilled mackerel': 30, 'salmon': 40, 'braised fish fillet': 30, 'steamed fish fillet': 30, 'sweet and spicy fried pork chops': 25, 'cuttlefish steak': 25, 'fried cod steak': 25, 'sweet and spicy chicken steak': 25, 'chicken steak': 25, 'fried pork chops': 25, 'braised pork ribs': 25, 'grilled chicken leg steak': 25, 'honey grilled chicken leg': 25, 'grilled chicken steak': 25, 'big fried chicken leg': 40, 'small fried chicken leg': 25, 'big grilled chicken leg': 40, 'small grilled chicken leg': 25, 'stir-fried minced pork': 25, 'pork with scrambled eggs': 25, 'sauteed pork': 25, 'garlic white meat': 25, 'braised pork': 25, 'curry chicken': 25, 'spicy chicken': 25, 'three cup chicken': 25, 'scallion chicken': 25, 'sausage': 25, 'kara chicken leg steak': 25, 'fried spanish mackerel': 25, 'fried chicken leg steak': 25, 'lion head': 25, 'big herbal chicken leg': 40, 'fried fish fillet': 30, 'braised fish': 30, 'small

In [155]:
# Fit and evaluate the regression; the returned model is the one last fitted
# on the "diff" target.
model = Pipeline(datafolder=datafolder)

Train with price
MAPE with valid data: 10.749437311034333
MAPE with test data: 9.868806911733095
Train with diff
MAPE with valid data: 7.593748510357242
MAPE with test data: 6.968164183996996


In [156]:
model.feature_names_in_

array(['side dish', 'white rice', 'purple rice', 'brown rice', 'plate',
       'box', 'grilled mackerel', 'salmon', 'braised fish fillet',
       'steamed fish fillet', 'sweet and spicy fried pork chops',
       'cuttlefish steak', 'fried cod steak',
       'sweet and spicy chicken steak', 'chicken steak',
       'fried pork chops', 'braised pork ribs',
       'grilled chicken leg steak', 'honey grilled chicken leg',
       'grilled chicken steak', 'big fried chicken leg',
       'small fried chicken leg', 'big grilled chicken leg',
       'small grilled chicken leg', 'stir-fried minced pork',
       'pork with scrambled eggs', 'sauteed pork', 'garlic white meat',
       'braised pork', 'curry chicken', 'spicy chicken',
       'three cup chicken', 'scallion chicken', 'sausage',
       'kara chicken leg steak', 'fried spanish mackerel',
       'fried chicken leg steak', 'lion head', 'big herbal chicken leg',
       'fried fish fillet', 'braised fish', 'small herbal chicken leg',
       

In [157]:
model.coef_

array([ 1.14023194e+01, -2.45067259e+01, -2.36213096e+01, -2.81382780e+01,
        6.43117153e+00,  6.63422863e+00,  2.26032919e+01,  3.64703694e-01,
        5.32423996e+00, -9.63763828e-03,  2.13162821e-13, -1.77280047e+01,
       -4.46767759e+00,  2.70203774e+01, -5.12450544e+00, -6.67941217e+00,
       -3.52590538e+01, -2.54726188e+00,  2.13346099e-01,  7.17512804e+00,
       -4.92832931e+00,  1.72442736e+00,  6.17711281e+00,  5.26411558e+00,
        3.10743501e+01,  1.94095935e+01,  1.38788062e+01,  2.09760154e+01,
        5.80908728e-01,  1.60302506e+01,  7.22705773e+00,  1.09633019e+01,
        1.99088547e+01, -7.61399657e-01,  4.86261108e-01, -1.85529885e+00,
       -7.10542736e-15,  3.29149207e+00,  4.60092622e+01,  4.21884749e-15,
       -1.83807180e+00, -8.55877776e+00,  2.47239645e+01, -6.16416222e-01,
       -2.46019802e+00, -1.08689839e+01, -1.83959945e+01,  6.63190320e+00,
        6.09968940e+00,  3.60128128e+01,  6.03959009e+00,  8.39259246e+00,
        0.00000000e+00, -

In [158]:
model.intercept_

4.497685334096449

In [159]:
def Predict(image=None):
    """Return a hard-coded sample detection result (stub detector).

    ``image`` is accepted but ignored.  The result maps ``"label1"`` ..
    ``"label6"`` to ``{"number": <class id>, "location": [four floats]}``.

    The sample values come from the label rows noted as "11_22_53";
    presumably each row is ``class_id x_center y_center width height``
    (YOLO format) -- TODO confirm.
    """
    detections = (
        (28, [0.5, 0.507118254879449, 1, 0.9857634902411022]),
        (48, [0.5246204620462046, 0.43256027554535015, 0.44550055005500555, 0.4600574052812859]),
        (34, [0.28771177117711766, 0.40107921928817447, 0.48155115511551155, 0.6221469575200919]),
        (37, [0.4185808580858086, 0.7576234213547646, 0.3830363036303631, 0.32029850746268657]),
        (37, [0.6995269526952695, 0.2943742824339839, 0.44767876787678773, 0.43167623421354767]),
        (37, [0.7822772277227723, 0.6765786452353616, 0.35491749174917503, 0.3648794489092996]),
    )
    # Entries are numbered from 1 in the order listed above.
    return {
        f"label{position}": {"number": class_id, "location": box}
        for position, (class_id, box) in enumerate(detections, start=1)
    }

In [160]:
class Processor():
    """Turn detector output into a price estimate with a frozen linear model.

    The feature names, coefficients and intercept are hard-coded copies of a
    fitted model; ``predict`` adds the predicted difference to a fair price
    computed from the per-class price table.
    """

    def __init__(self):
        """Store the frozen model parameters and the lookup tables."""
        self.model_feature_names_in_ = np.array(['side dish', 'white rice', 'purple rice', 'brown rice', 'plate',
       'box', 'grilled mackerel', 'salmon', 'braised fish fillet',
       'steamed fish fillet', 'sweet and spicy fried pork chops',
       'cuttlefish steak', 'fried cod steak',
       'sweet and spicy chicken steak', 'chicken steak',
       'fried pork chops', 'braised pork ribs',
       'grilled chicken leg steak', 'honey grilled chicken leg',
       'grilled chicken steak', 'big fried chicken leg',
       'small fried chicken leg', 'big grilled chicken leg',
       'small grilled chicken leg', 'stir-fried minced pork',
       'pork with scrambled eggs', 'sauteed pork', 'garlic white meat',
       'braised pork', 'curry chicken', 'spicy chicken',
       'three cup chicken', 'scallion chicken', 'sausage',
       'kara chicken leg steak', 'fried spanish mackerel',
       'fried chicken leg steak', 'lion head', 'big herbal chicken leg',
       'fried fish fillet', 'braised fish', 'small herbal chicken leg',
       'pig ear', 'fried shishamo', 'herbal chicken', 'shrimp rolls',
       'pan fried fish fillet', 'shacha pork', 'thai pork',
       'white sauce chicken', 'fried chicken rolls',
       'unknown square main dish', 'cheese pork chops', 'side_dishes_n'])
        self.model_coef_ = np.array([ 1.14023194e+01, -2.45067259e+01, -2.36213096e+01, -2.81382780e+01,
        6.43117153e+00,  6.63422863e+00,  2.26032919e+01,  3.64703694e-01,
        5.32423996e+00, -9.63763828e-03,  2.13162821e-13, -1.77280047e+01,
       -4.46767759e+00,  2.70203774e+01, -5.12450544e+00, -6.67941217e+00,
       -3.52590538e+01, -2.54726188e+00,  2.13346099e-01,  7.17512804e+00,
       -4.92832931e+00,  1.72442736e+00,  6.17711281e+00,  5.26411558e+00,
        3.10743501e+01,  1.94095935e+01,  1.38788062e+01,  2.09760154e+01,
        5.80908728e-01,  1.60302506e+01,  7.22705773e+00,  1.09633019e+01,
        1.99088547e+01, -7.61399657e-01,  4.86261108e-01, -1.85529885e+00,
       -7.10542736e-15,  3.29149207e+00,  4.60092622e+01,  4.21884749e-15,
       -1.83807180e+00, -8.55877776e+00,  2.47239645e+01, -6.16416222e-01,
       -2.46019802e+00, -1.08689839e+01, -1.83959945e+01,  6.63190320e+00,
        6.09968940e+00,  3.60128128e+01,  6.03959009e+00,  8.39259246e+00,
        0.00000000e+00, -3.01345455e+00])
        self.model_intercept_ = 4.497685334096449
        self.class_id_to_name = ['big fried chicken leg', 'big grilled chicken leg', 'big herbal chicken leg', 'box', 'braised fish', 'braised fish fillet', 'braised pork', 'braised pork ribs', 'brown rice', 'chicken steak', 'curry chicken', 'cuttlefish steak', 'fried chicken leg steak', 'fried chicken rolls', 'fried cod steak', 'fried pork chops', 'fried shishamo', 'fried spanish mackerel', 'garlic white meat', 'grilled chicken leg steak', 'grilled chicken steak', 'grilled mackerel', 'herbal chicken', 'honey grilled chicken leg', 'kara chicken leg steak', 'lion head', 'pan fried fish fillet', 'pig ear', 'plate', 'pork with scrambled eggs', 'purple rice', 'salmon', 'sausage', 'sauteed pork', 'scallion chicken', 'shacha pork', 'shrimp rolls', 'side dish', 'small fried chicken leg', 'small grilled chicken leg', 'small herbal chicken leg', 'spicy chicken', 'steamed fish fillet', 'stir-fried minced pork', 'sweet and spicy chicken steak', 'thai pork', 'three cup chicken', 'unknown square main dish', 'white rice', 'white sauce chicken']
        self.name_to_money = {'side dish': 10, 'white rice': 20, 'purple rice': 20, 'brown rice': 20, 'grilled mackerel': 30, 'salmon': 40, 'braised fish fillet': 30, 'steamed fish fillet': 30, 'sweet and spicy fried pork chops': 25, 'cuttlefish steak': 25, 'fried cod steak': 25, 'sweet and spicy chicken steak': 25, 'chicken steak': 25, 'fried pork chops': 25, 'braised pork ribs': 25, 'grilled chicken leg steak': 25, 'honey grilled chicken leg': 25, 'grilled chicken steak': 25, 'big fried chicken leg': 40, 'small fried chicken leg': 25, 'big grilled chicken leg': 40, 'small grilled chicken leg': 25, 'stir-fried minced pork': 25, 'pork with scrambled eggs': 25, 'sauteed pork': 25, 'garlic white meat': 25, 'braised pork': 25, 'curry chicken': 25, 'spicy chicken': 25, 'three cup chicken': 25, 'scallion chicken': 25, 'sausage': 25, 'kara chicken leg steak': 25, 'fried spanish mackerel': 25, 'fried chicken leg steak': 25, 'lion head': 25, 'big herbal chicken leg': 40, 'fried fish fillet': 30, 'braised fish': 30, 'small herbal chicken leg': 25, 'pig ear': 25, 'fried shishamo': 25, 'herbal chicken': 25, 'shrimp rolls': 25, 'pan fried fish fillet': 30, 'shacha pork': 25, 'thai pork': 25, 'white sauce chicken': 25, 'fried chicken rolls': 25, 'unknown square main dish': 25, 'cheese pork chops': 25}
        self.containers = ["box", "plate"]
        self.side_dishes = ["side dish"]

    def _cal_area(self, width, height):
        """Return ``width * height``."""
        return width * height

    def process_output(self, output):
        """Convert detector output into a one-row feature table.

        Args:
            output: mapping whose values are dicts with ``"number"`` (index
                into ``class_id_to_name``) and ``"location"`` (a sequence
                whose items 2 and 3 are multiplied to get the box area).

        Returns:
            A DataFrame with a single row (index 0).  Its columns are the
            model feature names in model order, holding the summed area ratio
            of each class (0 when absent) and the side-dish count in
            ``side_dishes_n``.  Classes that are not model features are
            appended as extra columns.  Ratios are relative to the total
            container area when it is non-zero, otherwise to the total area
            of all boxes; if that is zero too the areas are left at 0 instead
            of dividing by zero.
        """
        areas = {}
        side_dishes_n = 0
        for detection in output.values():
            class_name = self.class_id_to_name[detection["number"]]
            location = detection["location"]
            areas[class_name] = areas.get(class_name, 0) + self._cal_area(location[2], location[3])
            if class_name in self.side_dishes:
                side_dishes_n += 1

        # Prefer the container area as the reference; fall back to all boxes.
        container_area = sum(areas.get(name, 0) for name in self.containers)
        reference_area = container_area if container_area != 0 else sum(areas.values())
        if reference_area != 0:
            areas = {name: area / reference_area for name, area in areas.items()}

        # Start from an all-zero row so every model feature is present and in
        # model order, then overwrite the classes that were detected.
        row = dict.fromkeys(self.model_feature_names_in_.tolist(), 0)
        row.update(areas)
        row["side_dishes_n"] = side_dishes_n
        return pd.DataFrame([row], index=[0])

    def _calc_fair_price(self, data):
        """Return the fair price of the feature row built by ``process_output``.

        Every side dish counts ``name_to_money["side dish"]`` (via
        ``side_dishes_n``); every other non-container class with a non-zero
        value counts its ``name_to_money`` price once.
        """
        fair_price = 0
        for key in data.keys():
            # Containers are free; side dishes are priced through their count.
            if key in self.containers or key in self.side_dishes:
                continue
            value = data[key].values[0]
            if value == 0:
                continue
            if key == "side_dishes_n":
                fair_price += value * self.name_to_money["side dish"]
            else:
                fair_price += self.name_to_money[key]
        return fair_price

    def predict(self, output):
        """Return the estimated price for one detector output.

        The fair price is printed, then the model's predicted difference
        (``features . coef + intercept``) is added to it.

        Returns:
            A one-element NumPy array holding the estimated price.
        """
        data = self.process_output(output)
        # Calc fair price
        fair_price = self._calc_fair_price(data)
        print(fair_price)
        # Align the columns with the coefficient order before the dot product.
        features = data.reindex(columns=self.model_feature_names_in_, fill_value=0)
        # Predict diff
        diff = np.dot(features.to_numpy(dtype=float), self.model_coef_) + self.model_intercept_
        # Add back fair price
        return diff + fair_price

In [161]:
# Run the stub detector, convert its detections into a one-row feature table
# and show that table.
output = Predict()
processor = Processor()
data = processor.process_output(output)
display(data)

Unnamed: 0,side dish,white rice,purple rice,brown rice,plate,box,grilled mackerel,salmon,braised fish fillet,steamed fish fillet,...,herbal chicken,shrimp rolls,pan fried fish fillet,shacha pork,thai pork,white sauce chicken,fried chicken rolls,unknown square main dish,cheese pork chops,side_dishes_n
0,0.451873,0.207916,0,0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [162]:
# Prediction of the fitted model for the sample row; `model` was last fitted
# on the "diff" target, so this is a price difference, not a price.
model.predict(data)

array([7.99630889])

In [163]:
# Processor.predict prints the fair price, then returns the fair price plus
# the predicted difference.
y_hat = processor.predict(output)
print(y_hat)

75
[82.99630894]
