In [27]:
import pandas as pd
import yaml
import os
import numpy as np

class DataFolder():
    def __init__(self, 
                 folder_path, 
                 labels_path="./標籤集合.xlsx", 
                 prices_path="./圖片的價格1121.xlsx"):
        '''
        folder_path: yolov7pytorch folder
        '''
        self.labels = pd.DataFrame()
        self.prices = pd.DataFrame()
        self.side_dishes = []
        self.rices = []
        self.containers = []
        self.main_dishes = []
        self.class_id_to_name = {}
        
        self.train_data = pd.DataFrame()
        self.valid_data = pd.DataFrame()
        self.test_data = pd.DataFrame()

        self.load_labels(labels_path)
        self.load_prices(prices_path)
        self.load_class_id_to_name(folder_path + "/data.yaml")
        self.load_train(folder_path + "/train")
        self.load_valid(folder_path + "/valid")
        self.load_test(folder_path + "/test")

    def load_labels(self, path):
        if path.endswith('.xlsx'):
            self.labels = pd.read_excel(path)
        else:
            print('File format not supported')
            return
        
        # Get unique labels
        self.side_dishes = self.labels["副菜"].dropna().unique().tolist()
        self.rices = self.labels["飯"].dropna().unique().tolist()
        self.containers = self.labels["容器"].dropna().unique().tolist()
        self.main_dishes = self.labels["主菜"].dropna().unique().tolist()
        self.main_dishes.append("sauteed pork")
        
        # Leave english labels
        self.side_dishes = [item.split('(')[0] for item in self.side_dishes]
        self.rices = [item.split('(')[0] for item in self.rices]
        self.containers = [item.split('(')[0] for item in self.containers]
        self.main_dishes = [item.split('(')[0] for item in self.main_dishes]
        
    def load_prices(self, path):
        if path.endswith('.xlsx'):
            self.prices = pd.read_excel(path)
        else:
            print('File format not supported')

    def load_class_id_to_name(self, path):
        self._load_yaml(path)

    def load_train(self, path):
        self.train_data = self._load_data(path)

    def load_valid(self, path):
        self.valid_data = self._load_data(path)

    def load_test(self, path):
        self.test_data = self._load_data(path)

    def _load_data(self, path):
        '''
        dataframe columns:
            * side_dishes: sum of the area ratio of each side dish
            * rices: one hot encoding 3 types of rice
            * containers: one hot encoding 2 types of container
            * main_dishes: one hot encoding >50 types of main dishes
            * side_dishes_n: number of side dishes
            * price: 
            * fair price: 
        '''
        col = self.side_dishes + self.rices + self.containers + self.main_dishes
        data = pd.DataFrame(columns=col)
        containerless_n = 0
        priceless_n = 0
        
        # For each image label file
        for filename in os.listdir(path + "/labels"):
            if not filename.endswith(".txt"):
                print(f"{filename} is not a label file.")
                continue

            df = pd.read_csv(path + "/labels/" + filename, 
                             sep=" ", 
                             header=None,
                             names=["class_id", "x_center", "y_center", "width", "height"])
            df["class_name"] = df["class_id"].apply(lambda x: self.class_id_to_name[x])
            df["area"] = df.apply(lambda x: self._cal_area(x["width"], x["height"]), axis=1)
            
            # If has container, calculate the ratio
            if df[df["class_name"].isin(self.containers)]["area"].sum() != 0:
                df["area_ratio"] = df["area"] / df[df["class_name"].isin(self.containers)]["area"].sum()
            # Else calculate the ratio of all
            else: 
                df["area_ratio"] = df["area"] / df["area"].sum()
                containerless_n += 1
            
            # Get the price
            price = self.prices[self.prices["filename"] == filename.split("_jpg")[0]]["price"].values[0]
            # Get the fair price
            fair_price = self.prices[self.prices["filename"] == filename.split("_jpg")[0]]["fair price"].values[0]
            # If the price is not available, replace it with the fair price
            if pd.isna(price):
                price = fair_price
                priceless_n += 1

            # Make a new row
            dic = {}
            for index, row in df.iterrows():
                # Sum of the area ratio of each class
                dic[row["class_name"]] = dic.get(row["class_name"], 0) + row["area_ratio"]
            dic["side_dishes_n"] = len(df[df["class_name"].isin(self.side_dishes)])
            dic["price"] = price
            dic["fair price"] = fair_price

            # Concat the new row to the data, index is the filename, fill na with 0
            data = pd.concat([data, pd.DataFrame(dic, index=[filename.split("_jpg")[0]])]).fillna(0)
        
        print(f"There are {len(data)} image label files in the dataset {path}.")
        print(f"There are {containerless_n} no container in the dataset.")
        print(f"There are {priceless_n} no price in the dataset.")

        return data
    
    def _load_yaml(self, path):
        with open(path, 'r') as f:
            try:
                data = yaml.safe_load(f)
                print(f"There are {data['nc']} classes in the dataset from yaml.")
                self.class_id_to_name = data["names"]
            except yaml.YAMLError as exc:
                print(exc)

    def _cal_area(self, width, height):
        return width * height

In [28]:
datafolder = DataFolder("final project.v9i.yolov7pytorch")

There are 49 classes in the dataset from yaml.


  data = pd.concat([data, pd.DataFrame(dic, index=[filename.split("_jpg")[0]])]).fillna(0)


There are 747 image label files in the dataset final project.v9i.yolov7pytorch/train.
There are 3 no container in the dataset.
There are 66 no price in the dataset.


  data = pd.concat([data, pd.DataFrame(dic, index=[filename.split("_jpg")[0]])]).fillna(0)


There are 135 image label files in the dataset final project.v9i.yolov7pytorch/valid.
There are 4 no container in the dataset.
There are 11 no price in the dataset.


  data = pd.concat([data, pd.DataFrame(dic, index=[filename.split("_jpg")[0]])]).fillna(0)


There are 62 image label files in the dataset final project.v9i.yolov7pytorch/test.
There are 0 no container in the dataset.
There are 4 no price in the dataset.


In [30]:
# Apply linear regression
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(datafolder.train_data.drop(["price", "fair price"], axis=1), datafolder.train_data["price"])
model.coef_, model.intercept_

array([ 9.20155345e+00, -2.12590582e+01, -1.47868113e+01, -2.94990416e-01,
        4.25757570e+01,  3.26299237e+01,  8.69814168e+01,  1.41809264e+02,
        7.45315257e+01,  6.31020182e+01,  1.67688086e-12,  1.09466089e+02,
        5.68543741e+01,  1.17942626e+02,  7.57796035e+01,  4.33471191e+01,
       -5.32907052e-14,  8.64758589e+01,  7.84195168e+01,  7.13262074e+01,
        1.43356245e+02,  7.88008443e+01,  7.86462986e+01,  1.11875828e+02,
        9.43140301e+01,  1.02077896e+02, -2.27373675e-13,  9.86157105e+01,
        7.65454728e+01,  9.10939942e+01,  7.47949025e+01,  7.63811230e+01,
        1.01280594e+02,  1.76477588e+02,  6.81441701e+01,  6.42112726e+01,
        1.42108547e-13,  1.70107544e+02,  1.25581601e+02, -4.26325641e-14,
        1.41778174e+02,  5.16080609e+01,  6.39939536e+01,  8.19702104e+01,
        3.48004122e+01,  5.30684332e+01,  1.83154136e+02,  7.44742879e+01,
        6.56576231e+01,  1.09128738e+02,  1.19930681e+02,  0.00000000e+00,
        9.31028868e+01,  

In [33]:
y_hat = model.predict(datafolder.valid_data.drop(["price", "fair price"], axis=1))
y_hat

array([ 87.29534077,  76.35869216,  66.72275606,  72.34103337,
        63.18348846,  80.62302788,  63.85895251,  61.58253962,
        76.41387452,  72.21246582, 120.95758639,  70.65727688,
        60.11412457,  79.26269607,  81.27220847,  66.1839342 ,
        84.48370552,  85.67356901,  84.71321406,  71.37888581,
       111.01090807,  72.1831646 ,  61.39412472,  25.40140086,
        76.8589151 ,  77.16003255,  87.76928823,  72.01955644,
        75.91080633,  74.05697115,  73.64947501,  81.84436073,
        89.44192073,  72.29385162,  68.32675619,  92.24002744,
        62.13356403,  68.0267879 ,  73.84292322,  87.69287199,
        69.62307925,  49.69951666,  74.82012951,  96.83911368,
        26.63504629,  52.35493311,  78.334893  ,  70.30637444,
        75.37511742,  72.60275438,  74.79406584,  69.54396477,
        73.32761893, 111.71478096,  51.55100644,  72.45870791,
        71.31422291,  86.34907161,  69.40858928,  73.79588215,
        62.37815337,  62.97141902,  72.2715513 ,  72.13

In [None]:
def MAPE(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100