In [4]:
import pandas as pd
import glob
import numpy as np
from operator import truediv
import os
import xml.etree.cElementTree as ET
import pickle

In [5]:
def get_X_Y(df):
    """
    Get big X and big Y
    """
    X = df.iloc[:,1:5].values
    Y = df.iloc[:,0].values
    Z = df.iloc[:,-1].values
    
    return X,Y,Z

def get_center(df, X_df, param):
    """
    Get the center of x or y
    This function serves as a sub-function for get_center_division()
    :param: Can be 'x' or 'y', specify which center to be calculated
    """
    if param == 'x': 
        list_x_center = []
        for i in range(0, len(df)):
            x_center = (X_df[i][2] - X_df[i][0])/2 # (xmax - xmin)/2
            list_x_center.append(x_center)
        return list_x_center
    elif param == 'y':
        list_y_center = []
        for i in range(0, len(df)):
            y_center = (X_df[i][3] - X_df[i][1])/2 # (ymax - ymin)/2
            list_y_center.append(y_center)
        return list_y_center
    else:
        return None
    
def get_center_division(df,X_df,center):
    """
    Get the x divided by width or y divided by height
    :param: Can be 'x' or 'y', specify which center to be calculated
    """
    if center == 'x':
        the_center = get_center(df,X_df,center)
        width = df['width'].values
        xDVwidth = list(map(truediv, the_center, width)) 

        return xDVwidth
    elif center == 'y':
        the_center = get_center(df,X_df,center)
        height = df['height'].values
        yDVheight = list(map(truediv, the_center, height)) 

        return yDVheight
    else:
        return None

def get_maxmin_subtract(df,X_df,param):
    """
    Get the center of x or y
    This function serves as a sub-function for get_subtracted_division()
    :param: Can be 'x' or 'y', specify which center to be calculated
    """
    if param == 'x':
        list_x_subtract = []
        for i in range(0, len(df)):
            list_x_subtract.append(X_df[i][2] - X_df[i][0]) # xmax - xmin
        return list_x_subtract
    elif param == 'y':
        list_y_subtract = []
        for i in range(0, len(df)):
            list_y_subtract.append(X_df[i][3] - X_df[i][1]) # ymax-ymin
        return list_y_subtract
    else:
        return None
    
def get_subtracted_division(df,X_df,center):
    """
    Get the x divided by width or y divided by height
    :param: Can be 'x' or 'y', specify which center to be calculated
    """
    if center == 'x':
        the_center = get_maxmin_subtract(df,X_df,center)
        width = df['width'].values
        xDVwidth = list(map(truediv, the_center, width)) 

        return xDVwidth
    elif center == 'y':
        the_center = get_maxmin_subtract(df,X_df,center)
        height = df['height'].values
        yDVheight = list(map(truediv, the_center, height)) 

        return yDVheight
    else:
        return None

    
def readcsv(the_path):

    path = the_path
    all_files = glob.glob(path + "/*.csv")
    li = []
    
    bigX, bigY, bigZ = [],[],[]

    for filename in all_files:

            a_df = pd.read_csv(filename)
            li.append(a_df)
            a_df['count']=a_df.groupby('filename')['Name'].transform('count').values
            X,Y,Z = get_X_Y(a_df)
            bigX.append(X)
            bigY.append(Y)
            bigZ.append(Z)
    df = pd.concat(li, axis=0, ignore_index=True)
    

    return df, bigX, bigY, bigZ

def readxml(the_path):
    all_files = list(glob.iglob(os.path.join (the_path, "*.xml")))
    li = []
#     i = 0
    for fn in all_files:
#         i += 1
#         if i < 3:
            dfcols = ['Name','xmin', 'ymin', 'xmax','ymax', 'filename']

            mytree = ET.parse(fn)
            myroot = mytree.getroot()
            rows = myroot.findall('object')
            rows2 = myroot.findall('filename')

            for obj2 in rows2:
                name = obj2.text 
            xml_data = [ [obj[0].text, obj[4][0].text,obj[4][1].text,obj[4][2].text,obj[4][3].text, name] for obj in rows]

            df_xml = pd.DataFrame(xml_data, columns=dfcols)
            df_xml['count']=df_xml.groupby('filename')['Name'].transform('count').values
            
            li.append(df_xml)

    df = pd.concat(li, axis=0, ignore_index=True)
    return df


df_classification, bigX_cl, bigY_cl, bigZ_cl = readcsv('csv_classification/')
df_classification

Unnamed: 0,Name,xmin,ymin,xmax,ymax,width,height,filename,count
0,product_name,81,4,171,43,1104,126,1608466652_0my5W.jpg@product_block_10.jpg,9
1,product_name,334,9,447,47,1104,126,1608466652_0my5W.jpg@product_block_10.jpg,9
2,product_total_money,859,76,1099,122,1104,126,1608466652_0my5W.jpg@product_block_10.jpg,9
3,vat,1008,30,1086,74,1104,126,1608466652_0my5W.jpg@product_block_10.jpg,9
4,product_name,189,10,320,47,1104,126,1608466652_0my5W.jpg@product_block_10.jpg,9
...,...,...,...,...,...,...,...,...,...
17193,vat,1394,81,1494,148,1542,227,1610255383_w9L3a.jpg@product_block_8.jpg,14
17194,product_quantity,839,121,909,188,1542,227,1610255383_w9L3a.jpg@product_block_8.jpg,14
17195,product_total_money,1189,126,1529,212,1542,227,1610255383_w9L3a.jpg@product_block_8.jpg,14
17196,product_unit_price,135,128,468,195,1542,227,1610255383_w9L3a.jpg@product_block_8.jpg,14


In [7]:
df_detection, bigX_dt, bigY_dt, bigZ_dt = readcsv('csv_detection/')

In [8]:
X_classification,Y_classification,Z_classification = get_X_Y(df_classification)

In [9]:
X_detection,Y_detection,Z_detection = get_X_Y(df_detection)

In [10]:
def get_xy_coordinate_in_a_single_box(X_cl):
    x = []
    y = []
    for i in range(0, len(X_cl)):
        x.append(X_cl[i][0])
        x.append(X_cl[i][2])
        y.append(X_cl[i][1])
        y.append(X_cl[i][3])
    c = 2
    fi_x= lambda x, c: [tuple(x[i:i+c]) for i in range(0, len(x), c)]
    fi_y= lambda y, c: [tuple(y[i:i+c]) for i in range(0, len(y), c)]
    x = fi_y(x,c)
    y = fi_y(y,c)
    return x,y

def get_xy_coordinate():
    x = []
    y = []
    for i in range(0, len(bigX_cl)):
        xx,yy = get_xy_coordinate_in_a_single_box(bigX_cl[i])
        x.append(xx)
        y.append(yy)
    return x,y
x_coords,y_coords = get_xy_coordinate()

In [11]:
"""
This part finds the relationship of y-coordinates
"""
def get_over_lap(a,b):
    """
    Get the interval overlapping between two lists
    """
    return max(0, min(a[1],b[1])-max(a[0],b[0]))

def get_ovl(box, i):
    """
    Get the list of interval overlapping of the desired box and other boxes in y_coords
    """
    rs = []
    for j in range(len(y_coords[i])):
        rs.append(get_over_lap(box,y_coords[i][j]))
    return rs

def count_sameline(box, i):
    """
    Count the number of other boxes that are on the same line with the given box
    using a given interval
    """
    ovl = get_ovl(box, i)
    c = 0
    for i in ovl:
        if i >= 29:
            c+=1
    return c

def create_list_sameline():
    """
    Create a list of boxes that count the number of boxes on a same line
    """
    lst = []
    for i in range(0,len(y_coords)): # Loop over the 2D lists(files) inside // i = 0 - ...
        for j in range(0,len(y_coords[i])): # Loop over the 1D list in the 2D lists // j = 0 - len(y_coords[0]) = 0 - 9
            lst.append(count_sameline(y_coords[i][j], i))
    return lst


In [12]:
"""
This part finds the relationship of x-coordinates
"""
# Part 1
def get_boxes_on_row(box, i):
    """
    Given a single box, append to the list all the boxes that lie on a same line
    :param: box y-coordinates of a single box to determine whether they are on a same line
    :param: i the ith file
    """
    ovl = get_ovl(box, i)
    lst_bor = []
    time = 0
    for j in range(0, len(ovl)):
        if ovl[j] >= 29:
            lst_bor.append(x_coords[i][j])
    return lst_bor
    
def get_pos_in_row(box, i, j):
    """
    Given a single box, determine the position(index) of the box on the line
    """
    lst_bor = get_boxes_on_row(box, i)
    box = x_coords[i][j] # convert
    lst_sorted = lst_bor.copy()
    lst_sorted.sort()
    index = 0
    for boxes in lst_bor:
        if box == boxes:
            single_ix = lst_sorted.index(boxes)
            index += single_ix
    return index

def create_list_posline():
    """
    Final func:
        Create a list of positions on rows of each box
    """
    lst = []
    for i in range(0,len(y_coords)): # Loop over the 2D lists(files) inside // i = 0 - ...
        for j in range(0,len(y_coords[i])): # Loop over the 1D list in the 2D lists // j = 0 - len(y_coords[0]) = 0 - 9
            lst.append(get_pos_in_row(y_coords[i][j], i, j))
    return lst
# Part 2
def calculate_prev_dis(box,i, j):
    """
    Given a single box, get the distance between a given box to the previous box
    Constraint: if box is at the position index 0, then distance = 0
    need debug
    """
    lst_bor = get_boxes_on_row(box, i)
    box = x_coords[i][j] # convert
    lst_sorted = lst_bor.copy()
    lst_sorted.sort()
    distance = 0
    for boxes in lst_bor:
        if box == boxes:
            single_ix = lst_sorted.index(boxes)
            if single_ix != 0:
                distance += lst_sorted[single_ix][0] - lst_sorted[single_ix -1][1] # xmin[current] - xmax[current-1]
            else:
                distance = 0
    return distance

def calculate_next_dis(box,i,j):
    lst_bor = get_boxes_on_row(box, i)
    box = x_coords[i][j] # convert
    lst_sorted = lst_bor.copy()
    lst_sorted.sort()
    lst_index = []
    distance = 0
    for boxes in lst_bor:
        if box == boxes:
            single_ix = lst_sorted.index(boxes)
            if single_ix < len(lst_sorted) - 1:
                distance += lst_sorted[single_ix+1][0] - lst_sorted[single_ix][1] # xmin[current+1] - xmax[current]
            else:
                distance = 0
    return distance

def get_all_prev_dis():
    lst = []
    for i in range(0,len(y_coords)): # Loop over the 2D lists(files) inside // i = 0 - ...
        for j in range(0,len(y_coords[i])): # Loop over the 1D list in the 2D lists // j = 0 - len(y_coords[0]) = 0 - 9
            lst.append(calculate_prev_dis(y_coords[i][j], i, j))
    return lst
def get_all_next_dis():
    lst = []
    for i in range(0,len(y_coords)): # Loop over the 2D lists(files) inside // i = 0 - ...
        for j in range(0,len(y_coords[i])): # Loop over the 1D list in the 2D lists // j = 0 - len(y_coords[0]) = 0 - 9
            lst.append(calculate_next_dis(y_coords[i][j], i, j))
    return lst
def test():
    print(get_boxes_on_row(y_coords[0][0], 0))
    a = get_boxes_on_row(y_coords[0][0], 0)
    a.sort()
    print(a)
    print(x_coords[0])
    print('true pos: ', get_pos_in_row(y_coords[0][0], 0,0)) # True position on a line in picture
    print(get_all_prev_dis())
    print(get_all_next_dis())


In [13]:
def get_features(df,X_df):
    x_centerDVwidth = get_center_division(df, X_df, 'x')
    y_centerDVheight = get_center_division(df,X_df, 'y')

    x_subtractDVwidth = get_subtracted_division(df,X_df, 'x')
    y_subtractDVheight = get_subtracted_division(df,X_df, 'y')
    
    return x_centerDVwidth, y_centerDVheight, x_subtractDVwidth, y_subtractDVheight
def create_newdf(df, X_df):
    x_centerDVwidth, y_centerDVheight, x_subtractDVwidth, y_subtractDVheight = get_features(df, X_df)
    
    df = df.assign(x_centerDVwidth =x_centerDVwidth )
    df = df.assign(y_centerDVheight =y_centerDVheight )
    df = df.assign(x_subtractDVwidth =x_subtractDVwidth)
    df = df.assign(y_subtractDVheight =y_subtractDVheight )
    
    df = df[['x_centerDVwidth','y_centerDVheight','x_subtractDVwidth','y_subtractDVheight','count', 'Name']]
    return df

In [14]:
new_df_cl = create_newdf(df_classification, X_classification)
new_df_cl = new_df_cl.assign(NumBoxInLine = create_list_sameline())
new_df_cl = new_df_cl.assign(PosBoxOnLine = create_list_posline())
new_df_cl = new_df_cl.assign(PrevDis = get_all_prev_dis())
new_df_cl = new_df_cl.assign(NextDis = get_all_next_dis())


In [15]:
new_df_dt = create_newdf(df_detection, X_detection)

In [16]:
def get_new_XY(df):
    X = df.iloc[:,:4].values
    Y = df.iloc[:,-1].values
    return X,Y

In [17]:
newX_cl,newY_cl = get_new_XY(new_df_cl)
newX_cl = newX_cl.tolist()

In [18]:
newX_dt,newY_dt = get_new_XY(new_df_dt)
newX_dt = newX_dt.tolist()

In [19]:
def get_detected_class(df_cl,df_dt, X_cl, X_dt, Y_cl, Y_dt):
    """
    Get the detected class from df2 with if equal coordinates
    :param: df1 the classification dataframe
    :param: df2 the detection dataframe
    :return: the list of featured_class taken from df2
    """
    result_featured = []
    pos = 0
    for i in range(0,len(X_cl)):
        if X_cl[i] in X_dt:
            position_of_X_dt = X_dt.index(X_cl[i]) # The position of the desired ROW in X_dt
            result_featured.append(Y_dt[position_of_X_dt]) # Getting the desired feature from the specific position
        else:
            result_featured.append("NaN")
    return result_featured

a = get_detected_class(new_df_cl,new_df_dt,newX_cl,newX_dt,newY_cl,newY_dt)

In [20]:
new_df_cl = new_df_cl.assign(FeaturedClass = a)


new_df_cl = new_df_cl[['Name','x_centerDVwidth','y_centerDVheight','x_subtractDVwidth','y_subtractDVheight','NumBoxInLine','PosBoxOnLine','count','PrevDis','NextDis','FeaturedClass']]

new_df_cl

Unnamed: 0,Name,x_centerDVwidth,y_centerDVheight,x_subtractDVwidth,y_subtractDVheight,NumBoxInLine,PosBoxOnLine,count,PrevDis,NextDis,FeaturedClass
0,product_name,0.040761,0.154762,0.081522,0.309524,4,1,9,39,18,text
1,product_name,0.051178,0.150794,0.102355,0.301587,4,3,9,14,0,text
2,product_total_money,0.108696,0.182540,0.217391,0.365079,3,2,9,221,0,price
3,vat,0.035326,0.174603,0.070652,0.349206,1,0,9,0,0,text
4,product_name,0.059330,0.146825,0.118659,0.293651,3,1,9,18,14,text
...,...,...,...,...,...,...,...,...,...,...,...
17193,vat,0.032425,0.147577,0.064851,0.295154,2,1,14,780,0,text
17194,product_quantity,0.022698,0.147577,0.045396,0.295154,3,1,14,371,280,text
17195,product_total_money,0.110246,0.189427,0.220493,0.378855,3,2,14,280,0,price
17196,product_unit_price,0.107977,0.147577,0.215953,0.295154,3,0,14,0,371,price


In [21]:
new_df_cl.isnull().any()

Name                  False
x_centerDVwidth       False
y_centerDVheight      False
x_subtractDVwidth     False
y_subtractDVheight    False
NumBoxInLine          False
PosBoxOnLine          False
count                 False
PrevDis               False
NextDis               False
FeaturedClass         False
dtype: bool

In [22]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
X = new_df_cl.iloc[:,:10].values
lines = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(lines.fit_transform(X))

from sklearn.preprocessing import LabelEncoder
Y = LabelEncoder().fit_transform(new_df_cl.FeaturedClass)
# len(X)
len(Y)

17198

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.31, random_state = 1)

In [24]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, :24] = sc.fit_transform(X_train[:, :24])
X_test[:, :24] = sc.transform(X_test[:, :24])
X_train

array([[-0.08887874268469949, -0.18336764540069436, 2.5130976263188494,
        ..., 1.445203400643282, -0.46505134585076663,
        -0.3473710187228068],
       [-0.08887874268469949, 5.453524790673502, -0.3979153016291657,
        ..., -2.801872785534867, -0.46505134585076663,
        -0.1508531929824561],
       [-0.08887874268469949, -0.18336764540069436, 2.5130976263188494,
        ..., 1.1418408159162712, 1.4547602385146035, 0.8317359357192973],
       ...,
       [-0.08887874268469949, -0.18336764540069436, 2.5130976263188494,
        ..., -0.374972107718782, -0.46505134585076663,
        -0.28531275796269606],
       [-0.08887874268469949, -0.18336764540069436, -0.3979153016291657,
        ..., -0.07160952299177131, -0.46505134585076663,
        0.35078441482843903],
       [-0.08887874268469949, -0.18336764540069436, -0.3979153016291657,
        ..., -0.07160952299177131, -0.42931017273758154,
        -0.3628855839128345]], dtype=object)

In [25]:
from sklearn.ensemble import RandomForestClassifier
classifier =  RandomForestClassifier(n_estimators=200, max_depth= 20, min_samples_split= 8, max_features= 2,
                                 class_weight='balanced', min_samples_leaf=1, random_state = 3)
classifier.fit(X_train,y_train)
from sklearn.metrics import accuracy_score, classification_report
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred)*100,'%', end='')

98.66841710427607 %

In [26]:
from xgboost import XGBClassifier
model_xgb = XGBClassifier(objective='multi:softprob',learning_rate = 0.05,
                          max_depth= 20,
                          n_estimators = 200,
                          colsample_bytree = 0.5,random_state = 2, use_label_encoder = False)
model_xgb.fit(X_train,y_train)
y_pred = model_xgb.predict(X_test)
print(accuracy_score(y_test,y_pred)*100, '%')

98.70592648162041 %


In [27]:
from sklearn.ensemble import GradientBoostingClassifier
model_GB= GradientBoostingClassifier(random_state=2,n_estimators=100,learning_rate=0.05,loss="deviance")
model_GB.fit(X_train,y_train)
y_pred= model_GB.predict(X_test)
print(accuracy_score(y_test, y_pred)*100, '%')

98.74343585896474 %
