In [1]:
import pandas as pd
import glob
import numpy as np
from operator import truediv

In [2]:
def get_X_Y(df):
    """
    Get big X and big Y
    """
    X = df.iloc[:,1:5].values
    Y = df.iloc[:,0].values
    Z = df.iloc[:,-1].values
    
    return X,Y,Z

def get_center(df, X_df, param):
    """
    Get the center of x or y
    This function serves as a sub-function for get_center_division()
    :param: Can be 'x' or 'y', specify which center to be calculated
    """
    if param == 'x': 
        list_x_center = []
        for i in range(0, len(df)):
            x_center = (X_df[i][2] - X_df[i][0])/2 # (xmax - xmin)/2
            list_x_center.append(x_center)
        return list_x_center
    elif param == 'y':
        list_y_center = []
        for i in range(0, len(df)):
            y_center = (X_df[i][3] - X_df[i][1])/2 # (ymax - ymin)/2
            list_y_center.append(y_center)
        return list_y_center
    else:
        return None
    
def get_center_division(df,X_df,center):
    """
    Get the x divided by width or y divided by height
    :param: Can be 'x' or 'y', specify which center to be calculated
    """
    if center == 'x':
        the_center = get_center(df,X_df,center)
        width = df['width'].values
        xDVwidth = list(map(truediv, the_center, width)) 

        return xDVwidth
    elif center == 'y':
        the_center = get_center(df,X_df,center)
        height = df['height'].values
        yDVheight = list(map(truediv, the_center, height)) 

        return yDVheight
    else:
        return None

def get_maxmin_subtract(df,X_df,param):
    """
    Get the center of x or y
    This function serves as a sub-function for get_subtracted_division()
    :param: Can be 'x' or 'y', specify which center to be calculated
    """
    if param == 'x':
        list_x_subtract = []
        for i in range(0, len(df)):
            list_x_subtract.append(X_df[i][2] - X_df[i][0]) # xmax - xmin
        return list_x_subtract
    elif param == 'y':
        list_y_subtract = []
        for i in range(0, len(df)):
            list_y_subtract.append(X_df[i][3] - X_df[i][1]) # ymax-ymin
        return list_y_subtract
    else:
        return None
    
def get_subtracted_division(df,X_df,center):
    """
    Get the x divided by width or y divided by height
    :param: Can be 'x' or 'y', specify which center to be calculated
    """
    if center == 'x':
        the_center = get_maxmin_subtract(df,X_df,center)
        width = df['width'].values
        xDVwidth = list(map(truediv, the_center, width)) 

        return xDVwidth
    elif center == 'y':
        the_center = get_maxmin_subtract(df,X_df,center)
        height = df['height'].values
        yDVheight = list(map(truediv, the_center, height)) 

        return yDVheight
    else:
        return None

    
def read(the_path):

    path = the_path
    all_files = glob.glob(path + "/*.csv")
    li = []
    
    bigX, bigY, bigZ = [],[],[]
#     i = 0
    for filename in all_files:
#         i += 1
#         if i < 3:
            a_df = pd.read_csv(filename)
            li.append(a_df)
            a_df['count']=a_df.groupby('filename')['Name'].transform('count').values
            X,Y,Z = get_X_Y(a_df)
            bigX.append(X)
            bigY.append(Y)
            bigZ.append(Z)
    df = pd.concat(li, axis=0, ignore_index=True)
    

    return df, bigX, bigY, bigZ


df_classification, bigX_cl, bigY_cl, bigZ_cl = read('csv_classification/')


In [3]:
df_detection, bigX_dt, bigY_dt, bigZ_dt = read('csv_detection/')

In [4]:
X_classification,Y_classification,Z_classification = get_X_Y(df_classification)

In [5]:
X_detection,Y_detection,Z_detection = get_X_Y(df_detection)

In [6]:
def get_xy_coordinate_in_a_single_box(X_cl):
    x = []
    y = []
    for i in range(0, len(X_cl)):
        x.append(X_cl[i][0])
        x.append(X_cl[i][2])
        y.append(X_cl[i][1])
        y.append(X_cl[i][3])
    c = 2
    fi_x= lambda x, c: [tuple(x[i:i+c]) for i in range(0, len(x), c)]
    fi_y= lambda y, c: [tuple(y[i:i+c]) for i in range(0, len(y), c)]
    x = fi_y(x,c)
    y = fi_y(y,c)
    return x,y

def get_xy_coordinate():
    x = []
    y = []
    for i in range(0, len(bigX_cl)):
        xx,yy = get_xy_coordinate_in_a_single_box(bigX_cl[i])
        x.append(xx)
        y.append(yy)
    return x,y
x_coords,y_coords = get_xy_coordinate()

In [7]:
"""
This part finds the relationship of y-coordinates
"""
def get_over_lap(a,b):
    """
    Get the interval overlapping between two lists
    """
    return max(0, min(a[1],b[1])-max(a[0],b[0]))

def get_ovl(box, i):
    """
    Get the list of interval overlapping of the desired box and other boxes in y_coords
    """
    rs = []
    for j in range(len(y_coords[i])):
        rs.append(get_over_lap(box,y_coords[i][j]))
    return rs
#     return [get_over_lap(box, y) for y in y_coords]

def count_sameline(box, i):
    """
    Count the number of other boxes that are on the same line with the given box
    using a given interval
    """
    ovl = get_ovl(box, i)
    c = 0
    for i in ovl:
        if i >= 29:
            c+=1
    return c

def create_list_sameline():
    """
    Create a list of boxes that count the number of boxes on a same line
    """
    lst = []
    for i in range(0,len(y_coords)): # Loop over the 2D lists(files) inside // i = 0 - ...
        for j in range(0,len(y_coords[i])): # Loop over the 1D list in the 2D lists // j = 0 - len(y_coords[0]) = 0 - 9
            lst.append(count_sameline(y_coords[i][j], i))
    return lst


In [8]:
"""
This part finds the relationship of x-coordinates
"""
# Part 1
def get_boxes_on_row(box, i):
    """
    Given a single box, append to the list all the boxes that lie on a same line
    :param: box y-coordinates of a single box to determine whether they are on a same line
    :param: i the ith file
    """
    ovl = get_ovl(box, i)
    lst_bor = []
    time = 0
    for j in range(0, len(ovl)):
        if ovl[j] >= 29:
            lst_bor.append(x_coords[i][j])
    return lst_bor
    
def get_pos_in_row(box, i, j):
    """
    Given a single box, determine the position(index) of the box on the line
    """
    lst_bor = get_boxes_on_row(box, i)
    box = x_coords[i][j] # convert
    lst_sorted = lst_bor.copy()
    lst_sorted.sort()
    index = 0
    for boxes in lst_bor:
        if box == boxes:
            single_ix = lst_sorted.index(boxes)
            index += single_ix
    return index

def create_list_posline():
    """
    Final func:
        Create a list of positions on rows of each box
    """
    lst = []
    for i in range(0,len(y_coords)): # Loop over the 2D lists(files) inside // i = 0 - ...
        for j in range(0,len(y_coords[i])): # Loop over the 1D list in the 2D lists // j = 0 - len(y_coords[0]) = 0 - 9
            lst.append(get_pos_in_row(y_coords[i][j], i, j))
    return lst
# Part 2
def calculate_prev_dis(box,i, j):
    """
    Given a single box, get the distance between a given box to the previous box
    Constraint: if box is at the position index 0, then distance = 0
    need debug
    """
    lst_bor = get_boxes_on_row(box, i)
    box = x_coords[i][j] # convert
    lst_sorted = lst_bor.copy()
    lst_sorted.sort()
    lst_index = 0
    distance = 0
    for boxes in lst_bor:
        if box == boxes:
            single_ix = lst_sorted.index(boxes)
            if single_ix != 0:
                distance += lst_sorted[single_ix][0] - lst_sorted[single_ix -1][1]
            else:
                distance = 0
    return distance

def get_all_prev_dis():
    lst = []
    for i in range(0,len(y_coords)): # Loop over the 2D lists(files) inside // i = 0 - ...
        for j in range(0,len(y_coords[i])): # Loop over the 1D list in the 2D lists // j = 0 - len(y_coords[0]) = 0 - 9
            lst.append(calculate_prev_dis(y_coords[i][j], i, j))
    return lst

def test():
    print(get_boxes_on_row(y_coords[0][0], 0))
    a = get_boxes_on_row(y_coords[0][0], 0)
    a.sort()
    print(a)
    print(x_coords[0])
    print('true pos: ', get_pos_in_row(y_coords[0][0], 0,0)) # True position on a line in picture
    print(get_all_prev_dis())

In [9]:
def get_features(df,X_df):
    x_centerDVwidth = get_center_division(df, X_df, 'x')
    y_centerDVheight = get_center_division(df,X_df, 'y')

    x_subtractDVwidth = get_subtracted_division(df,X_df, 'x')
    y_subtractDVheight = get_subtracted_division(df,X_df, 'y')
    
    return x_centerDVwidth, y_centerDVheight, x_subtractDVwidth, y_subtractDVheight
def create_newdf(df, X_df):
    x_centerDVwidth, y_centerDVheight, x_subtractDVwidth, y_subtractDVheight = get_features(df, X_df)
    df = df.assign(x_centerDVwidth =x_centerDVwidth )
    df = df.assign(y_centerDVheight =y_centerDVheight )
    df = df.assign(x_subtractDVwidth =x_subtractDVwidth)
    df = df.assign(y_subtractDVheight =y_subtractDVheight )

#     df = df.drop(columns = 'xmin')
#     df = df.drop(columns = 'ymin')
#     df = df.drop(columns = 'xmax')
#     df = df.drop(columns = 'ymax')
#     df = df.drop(columns = 'filename')
#     df = df.drop(columns = 'width')
#     df = df.drop(columns = 'height')

    df = df[['x_centerDVwidth','y_centerDVheight','x_subtractDVwidth','y_subtractDVheight','count', 'Name']]
    return df

In [10]:
new_df_cl = create_newdf(df_classification, X_classification)
new_df_cl = new_df_cl.assign(NumBoxInRow = create_list_sameline())
new_df_cl = new_df_cl.assign(PosBoxOnRow = create_list_posline())
new_df_cl = new_df_cl.assign(PrevDis = get_all_prev_dis())

In [11]:
new_df_dt = create_newdf(df_detection, X_detection)

In [12]:
def get_new_XY(df):
    X = df.iloc[:,:4].values
    Y = df.iloc[:,-1].values
    return X,Y

In [13]:
newX_cl,newY_cl = get_new_XY(new_df_cl)
newX_cl = newX_cl.tolist()

In [14]:
newX_dt,newY_dt = get_new_XY(new_df_dt)
newX_dt = newX_dt.tolist()

In [15]:
def get_detected_class(df_cl,df_dt, X_cl, X_dt, Y_cl, Y_dt):
    """
    Get the detected class from df2 with if equal coordinates
    :param: df1 the classification dataframe
    :param: df2 the detection dataframe
    :return: the list of featured_class taken from df2
    """
    result_featured = []
    pos = 0
    for i in range(0,len(X_cl)):
        if X_cl[i] in X_dt:
            position_of_X_dt = X_dt.index(X_cl[i]) # The position of the desired ROW in X_dt
            result_featured.append(Y_dt[position_of_X_dt]) # Getting the desired feature from the specific position
        else:
            result_featured.append("NaN")
    return result_featured

a = get_detected_class(new_df_cl,new_df_dt,newX_cl,newX_dt,newY_cl,newY_dt)

In [16]:
new_df_cl = new_df_cl.assign(FeaturedClass = a)


new_df_cl = new_df_cl[['x_centerDVwidth','y_centerDVheight','x_subtractDVwidth','y_subtractDVheight','NumBoxInRow','PosBoxOnRow','count','PrevDis','Name','FeaturedClass']]

new_df_cl

Unnamed: 0,x_centerDVwidth,y_centerDVheight,x_subtractDVwidth,y_subtractDVheight,NumBoxInRow,PosBoxOnRow,count,PrevDis,Name,FeaturedClass
0,0.040761,0.154762,0.081522,0.309524,4,1,9,39,product_name,text
1,0.051178,0.150794,0.102355,0.301587,4,3,9,14,product_name,text
2,0.108696,0.182540,0.217391,0.365079,3,2,9,221,product_total_money,price
3,0.035326,0.174603,0.070652,0.349206,1,0,9,0,vat,text
4,0.059330,0.146825,0.118659,0.293651,3,1,9,18,product_name,text
...,...,...,...,...,...,...,...,...,...,...
17193,0.032425,0.147577,0.064851,0.295154,2,1,14,780,vat,text
17194,0.022698,0.147577,0.045396,0.295154,3,1,14,371,product_quantity,text
17195,0.110246,0.189427,0.220493,0.378855,3,2,14,280,product_total_money,price
17196,0.107977,0.147577,0.215953,0.295154,3,0,14,0,product_unit_price,price


In [17]:
new_df_cl.isnull().any()

x_centerDVwidth       False
y_centerDVheight      False
x_subtractDVwidth     False
y_subtractDVheight    False
NumBoxInRow           False
PosBoxOnRow           False
count                 False
PrevDis               False
Name                  False
FeaturedClass         False
dtype: bool

In [27]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
X = new_df_cl.iloc[:,:9].values
lines = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [8])], remainder='passthrough')
X = np.array(lines.fit_transform(X))

from sklearn.preprocessing import LabelEncoder
Y = LabelEncoder().fit_transform(new_df_cl.FeaturedClass)


In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1)

In [38]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 19:23] = sc.fit_transform(X_train[:, 19:23])
X_test[:, 19:23] = sc.transform(X_test[:, 19:23])
X_train

array([[0.0, 0.0, 0.0, ..., 0.6334584254969565, 0.23113012965149335,
        -0.3727404946733261],
       [0.0, 0.0, 0.0, ..., 0.1154168136673123, -0.07185849070619793,
        -0.41357650016912945],
       [0.0, 0.0, 0.0, ..., 0.6334584254969565, 0.8371073703668759,
        -0.3727404946733261],
       ...,
       [0.0, 0.0, 1.0, ..., -0.9206664099919761, -0.3748471110638892,
        -0.46462150703888366],
       [0.0, 0.0, 0.0, ..., -0.9206664099919761, -0.07185849070619793,
        -0.46462150703888366],
       [0.0, 0.0, 0.0, ..., 0.6334584254969565, -0.07185849070619793,
        -0.4288900022300557]], dtype=object)

In [36]:
from sklearn.ensemble import RandomForestClassifier
classifier =  RandomForestClassifier(n_estimators=200, max_depth=40, min_samples_split= 8, max_features= 4,
                                 class_weight='balanced', min_samples_leaf=1, random_state = 2)
classifier.fit(X_train,y_train)

RandomForestClassifier(class_weight='balanced', max_depth=40, max_features=4,
                       min_samples_split=8, n_estimators=200, random_state=2)

In [37]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred)*100,'%', end='')


98.64341085271317 %