Import modules

In [20]:
import numpy as np
import pickle
import plotly.express as px
import gen_features as gf

Processing on data of classification

In [21]:
df = gf.new_df_cl
df

Unnamed: 0,Name,x_centerDVwidth,y_centerDVheight,x_subtractDVwidth,y_subtractDVheight,NumBoxInLine,PosBoxOnLine,count,PrevDis,NextDis,FeaturedClass
0,product_name,0.040761,0.154762,0.081522,0.309524,4,1,9,39,18,text
1,product_name,0.051178,0.150794,0.102355,0.301587,4,3,9,14,0,text
2,product_total_money,0.108696,0.182540,0.217391,0.365079,3,2,9,221,0,price
3,vat,0.035326,0.174603,0.070652,0.349206,1,0,9,0,0,text
4,product_name,0.059330,0.146825,0.118659,0.293651,3,1,9,18,14,text
...,...,...,...,...,...,...,...,...,...,...,...
17193,vat,0.032425,0.147577,0.064851,0.295154,2,1,14,780,0,text
17194,product_quantity,0.022698,0.147577,0.045396,0.295154,3,1,14,371,280,text
17195,product_total_money,0.110246,0.189427,0.220493,0.378855,3,2,14,280,0,price
17196,product_unit_price,0.107977,0.147577,0.215953,0.295154,3,0,14,0,371,price


Checking for missing data

In [22]:
df.isnull().any()

Name                  False
x_centerDVwidth       False
y_centerDVheight      False
x_subtractDVwidth     False
y_subtractDVheight    False
NumBoxInLine          False
PosBoxOnLine          False
count                 False
PrevDis               False
NextDis               False
FeaturedClass         False
dtype: bool

In [23]:
fig = px.scatter(df, x='x_centerDVwidth', y='y_centerDVheight', color = 'PosBoxOnLine', template= 'plotly_dark',labels={
                     "x_centerDVwidth": "X",
                     "y_centerDVheight": "Y"
                 },title="Distribution of boxes")
fig.show()

In [24]:
fig = px.scatter_3d(df, x='NumBoxInLine', y='PosBoxOnLine', z='count', color = 'count', template= 'plotly_dark',
                    title = 'Relationship between positions, total boxes on line, total boxes in images ', labels={
                     "NumBoxInLine": "Total boxes on line",
                     "PosBoxOnLine": "Position of a box on line",
                     "count": "Total boxes in an image" }  )
fig.show()

Generate training data

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
X = df.iloc[:,:10].values
lines = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(lines.fit_transform(X))

from sklearn.preprocessing import LabelEncoder
Y = LabelEncoder().fit_transform(df.FeaturedClass)

Train test split

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.31, random_state = 1)

Feature scaling

In [16]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, :24] = sc.fit_transform(X_train[:, :24])
X_test[:, :24] = sc.transform(X_test[:, :24])
X_train

array([[-0.08887874268469949, -0.18336764540069436, 2.5130976263188494,
        ..., 1.445203400643282, -0.46505134585076663,
        -0.3473710187228068],
       [-0.08887874268469949, 5.453524790673502, -0.3979153016291657,
        ..., -2.801872785534867, -0.46505134585076663,
        -0.1508531929824561],
       [-0.08887874268469949, -0.18336764540069436, 2.5130976263188494,
        ..., 1.1418408159162712, 1.4547602385146035, 0.8317359357192973],
       ...,
       [-0.08887874268469949, -0.18336764540069436, 2.5130976263188494,
        ..., -0.374972107718782, -0.46505134585076663,
        -0.28531275796269606],
       [-0.08887874268469949, -0.18336764540069436, -0.3979153016291657,
        ..., -0.07160952299177131, -0.46505134585076663,
        0.35078441482843903],
       [-0.08887874268469949, -0.18336764540069436, -0.3979153016291657,
        ..., -0.07160952299177131, -0.42931017273758154,
        -0.3628855839128345]], dtype=object)

Train data using different Machine Learning Classification Algorithms

In [17]:
from sklearn.ensemble import RandomForestClassifier
classifier =  RandomForestClassifier(n_estimators=200, max_depth= 20, min_samples_split= 8, max_features= 2,
                                 class_weight='balanced', min_samples_leaf=1, random_state = 3)
classifier.fit(X_train,y_train)
from sklearn.metrics import accuracy_score, classification_report
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred)*100,'%', end='')

98.66841710427607 %

In [18]:
from xgboost import XGBClassifier
model_xgb = XGBClassifier(objective='multi:softprob',learning_rate = 0.05,
                          max_depth= 20,
                          n_estimators = 200,
                          colsample_bytree = 0.5,random_state = 2, use_label_encoder = False)
model_xgb.fit(X_train,y_train)
y_pred = model_xgb.predict(X_test)
print(accuracy_score(y_test,y_pred)*100, '%')

98.70592648162041 %


In [19]:
from sklearn.ensemble import GradientBoostingClassifier
model_GB= GradientBoostingClassifier(random_state=2,n_estimators=100,learning_rate=0.05,loss="deviance")
model_GB.fit(X_train,y_train)
y_pred= model_GB.predict(X_test)
print(accuracy_score(y_test, y_pred)*100, '%')

98.74343585896474 %
