# Overview

We are trying to find the best porforming classification models. We will be training the models to determine the if any particular page on a PDF is a map (aka alignment sheet) or not. 


In this code we will Once the necessary libraries are imported, the following actions are performed:

- <strong>Load labelled data: </strong>
Here we generate the features using "extract_features" function.  

- <strong>Train test split: </strong>
Split the dataset into test and train set. 

- <strong>Prepare validation set: </strong>
Create validation dataframe.

- <strong>Implement classification models: </strong>
Train various classification models and then get accuracy score and confusion matric for test and validation set.  

- <strong>Compare models: </strong>
Compare the accuracy score and the confusion matrix and save the best model for future use. 

In [36]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import xgboost
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import random

In [4]:
from feature_extraction import extract_features

path = os.getcwd()
path = os.path.abspath('..')

### Load labeled data 

Here we have the PDF names (or Data IDs) and we also have manually marked which pages on these PDFs are maps. First, we create the dataframe of the features by extracting the features of each page on these PDFs using the function "extract_features". Then by using the marked values of each page being map or not we create the dataframe for the dependent variable. The variable "dataID_pageNo" is onnly used to identify a certain page. 

In [7]:
path_pdf = path + "\\TrainingSet\\"

DataIDHand = [268712,  486221, 500633, 
               555093, 684494, 
              895015, 2392922, 2445549, 2758927,
              2813701,  
              2967854, 2968069,  
              3891802,
              4036098]
Pages = [[3,4,5,8,9,10,14,15,24,25,26], range(1,5),  [5,9], 
          [6,9,33,34], [12,13,14],
         range(1,11), [1], range(1,4),  [9],
         [40, 92, 95, 143, 170, 180, 216, 217, 218, 219],
         [3,4],range(1,13),  
         [33, 34, 35, 89, 90, 91, 92, 93, 100, 146, 147, 148, 149, 153, 154, 159, 160, 161, 162, 165, 166, 169, 170, 173, 174, 177, 178, 181, 182, 184, 185, 188, 189], 
          []]

print("Number of PDFs: ", len(DataIDHand), ". Len of Pages array:" len(Pages))

14
14


In [8]:
# # No need to run this code if the features are alredy saved
# # Fetching features for each page of the PDF Files and saving them 
# X_df, dataIDs, error_files = extract_features(DataIDHand, path_pdf) 

# print("\n Number of Error files", len(error_files))
# X_df.to_csv(path + "\\data\\features_test_train.csv")
# dataIDs.to_csv(path + "\\data\\dataIDs.csv")

File Starting: 268712. PDF 1 out of 14
File Starting: 486221. PDF 2 out of 14
File Starting: 500633. PDF 3 out of 14
File Starting: 555093. PDF 4 out of 14
File Starting: 684494. PDF 5 out of 14
File Starting: 895015. PDF 6 out of 14
File Starting: 2392922. PDF 7 out of 14
File Starting: 2445549. PDF 8 out of 14
File Starting: 2758927. PDF 9 out of 14


mupdf: invalid page object


File Starting: 2813701. PDF 10 out of 14
File Starting: 2967854. PDF 11 out of 14
File Starting: 2968069. PDF 12 out of 14
File Starting: 3891802. PDF 13 out of 14
File Starting: 4036098. PDF 14 out of 14


In [46]:
X_df = pd.read_csv(path + "\\data\\features_test_train.csv", index_col = 0)
dataIDs = pd.read_csv(path + "\\data\\dataIDs.csv", index_col = 0)

# Keepin only the features (remove index)
X_df_features = X_df.copy()
X_df_features.drop(columns=['dataID_pageNo'], inplace=True)
X_df_features.head()

Unnamed: 0,scale,km_kilometers,m,metres,scale_grp,legend,figure,mapp,alignment_sheet,sheet,figure_grp,north,n,words_in_page,No_of_images,Area_of_images
0,0,0,171,0,0,0,0,0,0,0,0,0,0,1182,1,79376
1,0,0,195,0,0,0,1,0,0,0,1,0,0,1747,1,79376
2,2,1,87,1,1,0,1,0,0,0,1,0,0,755,2,3433
3,2,1,99,1,1,0,1,0,0,0,1,0,0,857,2,3433
4,2,1,89,1,1,0,1,0,0,0,1,0,0,775,2,3433


In [16]:
def get_Y_values(dataIDs, Pages):
    Y_class = []
    dataID_pageNo = []
    j = 0
    for index, row in dataIDs.iterrows():
        #print(row['DataIDs'])
        #print(row['Page_no'])
        for i in range(1,row['Page_no']+1):
            if i in Pages[j]:
                Y_class.append(1)
            else:
                Y_class.append(0)
            dataID_pageNo.append(str(row['DataIDs']) + "_" +str(i))
        j = j+1
    
    Y_df = pd.DataFrame({'dataID_pageNo' : dataID_pageNo, 
                         'Y_class' : Y_class})
    Y_dfclass = pd.DataFrame({'Y_class' : Y_class})
    
    return Y_df, Y_dfclass
    
                
Y_df, Y_dfclass = get_Y_values(dataIDs, Pages)

In [17]:
print(len(Y_df))
print(len(X_df))
print(len(Y_dfclass))

994
994
994


### Train test split

We set the seed value to get the same results when rerunning the code. Then we split the dataset randomly into train set and test set. (train set = 0.75, test set =0.25)

In [18]:
random.seed(19)
X_train, X_test, y_train, y_test = train_test_split(X_df_features,
                                                    Y_dfclass,
                                                    test_size = 0.25,
                                                    random_state = 8)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(745, 16)
(249, 16)
(745, 1)
(249, 1)


In [19]:
print("Training Set: ", len(y_train))
print("Alignment Sheets in Training Set: ", len(y_train[y_train.Y_class > 0]))
print()
print("Test Set: ", len(y_test))
print("Alignment Sheets in Training Set: ", len(y_test[y_test.Y_class > 0]))

Training Set:  745
Alignment Sheets in Training Set:  67

Test Set:  249
Alignment Sheets in Training Set:  29


### Prepare validation set 

In the validation set we make use of new PDFs representing a new real world problem. For validation sets we would except slighly lower level of accuracy. The model which performs better on validation set typically performs better overall. 

We extract features using 'extract_features' function as earlier. Then we create validation dataframes for features and dependent variable. 

In [20]:
DataIDHand = [2968356, 3410189, 3970828]
Pages = [[9,18, 26], 
         [], 
         [29, 35, 51, 59, 100, 101, 108, 109, 165, 179, 225, 231, 293, 294]]

print(len(DataIDHand))
print(len(Pages))

3
3


In [23]:
path_pdf = (path + "\\ValidationSet\\")

# #fetching featuresfor the pages of the PDF Files
X_df_valid, dataIDs_valid, error_files = extract_features(DataIDHand, path_pdf) 
# #Features
# #dataIDs
# #error_files

File Starting: 2968356. PDF 1 out of 3


mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object


File Starting: 3410189. PDF 2 out of 3
File Starting: 3970828. PDF 3 out of 3


In [28]:
X_df_valid.to_csv(path + "\\data\\features_valid.csv")
dataIDs_valid.to_csv(path + "\\data\\dataIDs_valid.csv")
print(len(error_files))

0


In [29]:
X_df_valid = pd.read_csv(path + "\\data\\features_valid.csv", index_col = 0)
dataIDs_valid = pd.read_csv(path + "\\data\\dataIDs_valid.csv", index_col = 0)
X_df_valid.head()

Unnamed: 0,scale,km_kilometers,m,metres,scale_grp,legend,figure,mapp,alignment_sheet,sheet,figure_grp,north,n,words_in_page,No_of_images,Area_of_images,dataID_pageNo
0,0,0,0,0,0,0,0,0,0,0,0,0,0,424,0,0,2968356_1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,23,0,0,2968356_2
2,0,0,0,0,0,0,0,0,0,0,0,0,0,2243,0,0,2968356_3
3,0,0,280,6,1,0,5,0,0,0,1,0,0,2336,0,0,2968356_4
4,0,1,216,7,1,0,0,0,0,0,0,1,0,1149,0,0,2968356_5


In [30]:
X_df_features_valid = X_df_valid.copy()
X_df_features_valid.drop(columns=['dataID_pageNo'], inplace=True)
X_df_features_valid.head()

Unnamed: 0,scale,km_kilometers,m,metres,scale_grp,legend,figure,mapp,alignment_sheet,sheet,figure_grp,north,n,words_in_page,No_of_images,Area_of_images
0,0,0,0,0,0,0,0,0,0,0,0,0,0,424,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,23,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,2243,0,0
3,0,0,280,6,1,0,5,0,0,0,1,0,0,2336,0,0
4,0,1,216,7,1,0,0,0,0,0,0,1,0,1149,0,0


In [31]:
Y_df_valid, Y_dfclass_valid = get_Y_values(dataIDs_valid, Pages)

print(len(Y_df_valid))
print(len(X_df_features_valid))
print(len(X_df_valid))
print(len(Y_dfclass_valid))

555
555
555
555


### Implement classification models

In this section we are using regression models as classification models, hence they are collectively reffered to as classification models. <br>

First we save the classification models and their names in an array. Then for each of these models first we fit the model using the training dataset and then generate the confusion matrix and accuracy score for each of these models. 

In [32]:
classifiers = []
name = []
# we will create an array of Classifiers and append different classification models to our array.
model1 = xgboost.XGBClassifier()
classifiers.append(model1)
name.append("xgboost")

model2 = svm.SVC()
classifiers.append(model2)
name.append("svc")

model3 = tree.DecisionTreeClassifier()
classifiers.append(model3)
name.append("decisiontree")

model4 = RandomForestClassifier()
classifiers.append(model4)
name.append("rfc")


model5 = RandomForestRegressor(n_estimators=5)
classifiers.append(model5)
name.append("rfr5")

model6 = RandomForestRegressor(n_estimators=25)
classifiers.append(model6)
name.append("rfr25")

model7 = RandomForestRegressor(n_estimators=50)
classifiers.append(model7)
name.append("rfr50")

model8 = RandomForestRegressor(n_estimators=75)
classifiers.append(model8)
name.append("rfr75")

model9 = RandomForestRegressor(n_estimators=100)
classifiers.append(model9)
name.append("rfr100")


model10 = XGBRegressor(n_estimators=5)
classifiers.append(model10)
name.append("xgbr5")

model11 = XGBRegressor(n_estimators=25)
classifiers.append(model11)
name.append("xgbr25")

model12 = XGBRegressor(n_estimators=50)
classifiers.append(model12)
name.append("xgbr50")

model13 = XGBRegressor(n_estimators=75)
classifiers.append(model13)
name.append("xgbr75")

model14 = XGBRegressor(n_estimators=100)
classifiers.append(model14)
name.append("xgbr100")

In [33]:
i = 0
random.seed(10)
test_accuracy = []
valid_accuracy = []
cm_test = []
cm_valid = []
for clf in classifiers:
    print("________________________________________________________")
    print("________________________________________________________")
    #fit our algorithms in our Train dataset 
    clf.fit(X_train, y_train)
    
    #get test dataset prediction
    if "rfr" or "xgbr" in name[i]:
        y_pred_nb = clf.predict(X_test)
        #y_pred.shape
        #y_pred
        y_pred = []
        for y in y_pred_nb:
            if y > 0.50:
                y_pred.append(1)
            else:
                y_pred.append(0)
    else:
        y_pred= clf.predict(X_test)
        
    print(name[i])
    acc = accuracy_score(y_test, y_pred)
    test_accuracy.append(acc)
    print("Accuracy of %s is %s"%(clf, acc))
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix of %s is %s"%(clf, cm))
    cm_test.append(cm)
    
    
    print("________________Validation Set ___________________________")
    #get validation accuracy
    if "rfr" or "xgbr" in name[i]:
        y_pred_nb = clf.predict(X_df_features_valid)
        #y_pred.shape
        #y_pred
        y_pred = []
        for y in y_pred_nb:
            if y > 0.50:
                y_pred.append(1)
            else:
                y_pred.append(0)
    else:
        y_pred= clf.predict(X_df_features_valid)
        
    print(name[i])
    acc = accuracy_score(Y_dfclass_valid["Y_class"], y_pred)
    valid_accuracy.append(acc)
    print("Accuracy of %s is %s"%(clf, acc))
    cm = confusion_matrix(Y_dfclass_valid["Y_class"], y_pred)
    print("Confusion Matrix of %s is %s"%(clf, cm))
    cm_valid.append(cm)
    i = i +1

________________________________________________________
________________________________________________________
xgboost
Accuracy of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=24, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None) is 1.0
Confusion Matrix of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_de

  return f(**kwargs)
  # This is added back by InteractiveShellApp.init_path()



Accuracy of DecisionTreeClassifier() is 0.9919678714859438
Confusion Matrix of DecisionTreeClassifier() is [[220   0]
 [  2  27]]
________________Validation Set ___________________________
decisiontree
Accuracy of DecisionTreeClassifier() is 0.972972972972973
Confusion Matrix of DecisionTreeClassifier() is [[523  15]
 [  0  17]]
________________________________________________________
________________________________________________________
rfc
Accuracy of RandomForestClassifier() is 1.0
Confusion Matrix of RandomForestClassifier() is [[220   0]
 [  0  29]]
________________Validation Set ___________________________
rfc
Accuracy of RandomForestClassifier() is 0.9963963963963964
Confusion Matrix of RandomForestClassifier() is [[536   2]
 [  0  17]]
________________________________________________________
________________________________________________________
rfr5
Accuracy of RandomForestRegressor(n_estimators=5) is 1.0
Confusion Matrix of RandomForestRegressor(n_estimators=5) is [[220

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


rfr50
Accuracy of RandomForestRegressor(n_estimators=50) is 0.9919678714859438
Confusion Matrix of RandomForestRegressor(n_estimators=50) is [[220   0]
 [  2  27]]
________________Validation Set ___________________________
rfr50
Accuracy of RandomForestRegressor(n_estimators=50) is 0.972972972972973
Confusion Matrix of RandomForestRegressor(n_estimators=50) is [[523  15]
 [  0  17]]
________________________________________________________
________________________________________________________


  # This is added back by InteractiveShellApp.init_path()


rfr75
Accuracy of RandomForestRegressor(n_estimators=75) is 0.9919678714859438
Confusion Matrix of RandomForestRegressor(n_estimators=75) is [[220   0]
 [  2  27]]
________________Validation Set ___________________________
rfr75
Accuracy of RandomForestRegressor(n_estimators=75) is 0.972972972972973
Confusion Matrix of RandomForestRegressor(n_estimators=75) is [[523  15]
 [  0  17]]
________________________________________________________
________________________________________________________


  # This is added back by InteractiveShellApp.init_path()


rfr100
Accuracy of RandomForestRegressor() is 0.9919678714859438
Confusion Matrix of RandomForestRegressor() is [[220   0]
 [  2  27]]
________________Validation Set ___________________________
rfr100
Accuracy of RandomForestRegressor() is 0.972972972972973
Confusion Matrix of RandomForestRegressor() is [[523  15]
 [  0  17]]
________________________________________________________
________________________________________________________
xgbr5
Accuracy of XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=5, n_jobs=24, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', valida

  "because it will generate extra copies and increase " +


xgbr50
Accuracy of XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=50, n_jobs=24, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None) is 0.9919678714859438
Confusion Matrix of XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_

### Compare classification models

Here we save the accuracy score and the confusion matric for the test and validation sets. Then we save the pickled version of the best classification model for future use. 

In [34]:
classification_models = pd.DataFrame({'name': name, 
                                     'test_accuracy': test_accuracy,
                                     'test_cm': cm_test, 
                                     'valid_accuracy':valid_accuracy,
                                     'valid_cm': cm_valid})
classification_models["product"] = classification_models["test_accuracy"]*classification_models["valid_accuracy"]

classification_models = classification_models.sort_values(by=['product'])
classification_models.head(15)

Unnamed: 0,name,test_accuracy,test_cm,valid_accuracy,valid_cm,product
1,svc,0.939759,"[[220, 0], [15, 14]]",0.969369,"[[538, 0], [17, 0]]",0.910974
9,xgbr5,0.991968,"[[220, 0], [2, 27]]",0.969369,"[[521, 17], [0, 17]]",0.961583
10,xgbr25,0.991968,"[[220, 0], [2, 27]]",0.969369,"[[521, 17], [0, 17]]",0.961583
11,xgbr50,0.991968,"[[220, 0], [2, 27]]",0.969369,"[[521, 17], [0, 17]]",0.961583
12,xgbr75,0.991968,"[[220, 0], [2, 27]]",0.969369,"[[521, 17], [0, 17]]",0.961583
13,xgbr100,0.991968,"[[220, 0], [2, 27]]",0.969369,"[[521, 17], [0, 17]]",0.961583
2,decisiontree,0.991968,"[[220, 0], [2, 27]]",0.972973,"[[523, 15], [0, 17]]",0.965158
5,rfr25,0.991968,"[[220, 0], [2, 27]]",0.972973,"[[523, 15], [0, 17]]",0.965158
6,rfr50,0.991968,"[[220, 0], [2, 27]]",0.972973,"[[523, 15], [0, 17]]",0.965158
7,rfr75,0.991968,"[[220, 0], [2, 27]]",0.972973,"[[523, 15], [0, 17]]",0.965158


In [37]:
i = 0
random.seed(10)

for clf in classifiers:
    
    if name[i] != "rfc":
        i = i +1
        continue
    print(name[i])
    clf.fit(X_train, y_train)
    filename = path + "\\data\\alignment_sheet_classifier_rfc.sav"
    pickle.dump(clf, open(filename, 'wb'))
    
    filename = "alignment_sheet_classifier_rfr50.sav"
    pickle.dump(clf, open(filename, 'wb'))
    i = i +1

rfc


  # Remove the CWD from sys.path while we load stuff.


### Further analysis 

Here we try to observe the features and their importance score for classification model. 

In [39]:
f_importance = clf.feature_importances_
feature = []

for col in X_df_features:
    feature.append(col)
    
df_f_importance = pd.DataFrame({'Feature_Name' :  feature, 
                                'Importance':  f_importance})
df_f_importance

Unnamed: 0,Feature_Name,Importance
0,scale,0.1084597
1,km_kilometers,0.06070672
2,m,0.01152203
3,metres,0.0
4,scale_grp,0.0
5,legend,0.0
6,figure,1.440063e-05
7,mapp,0.0
8,alignment_sheet,0.0
9,sheet,0.0


In [42]:
df_f_importance = df_f_importance.sort_values(by=['Importance'])
df_f_importance

Unnamed: 0,Feature_Name,Importance
3,metres,0.0
4,scale_grp,0.0
5,legend,0.0
7,mapp,0.0
8,alignment_sheet,0.0
9,sheet,0.0
12,n,0.0
11,north,3.803738e-07
13,words_in_page,1.236311e-05
6,figure,1.440063e-05
