# Example Code

The purpose of this code is to illustrate the functionality of the classification system described in *"A Machine Learning Approach to Classifying Construction Cost Documents into the International Construction Measurement Standard"* by J. I Deza, H. Ihshaish and L. Mahdjoubi.


## Instructions 

1. Run the first cell of this Jupyter notebook.
2. Edit the BoQ_text string (some examples are provided)
3. Run predict on that string.
4. The result is a dictionary containing the ICMS number and its definition.

In [43]:
import re,joblib  
import pandas as pd
## for processing
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
from nltk.corpus import stopwords
import warnings
warnings.simplefilter('ignore')

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Set of NLTK English stop words. Not referenced by the functions defined in
# this cell (predict() uses lst_stopwords below).
stop_words = set(stopwords.words('english'))
# Stop-word list handed to utils_preprocess_text() by predict(): the NLTK
# English stop words plus a few extra tokens (presumably unit abbreviations
# and placeholders that are common in BoQ text -- confirm with the authors).
lst_stopwords = nltk.corpus.stopwords.words("english") + ['x', 'mm', 'cm', 'ref', 'wb', 'xx', 'per', 'm']
# Passed as max_features to the CountVectorizer built in predict().
max_features = 6000
# NOTE(review): joblib.load unpickles its input; only load artefact files that
# come from a trusted source.
# Used as the fixed vocabulary of the CountVectorizer built in predict().
accepted_words = joblib.load('accepted_words.joblib')
# Trained classifier (a random forest, judging by the file name); the example
# cell passes it to predict().
clf = joblib.load('Random_Forests_trained_model.joblib.gz')
# Indexed by ICMS code in predict() to obtain the code's description.
icms_dct = joblib.load('icms1_dictionnary.joblib') 
# Indexed by the classifier's predicted label in predict() to obtain the ICMS code.
Cat = joblib.load('Categories_ML.joblib')

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise a free-text description into a space-separated token string.

    Steps, in order: lowercase and strip, remove punctuation, remove words of
    one or two characters, remove stop words, then optionally stem and/or
    lemmatise each remaining token.

    Args:
        text: Input value; it is converted with ``str()`` before processing.
        flg_stemm: When truthy, apply NLTK's Porter stemmer to every token.
        flg_lemm: When truthy, apply NLTK's WordNet lemmatiser to every token.
        lst_stopwords: Optional iterable of words to discard. ``None`` keeps
            every token.

    Returns:
        The remaining tokens joined by single spaces (possibly an empty string).
    """
    # Lowercase, trim, and drop every character that is neither a word
    # character nor whitespace.
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    # Drop words of one or two characters, together with any non-word
    # characters immediately preceding them.
    text_big = re.sub(r'\W*\b\w{1,2}\b', '', text)

    # Tokenise on whitespace.
    tokens = text_big.split()

    if lst_stopwords is not None:
        # Build the set once so each membership test is O(1) instead of a
        # linear scan of the stop-word list per token.
        stopword_set = set(lst_stopwords)
        tokens = [word for word in tokens if word not in stopword_set]

    # Stemming (remove -ing, -ly, ...).
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        tokens = [ps.stem(word) for word in tokens]

    # Lemmatisation (convert each word into its root word).
    if flg_lemm:
        lem = WordNetLemmatizer()
        tokens = [lem.lemmatize(word) for word in tokens]

    # Back to a single string.
    return " ".join(tokens)

def clean_text_data(df, col):
    """Clean the text column ``col`` of ``df`` and return it as a new Series.

    For every value the function:
      1. splits on "|" and keeps the segment right after the first "|"
         (the whole value when there is no "|"; anything after a second "|"
         is discarded),
      2. removes digits and every word that contains a digit,
      3. removes punctuation,
      4. collapses runs of spaces into a single space.

    Leading/trailing spaces are not stripped.

    Args:
        df: DataFrame holding the text column.
        col: Name of the column to clean; its values must be strings.

    Returns:
        A pandas Series of cleaned strings, aligned with ``df[col]``.
    """
    def _clean(parts):
        # `parts` is the list produced by splitting one value on "|".
        text = parts[1] if len(parts) > 1 else parts[0]
        # Raw strings keep the regex escapes (\w, \d, \s) out of Python's own
        # string-escape processing, which warns about them otherwise.
        text = re.sub(r"\w*\d\w*", "", text)
        text = re.sub(r"[^\w\s]", "", text)
        return re.sub(r" +", " ", text)

    return df[col].str.split("|").apply(_clean)


def goodCode(name,code,desc):
    """Assemble the success-result dictionary for one classified description.

    Args:
        name: The original description text; stored under "Message".
        code: Dot-separated ICMS code with at least three parts,
            e.g. "1.01.060".
        desc: Backslash-separated description with at least three parts.

    Returns:
        A dict with the keys 'success', 'Message', 'ICMS', 'Description',
        'R2'..'R4' (the first three code parts) and 'Desc2'..'Desc4' (the
        first three description parts, whitespace-stripped).

    Raises:
        IndexError: If ``code`` or ``desc`` has fewer than three parts.
    """
    result = {
        'success': 'true',
        "Message": name,
        'ICMS': code,
        "Description": desc,
    }
    levels = code.split('.')
    labels = [part.strip() for part in desc.split('\\')]

    # Explicit indexing (rather than zip/unpacking) so that short inputs
    # raise IndexError and extra parts are ignored.
    for position, key in enumerate(('R2', 'R3', 'R4')):
        result[key] = levels[position]
    for position, key in enumerate(('Desc2', 'Desc3', 'Desc4')):
        result[key] = labels[position]
    return result

def predict(names,clf):
    """Classify one or more BoQ descriptions into ICMS codes.

    Each description is preprocessed with ``utils_preprocess_text`` and
    ``clean_text_data``, turned into word counts over the fixed
    ``accepted_words`` vocabulary, and passed to ``clf``.

    Args:
        names: A single description string, or a sequence of description
            strings.
        clf: Fitted classifier exposing ``predict(X)``; its predicted labels
            are used to index ``Cat``.

    Returns:
        A list with one dict per input description, in input order. A
        description containing at least one vocabulary word yields the dict
        built by ``goodCode``; a description with none yields a failure dict
        (``'success': 'false'``, ``'ICMS': 'XX.XX.XXX'``). An empty input
        yields an empty list.
    """
    # isinstance also accepts str subclasses, unlike an exact type comparison.
    if isinstance(names, str):
        names = [names]
    if len(names) == 0:
        return []

    frame = pd.DataFrame()
    frame['comment_list'] = names
    frame['comment_list'] = frame.comment_list.apply(
        lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords)
    )
    frame['comment_list_new'] = clean_text_data(frame, 'comment_list')
    # NOTE(review): per the scikit-learn docs, max_features is ignored when a
    # fixed vocabulary is supplied; it is kept to mirror the training setup.
    vectorizer = CountVectorizer(max_features=max_features,strip_accents='ascii',vocabulary=accepted_words)
    X = vectorizer.fit_transform(frame['comment_list_new']).toarray()

    # Decide per row whether any vocabulary word is present, so that a batch
    # returns exactly one result per input and rows with no known words are
    # never sent to the classifier.
    known = X.sum(axis=1) > 0
    labels = iter(clf.predict(X[known])) if known.any() else iter(())

    answer = []
    for name, is_known in zip(names, known):
        if not is_known:
            answer.append({'success' : 'false' , "Message" : name ,'ICMS' : 'XX.XX.XXX' ,"Description" : 'No ICMS Code'})
            continue
        code = Cat[next(labels)]
        answer.append(goodCode(name, code, icms_dct[code]))
    return answer

[nltk_data] Downloading package punkt to /Users/ignacio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ignacio/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/ignacio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ignacio/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ignacio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
# Example BoQ item descriptions. Every assignment overwrites the previous one,
# so only the last string is classified; edit or reorder the lines to try a
# different example.
BoQ_text = 'Clear site of all signs and equipment (including diversion routes)'
BoQ_text = 'Geophysical Survey in accordance with drawing XX'
BoQ_text = 'Termination of optic fibre cable to XX equipment cabinet Type YY'
BoQ_text = 'Power reduction joint of XX mm2 to XX mm2'
BoQ_text = 'Take down and remove to tip off Site unlit traffic sign including 4 posts'
BoQ_text = 'Galvanised high adherence reinforcing strips acting as soil reinforcement'
BoQ_text = 'Installation of wildlife tunnel XX m in length as per diagram XX'
BoQ_text = 'Clear site of all signs and equipment (including diversion routes)'

# predict() wraps a single string into a one-element list and returns a list
# of result dicts; [0] selects the result for BoQ_text.
predict(BoQ_text,clf)[0]


{'success': 'true',
 'Message': 'Clear site of all signs and equipment (including diversion routes)',
 'ICMS': '1.01.060',
 'Description': 'Capital Construction Costs \\ Demolition, site preparation and formation \\ Site surface clearance (clearing, grubbing, topsoil stripping, tree felling, minor earthwork, removal)',
 'R2': '1',
 'R3': '01',
 'R4': '060',
 'Desc2': 'Capital Construction Costs',
 'Desc3': 'Demolition, site preparation and formation',
 'Desc4': 'Site surface clearance (clearing, grubbing, topsoil stripping, tree felling, minor earthwork, removal)'}