  NOTE:
  The code is written in **vscode** and suitable to run with it

   # Library, Frameworks import statements

In [1]:
import random
import os
import re
import math
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import graphviz 


   ## Functions definitions

In [2]:
def pre_process():
    cwd = os.getcwd()
    n_files = 0     # files count
    for file in os.listdir('row_data'):
        file_path = cwd + '\\row_data\\' + file
        n_files += 1
        with open(file_path, 'r') as in_f:
            regex = r'<form(.|\n)*<\/form>'
            content = in_f.read()
            for match in re.finditer(regex, content):
                with open(cwd+'\data\\'+file, 'w') as out_f:
                    out_f.write(match.group())
    return n_files

def extract_label_input(content, features, labels):
    '''
    Function to catch features of the form
    <label>..</label>
    <input>..</input>
    IN: content: <div>..</div> which contains both input, label tags
        features: string list to store the features
        labels: string list to store the labels
    OUT: the updated lists: features, labels
    '''
    # Match the pattern label-input
    for match in re.finditer(r'<label[\s\S]*?<input[\s\S]*?>', content):
        #print("Inner \n",match.group(), 50*"-", "\n")
        Label = re.findall(r'<label[\s\S]*?</label>', match.group())[0]
        Input = re.findall(r'<input[\s\S]*"[\s\S]*>', match.group())[0]
        #print("Label: ", Label, "\nInput: ", Input, "\n")

        # Now, check if the matched don't follow input, label pattern
        # Then, return None
        if None in (Input, Label):
            return None

        #print("Match: ", match.group())
        # Check if label is for this input
        label_for = re.findall(r'(?<=for=")[\s\S]*?(?=")', match.group())[0]
        input_id = re.findall(r'(?<=id=")[\s\S]*?(?=")', match.group())[0]
        #print("\nLabel for: ", label_for, "\tInput id= ", input_id)
        if(label_for != input_id):
            return None
        
        # Extract input type, and text btw <label> tags 
        input_type = re.findall(r'(?<=type=")[\s\S]*?(?=")', Input)[0]
        labl_txt = re.findall(r'(?<=>)[\w\s/\*]+(?=\s*</label>)', Label)[0].strip(' .*#!').lower()

        #print("\nInput Type: ", input_type, "\tLabel text: ", labl_txt)
      
        # append
        features.append([label_for, labl_txt, input_type])
        labels.append([input_id])    

def build_features_labels(feature_lst, label_lst):
    cwd = os.getcwd()
    n_files = 0     # files count
    for file in os.listdir('data'):
        file_path = cwd + '\\data\\' + file
        with open(file_path, 'r') as in_f:
            # Catch input, label inside <div> tags
            div_regex = r'<div[\s\S]*?<\/div>'
            content = in_f.read()
            for match in re.finditer(div_regex, content):
                # Now, check if the code between <div> tags contains <input>
                if "<input" in match.group():
                    #print(match.group())
                    #print("\n",50*"=", "\n")
                    #print("File name: ", file)
                    extract_label_input(match.group(), feature_lst, label_lst)
        
                
def divide_corpus(n_files):
    # A function used to divide teh corpus into train & test datasets
    # In: # of files 
    # Out: train_files: set of file numbers to be used in training
    #      test_files: set of file numbers to be used in testing

    # Select randomly the train, test dataset 
    train_files = set()
    test_files = set()
    
    for i in range(math.floor(n_files*0.6+1)):
        rand = random.randint(1, n_files+1)
        while rand in train_files:
            rand = random.randint(1, n_files+1)
        train_files.add(rand)
    
    test_files = set([x for x in range(1, n_files+1)]) - train_files
    
    return train_files, test_files
    
def predict(div):
    # In:   div tag
    # Out:  Prediction probability
    # Extract features using regex
    div_regex = r'<div[\s\S]*?<\/div>'
    for match in re.finditer(div_regex, div):
        # Now, check if the code between <div> tags contains <input>
        if "<input" in match.group():
            # Match the pattern label-input
            for match in re.finditer(r'<label[\s\S]*?<input[\s\S]*?>', div):
                #print("Inner \n",match.group(), 50*"-", "\n")
                Label = re.findall(r'<label[\s\S]*?</label>', match.group())[0]
                Input = re.findall(r'<input[\s\S]*"[\s\S]*>', match.group())[0]
                #print("Label: ", Label, "\nInput: ", Input, "\n")

                # Now, check if the matched don't follow input, label pattern
                # Then, return None
                if None in (Input, Label):
                    return "Invalid Input 'pattern label-input'"

                #print("Match: ", match.group())
                # Check if label is for this input
                label_for = re.findall(r'(?<=for=")[\s\S]*?(?=")', match.group())[0]
                
                # Extract input type, and text btw <label> tags 
                input_type = re.findall(r'(?<=type=")[\s\S]*?(?=")', Input)[0]
                labl_txt = re.findall(r'(?<=>)[\w\s/\*]+(?=\s*</label>)', Label)[0].strip(' .*#!').lower()
                
                # Now, encode strings into numeric
                feature = [label_for, labl_txt, input_type]

                x = enc.transform([feature])

                y = clf.predict(x)
                p = clf.predict_proba(x)
                print("Input code: ",feature ,"\nPredicted input_id or input_name: ", le.inverse_transform(y), "\nProbability of prediction: \n")
                prob_df = pd.DataFrame(p, columns=[le.classes_])
    return prob_df



   # Main App:
    1. Preprocessing:

In [3]:
files_count = pre_process() 


    2. Building the model
  Encode the string labels & features into numerical values

In [4]:
feature_lst= []     # List of feature classes
label_lst = []      # List of label classes
build_features_labels(feature_lst, label_lst)
feature_df = pd.DataFrame(feature_lst, columns=['label_for', 'label_text', 'input_type'])
label_df = pd.DataFrame(label_lst, columns=['input_id'])
# Encode the string labels & features into numerical values
enc = OrdinalEncoder()
enc.fit(feature_lst)
x = enc.transform(feature_lst)   # numerical features

le = LabelEncoder()
le.fit(label_lst)
y = le.transform(label_lst)     # numerical labels



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [5]:
feature_df.style



Unnamed: 0,label_for,label_text,input_type
0,username,username,text
1,password,password,password
2,username,username,text
3,password,password,password
4,confirm-password,confirm password,password
5,full-name,name,text
6,phone,phone,tel
7,email,email,email
8,first-name,first name,text
9,last-name,last name,text


In [6]:
label_df.style



Unnamed: 0,input_id
0,username
1,password
2,username
3,password
4,confirm-password
5,full-name
6,phone
7,email
8,first-name
9,last-name


 # Classification & Cross Validation

In [12]:
clf = DecisionTreeClassifier()
scores = cross_val_score(clf, x, y, cv=4)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Accuracy: 0.93 (+/- 0.25)




 ## Manual test


In [13]:
classifier = clf.fit(x, y)
predict('<div>    <label for="address">Address</label>    <input type="text" name="" id=""></div>')



Input code:  ['address', 'address', 'text'] 
Predicted input_id or input_name:  ['address'] 
Probability of prediction: 



Unnamed: 0,address,city,confirm-password,email,first-name,full-name,last-name,password,phone,postcode,state,username
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
