# Feature Engineering

In [11]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import GroupShuffleSplit

In [None]:
pd.set_option('display.max_colwidth', 100)

In [4]:
pd.set_option('display.max_columns', None)

In [None]:
pd.set_option('display.max_rows', None)

## Load Data

In [40]:
file_path= 'data/train.feather'
ori_train_df = pd.read_feather(file_path)

In [41]:
ori_train_df.sample(3)

Unnamed: 0,query,url_page,title,source,alt,src,crossorigin,height,ismap,loading,longdesc,referrerpolicy,sizes,srcset,width,class,style,tree_path,deg,text_tag,text,id,is_relevant
105428,bigbluebug,https://www.wpri.com/dont-miss/for-the-first-t...,"For the first time, Big Blue Bug takes part in...",wpri.com,,https://www.wpri.com/wp-content/uploads/sites/...,,,,lazy,,,"(max-width: 899px) 50vw, 876px",https://www.wpri.com/wp-content/uploads/sites/...,,,,"[div, div, a, figure, article, div, section, s...",2.0,div,Video,756d2ce57954b4328d5cd8d27af07c9c,0
584554,war3,https://medium.com/@andrewow/the-three-hour-wa...,"The Three Hour Warcraft 3 Game. In 2006, my br...",medium.com,,https://miro.medium.com/fit/c/140/140/0*4DBlri...,,70.0,,,,,70px,https://miro.medium.com/fit/c/96/140/0*4DBlriP...,70.0,sc sd se sf sg sh si sj sk sl,,"[noscript, div, div, a, div, div, div, div, di...",,,,5314556949a2c1b4943e5bb8d109b794,0
354543,librium,https://www.uk-rehab.com/benzodiazepine-addict...,Librium Addiction | UK Rehab,uk-rehab.com,UK Rehab,https://www.uk-rehab.com/wp-content/uploads/20...,,,,,,,,,,,,"[div, div, div, article, div, div, div, main, ...",2.0,div,Other names for Librium In the same way that t...,37a37aa717db88d8b8e1ebbdfdf05c8c,0


## Train Test Split

In [39]:
gss = GroupShuffleSplit(test_size=.40, n_splits=1, random_state = 7).split(train_df, groups=train_df['query'])

X_train_inds, X_hold_out_inds = next(gss)

train_data= train_df.iloc[X_train_inds]
X_train = train_data.loc[:, ~train_data.columns.isin(['is_relevant'])]
y_train = train_data.loc[:, train_data.columns.isin(['is_relevant'])]

hold_out_data= train_df.iloc[X_hold_out_inds]

#We need to keep the query for later predictions
X_hold_out = hold_out_data.loc[:, ~hold_out_data.columns.isin(['is_relevant'])]
y_hold_out = hold_out_data.loc[:, hold_out_data.columns.isin(['is_relevant'])]

## Text Count

In [31]:
X_train.columns

Index(['query', 'url_page', 'title', 'source', 'alt', 'src', 'crossorigin',
       'height', 'ismap', 'loading', 'longdesc', 'referrerpolicy', 'sizes',
       'srcset', 'width', 'class', 'style', 'tree_path', 'deg', 'text_tag',
       'text', 'id'],
      dtype='object')

In [32]:
text_cols = ['url_page', 'title', 'source', 'alt', 'src', 'srcset']

In [66]:
# Function to count appearances of text in each cell of a column and check if greater than 0
def check_text(row):
    # Count occurrences of text from text_col in target_col, case-insensitive
    count = len(re.findall(re.escape(str(row[col])), str(row[col]), flags=re.IGNORECASE))
    # Return 1 if count > 0, else return 0
    return 1 if count > 0 else 0

def dynamic_check_text(df, target_col, col):  
    # Apply check_text to each row and return the result
    return df.apply(check_text, axis=1)

def add_count_columns(df, text_cols, text_column):    
    # Apply the function to each target column and create new columns
    for column in text_cols:
        new_col_name = f'check_{column}'
        df[new_col_name] = dynamic_check_text(df, column, text_column)
    return df

In [72]:
df = X_train
text_cols = ['url_page', 'title', 'source', 'alt', 'src', 'srcset']
col = 'query'

X_train = add_count_columns(df, text_cols, col)

In [73]:
X_train.head(3)

Unnamed: 0,query,url_page,title,source,alt,src,crossorigin,height,ismap,loading,longdesc,referrerpolicy,sizes,srcset,width,class,style,tree_path,deg,text_tag,text,id,check_a,check_li,check_ul,check_main,check_article,check_url_page,check_title,check_source,check_alt,check_src,check_srcset
0,11103093,https://bank-routing.org/011103093-td-bank-na-...,Routing Number 011103093 - TD Bank NA in LEWIS...,bank-routing.org,Bank Routing,https://bank-routing.org/img/logo.png,,,,,,,,,,,,"[a, div, nav, header, body, html]",2.0,div,Blog Contacts,3f5e067733178cc0f4b3a97cfc9c62e1,1,1,1,1,1,1,1,1,1,1,1
1,11103093,https://bank-routing.org/011103093-td-bank-na-...,Routing Number 011103093 - TD Bank NA in LEWIS...,bank-routing.org,011103093 routing number on TD Bank NA check,https://bank-routing.org/img/routing_numbers/r...,,,,,,,,,,img-fluid,,"[div, div, div, main, main, body, html]",1.0,div,Aside from the SWIFT code which each bank is d...,fe64168fa7afeefa73708113c4bc2730,1,1,1,1,1,1,1,1,1,1,1
2,11103093,https://bank-routing.org/011103093-td-bank-na-...,Routing Number 011103093 - TD Bank NA in LEWIS...,bank-routing.org,DMCA.com Protection Status,https://images.dmca.com/Badges/dmca_protected_...,,,,,,,,,,,,"[a, p, div, div, div, footer, body, html]",,,,5b5c3b6560ae158bf7ecd09f5ead1841,1,1,1,1,1,1,1,1,1,1,1


## Tree Path

In [44]:
relevant_df = train_data[train_data['is_relevant'] == 1].loc[:, ~train_data.columns.isin(['is_relevant'])]
irrelevant_df = train_data[train_data['is_relevant'] == 0].loc[:, ~train_data.columns.isin(['is_relevant'])]

In [46]:
def find_unique_values(df, col):
    unique_sections = set()
    for path in df[col]:
        unique_sections.update(path)
    return unique_sections

col = 'tree_path'
path_rel_set = find_unique_values(relevant_df, col)
path_irrel_set = find_unique_values(irrelevant_df, col)
path_overlaps_set = path_rel_set & path_irrel_set
path_unique_rel_set = path_rel_set - path_irrel_set
path_unique_irrel_set = path_irrel_set - path_rel_set

In [65]:
text_cols = ['a', 'li', 'ul', 'main', 'article']

In [68]:
df = X_train
text_cols = ['a', 'li', 'ul', 'main', 'article']
col = 'tree_path'

X_train = add_count_columns(df, text_cols, col)

In [70]:
X_train.head(3)

Unnamed: 0,query,url_page,title,source,alt,src,crossorigin,height,ismap,loading,longdesc,referrerpolicy,sizes,srcset,width,class,style,tree_path,deg,text_tag,text,id,check_a,check_li,check_ul,check_main,check_article
0,11103093,https://bank-routing.org/011103093-td-bank-na-...,Routing Number 011103093 - TD Bank NA in LEWIS...,bank-routing.org,Bank Routing,https://bank-routing.org/img/logo.png,,,,,,,,,,,,"[a, div, nav, header, body, html]",2.0,div,Blog Contacts,3f5e067733178cc0f4b3a97cfc9c62e1,1,1,1,1,1
1,11103093,https://bank-routing.org/011103093-td-bank-na-...,Routing Number 011103093 - TD Bank NA in LEWIS...,bank-routing.org,011103093 routing number on TD Bank NA check,https://bank-routing.org/img/routing_numbers/r...,,,,,,,,,,img-fluid,,"[div, div, div, main, main, body, html]",1.0,div,Aside from the SWIFT code which each bank is d...,fe64168fa7afeefa73708113c4bc2730,1,1,1,1,1
2,11103093,https://bank-routing.org/011103093-td-bank-na-...,Routing Number 011103093 - TD Bank NA in LEWIS...,bank-routing.org,DMCA.com Protection Status,https://images.dmca.com/Badges/dmca_protected_...,,,,,,,,,,,,"[a, p, div, div, div, footer, body, html]",,,,5b5c3b6560ae158bf7ecd09f5ead1841,1,1,1,1,1


In [53]:
# def analyser_generic(df, text_set, column):
#     # Initialize containers for percentage and count
#     percentages = {}
#     counts = {}
#     text_set.discard(None)
#     # Analyze by section
#     for text in text_set:
#         # Determine if each path contains the section
#         contains = df[column].apply(lambda x: text in x if x is not None else False)
#         # Calculate and store percentage and count
#         percentages[text] = contains.mean() * 100  # Convert fraction to percentage
#         counts[text] = contains.sum()  # Total occurrences

#     # Create a DataFrame from the calculated data
#     results_df = pd.DataFrame([percentages, counts], index=['percentage', 'total_count'])

#     # Transpose the DataFrame to have sections as rows and metrics (Percentage, Total Count) as columns
#     transposed_results_df = results_df.T

#     return transposed_results_df

In [None]:
# text_set = path_overlaps_set
# column = 'tree_path'
# path_rel_count = analyser_generic(relevant_df, text_set, column)
# path_irrel_count = analyser_generic(irrelevant_df, text_set, column)

# path_rel_count_renamed = path_rel_count.rename(columns=lambda x: f'{column}_rel_{x}')
# path_irrel_count_renamed = path_irrel_count.rename(columns=lambda x: f'{column}_irrel_{x}')

In [63]:
# path_combined_df = pd.concat([path_rel_count_renamed, path_irrel_count_renamed], axis=1)
# path_combined_df[f'{column}_total_count'] = path_combined_df[f'{column}_rel_total_count'] + path_combined_df[f'{column}_irrel_total_count']
# path_combined_df.sort_values(by=f'{column}_total_count', ascending=False)

Unnamed: 0,tree_path_rel_percentage,tree_path_rel_total_count,tree_path_irrel_percentage,tree_path_irrel_total_count,tree_path_total_count
html,100.000000,12857.0,100.000000,344536.0,357393.0
body,99.883332,12842.0,99.223013,341859.0,354701.0
div,98.475539,12661.0,97.434521,335697.0,348358.0
a,28.319204,3641.0,64.348863,221705.0,225346.0
li,5.810065,747.0,26.320326,90683.0,91430.0
...,...,...,...,...,...
b-container,0.007778,1.0,0.000580,2.0,3.0
articleclass,0.007778,1.0,0.000290,1.0,2.0
mospace-heroarea,0.007778,1.0,0.000290,1.0,2.0
ps-article-page,0.007778,1.0,0.000290,1.0,2.0
