# 01 Get Data

## Imports and setup

In [4]:
%matplotlib inline

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
import json, pickle, random, re, requests
from typing import List, Dict, Any

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP

In [6]:
SEED = 2023
random.seed(SEED)

## Final functions

In [54]:
# Secondary functions for the individual preprocessing steps, no need to run standalone

def scrape_links(url: str) -> List[str]:
    """
    Extracts all hyperlinks from a single url.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    links = [a_tag.get('href') for a_tag in soup.find_all('a')]

    return links


def scrape_links_from_multiple_urls(urls: List[str]) -> pd.DataFrame:
    """
    Extract all links from multiple urls and return pd.DataFrame.
    """
    df = pd.DataFrame(columns=['link', 'source_url'])
    for url in urls:
        new_rows = pd.DataFrame({'link': scrape_links(url), 'source_url': url})
        df = pd.concat([df, new_rows], ignore_index=True)
        
    return df


def clean_links(df: pd.DataFrame) -> pd.DataFrame:
    PREFIXES = [
        'Category:',
        'File:',
        'Help:',
        'Special:',
        'Talk:',
        'Wikipedia:',
        'Main_Page|Portal:',
        'List_of',
    ]
    LINKS_TO_REMOVE_PATTERN = fr'^/wiki/(?!{"|".join(PREFIXES)}).*'

    return (df
            .drop_duplicates()
            .dropna()
            # Filter links
            .loc[lambda df_: df_['link'].str.contains(LINKS_TO_REMOVE_PATTERN)]
            # Add `title` column 
            .assign(title=lambda df_: (df_['link']
                                       .str.replace('/wiki/', '')
                                       .str.replace('_', ' ')
                                       .str.capitalize()))
            .reset_index(drop=True))


def deep_dictionary_search(my_dict: Dict[Any, Any], my_key: Any) -> Any:
    """
    Utility function that recursively searches for a key at any level and returns its value.
    """
    for key, value in my_dict.items():
        if key == my_key:
            return value
        elif isinstance(value, dict):
            result = deep_dictionary_search(value, my_key)
            if result is not None:
                return result
    return None


def retrieve_summary(title: str) -> str:
    """
    Retrieves a summary of a Wikipedia article based on a given article title.
    """
    api_request_url = f'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles={title}&format=json'
    response = requests.get(api_request_url)
    try:
        json_data = response.json()
        summary = deep_dictionary_search(json_data['query'], 'extract')
    except json.JSONDecodeError:
        summary = ''

    return summary


def retrieve_and_clean_summaries(df: pd.DataFrame, debug=True) -> pd.DataFrame:
    """
    Download and preprocess the article summaries.
    Caution! Long execution time!
    
    Args:
    `debug` - retrieve summaries for only 10 articles
    
    """
    HTML_CONTENT_PATTERN = re.compile('<.*?>')
    
    # 10 articles in case debug=True
    df = df.iloc[:10] if debug else df
    
    return (df
            # Retrieve summaries
            .assign(summary=lambda df_: df_['title'].apply(retrieve_summary))

            # Drop improperlly-retrieved summaries - they start with identical starting characters.
            .loc[lambda df_: ~df_['summary'].str[:20].duplicated()]

            # Strip HTML and white spaces
            .assign(summary=lambda df_: (df_['summary']
                                       .str.replace(HTML_CONTENT_PATTERN, '')
                                       .str.strip()))

            .reset_index(drop=True)
            )

In [64]:
# Main Function, combining the secondary functions from above.
# Run to scrape article links, retrive and clean article summaries.

def retrieve_manufacturing_articles(debug=False) -> pd.DataFrame:
    """
    Retrieves summaries of Wikipedia articles related to manufacturing.
    Relevant articles are obtained by scraping list-type articles with references.
    
    Parameters:
        debug (bool): If True, retrieves a small subset of articles for debugging purposes.
    
    Returns:
        pd.DataFrame: DataFrame containing the retrieved articles and their summaries.
    """
    
    # List of URLs for list-type articles related to manufacturing
    article_urls = [
        'https://en.wikipedia.org/wiki/List_of_welding_processes',
        'https://en.wikipedia.org/wiki/List_of_manufacturing_processes',
        'https://en.wikipedia.org/wiki/Industrial_processes',
    ]
    
    print('Scraping article links...')
    scraped_links = scrape_links_from_multiple_urls(article_urls)
    print('Completed!')
    
    # Filtering the scraped links to leave only the relevant ones
    print('Filtering relevant links...')
    cleaned_links = clean_links(scraped_links)
    print('Completed!')
        
    # Retrieve and clean the summaries of the articles
    print('Retrieving article summaries...')
    retrieved_articles = retrieve_and_clean_summaries(cleaned_links, debug=debug)
    print('Completed!')
    
    return retrieved_articles


# Test
retrieved_articles = retrieve_manufacturing_articles(debug=True)
retrieved_articles

Unnamed: 0,link,source_url,title,summary
0,/wiki/Carbon_Arc_Welding,https://en.wikipedia.org/wiki/List_of_welding_...,Carbon arc welding,Carbon arc welding (CAW) is a process which pr...
1,/wiki/Flux_Cored_Arc_Welding,https://en.wikipedia.org/wiki/List_of_welding_...,Flux cored arc welding,<!-- \nNewPP limit report\nParsed by mw1406\nC...
2,/wiki/Gas_Metal_Arc_Welding,https://en.wikipedia.org/wiki/List_of_welding_...,Gas metal arc welding,"Gas metal arc welding (GMAW), sometimes referr..."
3,/wiki/Shielding_gas,https://en.wikipedia.org/wiki/List_of_welding_...,Shielding gas,Shielding gases are inert or semi-inert gases ...
4,/wiki/Plasma_Arc_Welding,https://en.wikipedia.org/wiki/List_of_welding_...,Plasma arc welding,Plasma arc welding (PAW) is an arc welding pro...
5,/wiki/Shielded_Metal_Arc_Welding,https://en.wikipedia.org/wiki/List_of_welding_...,Shielded metal arc welding,"Shielded metal arc welding (SMAW), also known ..."
6,/wiki/Submerged_Arc_Welding,https://en.wikipedia.org/wiki/List_of_welding_...,Submerged arc welding,Submerged arc welding (SAW) is a common arc we...
7,/wiki/Atomic_Hydrogen_Welding,https://en.wikipedia.org/wiki/List_of_welding_...,Atomic hydrogen welding,Atomic hydrogen welding (AHW) is an arc weldin...


In [65]:
# Full donwload
retrieved_articles = retrieve_manufacturing_articles(debug=False)

In [67]:
# Optionally save as csv
retrieved_articles.to_csv('data/manufacturing_articles.csv', index=False)

## Developing the algorithm

In [43]:
def scrape_links(url: str) -> List[str]:
    """
    Extracts all hyperlinks from a single url.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    links = [a_tag.get('href') for a_tag in soup.find_all('a')]

    return links


# Test
test_url = 'https://en.wikipedia.org/wiki/List_of_manufacturing_processes'

print(f'Starting test scraping from {test_url}...')
links = scrape_links(test_url)
print(f'Extracted total of {len(links)} links.')

Starting test scraping from https://en.wikipedia.org/wiki/List_of_manufacturing_processes...
Extracted total of 516 links.


In [44]:
def scrape_links_from_multiple_urls(urls: List[str]) -> pd.DataFrame:
    """
    Extract all links from multiple urls and return pd.DataFrame.
    """
    df = pd.DataFrame(columns=['link', 'source_url'])
    for url in urls:
        new_rows = pd.DataFrame({'link': scrape_links(url), 'source_url': url})
        df = pd.concat([df, new_rows], ignore_index=True)
        
    return df


urls = [
    'https://en.wikipedia.org/wiki/List_of_welding_processes',
    'https://en.wikipedia.org/wiki/List_of_manufacturing_processes',
    'https://en.wikipedia.org/wiki/Industrial_processes',
]

print(f'Starting scraping from {len(urls)} urls...')
all_links = scrape_links_from_multiple_urls(urls)
print(f'Extracted the following:')
all_links.groupby('source_url').count()

Starting scraping from 3 urls...
Extracted the following:


Unnamed: 0_level_0,link
source_url,Unnamed: 1_level_1
https://en.wikipedia.org/wiki/Industrial_processes,977
https://en.wikipedia.org/wiki/List_of_manufacturing_processes,516
https://en.wikipedia.org/wiki/List_of_welding_processes,225


In [45]:
def clean_links(df: pd.DataFrame) -> pd.DataFrame:
    PREFIXES = [
        'Category:',
        'File:',
        'Help:',
        'Special:',
        'Talk:',
        'Wikipedia:',
        'Main_Page|Portal:',
        'List_of',
    ]
    LINKS_TO_REMOVE_PATTERN = fr'^/wiki/(?!{"|".join(PREFIXES)}).*'

    return (df
            .drop_duplicates()
            .dropna()
            # Filter links
            .loc[lambda df_: df_['link'].str.contains(LINKS_TO_REMOVE_PATTERN)]
            # Add `title` column 
            .assign(title=lambda df_: (df_['link']
                                       .str.replace('/wiki/', '')
                                       .str.replace('_', ' ')
                                       .str.capitalize()))
            .reset_index(drop=True))


print(f'Starting cleaning...')

cleaned_links = clean_links(all_links)

print(f'Clean complete! Remaining links:')
cleaned_links.groupby('source_url').count()

Starting cleaning...
Clean complete! Remaining links:


Unnamed: 0_level_0,link,title
source_url,Unnamed: 1_level_1,Unnamed: 2_level_1
https://en.wikipedia.org/wiki/Industrial_processes,790,790
https://en.wikipedia.org/wiki/List_of_manufacturing_processes,300,300
https://en.wikipedia.org/wiki/List_of_welding_processes,82,82


In [46]:
# Optional: store the DataFrame
# df.to_csv('data/manufacturing_processes_links.csv', index=False)

In [53]:
def deep_dictionary_search(my_dict: Dict[Any, Any], my_key: Any) -> Any:
    """
    Utility function that recursively searches for a key at any level and returns its value.
    """
    for key, value in my_dict.items():
        if key == my_key:
            return value
        elif isinstance(value, dict):
            result = deep_dictionary_search(value, my_key)
            if result is not None:
                return result
    return None


def retrieve_summary(title: str) -> str:
    """
    Retrieves a summary of a Wikipedia article based on a given article title.
    """
    api_request_url = f'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles={title}&format=json'
    response = requests.get(api_request_url)
    try:
        json_data = response.json()
        summary = deep_dictionary_search(json_data['query'], 'extract')
    except json.JSONDecodeError:
        summary = ''

    return summary


def retrieve_and_clean_summaries(df: pd.DataFrame, debug=True) -> pd.DataFrame:
    """
    Download and preprocess the article summaries.
    Caution! Long execution time!
    
    Args:
    `debug` - retrieve summaries for only 10 articles
    
    """
    HTML_CONTENT_PATTERN = re.compile('<.*?>')
    
    # 10 articles in case debug=True
    df = df.iloc[:10] if debug else df
    
    return (df
            # Retrieve summaries
            .assign(summary=lambda df_: df_['title'].apply(retrieve_summary))

            # Drop improperlly-retrieved summaries - they start with identical starting characters.
            .loc[lambda df_: ~df_['summary'].str[:20].duplicated()]

            # Strip HTML and white spaces
            .assign(summary=lambda df_: (df_['summary']
                                       .str.replace(HTML_CONTENT_PATTERN, '')
                                       .str.strip()))

            .reset_index(drop=True)
            )
    
# Test
print(f'Starting retrieving article summaries from Wikipedia...')

links_with_summaries = retrieve_and_clean_summaries(cleaned_links)

print(f'Done! Sample articles and their summaries:')
pd.options.display.max_colwidth = 400
display(links_with_summaries.loc[:, ['title', 'summary']].sample(2))
pd.options.display.max_colwidth = 50

Starting retrieving article summaries from Wikipedia...
Done! Sample articles and their summaries:


Unnamed: 0,title,summary
7,Atomic hydrogen welding,"Atomic hydrogen welding (AHW) is an arc welding process that uses an arc between two tungsten electrodes in a shielding atmosphere of hydrogen. The process was invented by Irving Langmuir in the course of his studies of atomic hydrogen. The electric arc efficiently breaks up the hydrogen molecules, which later recombine with tremendous release of heat, reaching temperatures from 3400 to 4000 °..."
5,Shielded metal arc welding,"Shielded metal arc welding (SMAW), also known as manual metal arc welding (MMA or MMAW), flux shielded arc welding or informally as stick welding, is a manual arc welding process that uses a consumable electrode covered with a flux to lay the weld.\nAn electric current, in the form of either alternating current or direct current from a welding power supply, is used to form an electric arc betw..."
