In [1]:
%matplotlib inline

In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
import json
import random
import re
import requests
from typing import List, Dict, Any

In [13]:
SEED = 2023
random.seed(SEED)

# Classification of Manufacturing Processes

## Problem


## Objectives
Give ideas to design engineers by visualizing processes. Represent the manufacturing processes as points in 2D space. Examples include drilling, polishing, bending, extrusion est.

## Data

https://en.wikipedia.org/wiki/List_of_manufacturing_processes

https://en.wikipedia.org/wiki/List_of_ISO_standards

## Download Data

In [91]:
def extract_hyperlinks(url: str) -> List[str]:
    """
    Extracts all hyperlinks from a web page and returns them as a list of strings.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    hyperlinks = [a_tag.get('href') for a_tag in soup.find_all('a')]

    return hyperlinks

In [92]:
def filter_hyperlinks(hyperlinks: List[str]) -> List[str]:
    """
    Filters a list of hyperlinks based on certain criteria.
    """
    # Drop duplicates
    valid_hyperlinks = list(set(hyperlinks))

    # Drop None values
    valid_hyperlinks = [h for h in valid_hyperlinks if h is not None]

    # Drop unrelated links
    PREFIXES = ['Category:',
                'File:',
                'Help:',
                'Special:',
                'Talk:',
                'Wikipedia:',
                'Main_Page|Portal:',
                'List_of']
    PATTERN = fr'^/wiki/(?!{"|".join(PREFIXES)}).*'
    valid_hyperlinks = [h for h in valid_hyperlinks if re.match(PATTERN, h)]

    return valid_hyperlinks

In [98]:
urls = [
    'https://en.wikipedia.org/wiki/List_of_welding_processes',
    'https://en.wikipedia.org/wiki/List_of_manufacturing_processes',
    'https://en.wikipedia.org/wiki/Industrial_processes'
]

# Collect hyperlinks
hyperlinks = []
for url in urls:
    hyperlinks.extend(extract_hyperlinks(url))
print(f'{len(hyperlinks)} hyperlinks extracted.')

1716 hyperlinks extracted.


In [99]:
# Filter hyperlinks
hyperlinks = filter_hyperlinks(hyperlinks)
print(f'{len(hyperlinks)} hyperlinks remain after filtering.')
print('Example: ', random.sample(hyperlinks, 3))

1111 hyperlinks remain after filtering.
Example:  ['/wiki/System_testing', '/wiki/Real_estate_agent', '/wiki/Hunting']


In [84]:
def href_to_title(href):
    """Converts a wikipedia href to wikipedia title."""
    return href.replace('/wiki/', '').replace('_', ' ')

In [105]:
# Helper function
def recursive_dict_search(data: Dict[Any, Any], key: Any) -> Any:
    """
    Recursively searches a nested dictionary for a given key and returns its value. 
    """
    for k, v in data.items():
        if k == key:
            return v
        elif isinstance(v, dict):
            result = recursive_dict_search(v, key)
            if result is not None:
                return result
    return None


def retrieve_summary(title: str) -> str:
    """
    Retrieves a summary of a Wikipedia article based on a given article title.
    """
    api_url = f'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles={title}&format=json'
    response = requests.get(api_url)
    try:
        json_data = response.json()
        summary = recursive_dict_search(json_data['query'], 'extract')
    except SyntaxError:
        summary = ''

    return summary

In [104]:
df = pd.DataFrame({
    'href': pd.Series(hrefs),
    'title': pd.Series(hrefs).apply(href_to_title)
})

df.head()

Unnamed: 0,href,title
0,/wiki/Carbon_Arc_Welding,Carbon Arc Welding
1,/wiki/Flux_Cored_Arc_Welding,Flux Cored Arc Welding
2,/wiki/Gas_Metal_Arc_Welding,Gas Metal Arc Welding
3,/wiki/Shielding_gas,Shielding gas
4,/wiki/Gas_Tungsten_Arc_Welding,Gas Tungsten Arc Welding


In [106]:
retrieve_summary(df.title)

'<!-- \nNewPP limit report\nParsed by mw1485\nCached time: 20230511104759\nCache expiry: 1814400\nReduced expiry: false\nComplications: []\nCPU time usage: 0.044 seconds\nReal time usage: 0.060 seconds\nPreprocessor visited node count: 108/1000000\nPost‐expand include size: 10397/2097152 bytes\nTemplate argument size: 3555/2097152 bytes\nHighest expansion depth: 11/100\nExpensive parser function count: 1/500\nUnstrip recursion depth: 0/20\nUnstrip post‐expand size: 1653/5000000 bytes\nLua time usage: 0.023/10.000 seconds\nLua memory usage: 781916/52428800 bytes\nNumber of Wikibase entities loaded: 0/400\n--><!--\nTransclusion expansion time report (%,ms,calls,template)\n100.00%   52.296      1 -total\n100.00%   52.296      1 Template:Redirect_category_shell\n 96.06%   50.235      1 Template:Mbox\n 14.77%    7.724      1 Template:R_from_other_capitalisation\n 12.16%    6.357      1 Template:Redirect_template\n  3.88%    2.031      2 Template:Tl\n  2.74%    1.432      1 Template:Talk_oth

In [254]:
summaries = [retrieve_summary(title) for title in titles[:10]]

In [255]:
df = pd.DataFrame({'title': titles})
df['summary'] = df.title.apply(retrieve_summary)

df.head()

Unnamed: 0,title,summary
0,Casting,"<link rel=""mw-deduplicated-inline-style"" href=..."
1,Centrifugal casting (industrial),<p><b>Centrifugal casting</b> or <b>rotocastin...
2,Continuous casting,"<p><b>Continuous casting</b>, also called <b>s..."
3,Die casting,"<p class=""mw-empty-elt"">\n</p>\n\n<p><b>Die ca..."
4,Evaporative-pattern casting,<p><b>Evaporative-pattern casting</b> is a typ...


In [256]:
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [257]:
type(re.compile('<.*?>'))

re.Pattern

In [258]:
df['clean_summary'] = df.summary.apply(remove_html_tags)

In [259]:
df.clean_summary = df.clean_summary.str.replace('\n+', ' ')

  df.clean_summary = df.clean_summary.str.replace('\n+', ' ')


In [260]:
df.clean_summary = df.clean_summary.str.replace('\s ', ' ')

  df.clean_summary = df.clean_summary.str.replace('\s ', ' ')


In [261]:
df.clean_summary = df.clean_summary.str.replace('^ ', '')

  df.clean_summary = df.clean_summary.str.replace('^ ', '')


In [262]:
df

Unnamed: 0,title,summary,clean_summary
0,Casting,"<link rel=""mw-deduplicated-inline-style"" href=...",Casting is a manufacturing process in which a ...
1,Centrifugal casting (industrial),<p><b>Centrifugal casting</b> or <b>rotocastin...,Centrifugal casting or rotocasting is a castin...
2,Continuous casting,"<p><b>Continuous casting</b>, also called <b>s...","Continuous casting, also called strand casting..."
3,Die casting,"<p class=""mw-empty-elt"">\n</p>\n\n<p><b>Die ca...",Die casting is a metal casting process that is...
4,Evaporative-pattern casting,<p><b>Evaporative-pattern casting</b> is a typ...,Evaporative-pattern casting is a type of casti...
...,...,...,...
304,Router (woodworking),<p>The <b>router</b> is a power tool with a fl...,The router is a power tool with a flat base an...
305,Biscuit joiner,<p>A <b>biscuit joiner</b> or <b>biscuit joint...,A biscuit joiner or biscuit jointer (or someti...
306,Vulcanization,"<p class=""mw-empty-elt"">\n\n</p>\n\n<p><b>Vulc...",Vulcanization (British: Vulcanisation) is a ra...
307,Heat treating,<p><b>Heat treating</b> (or <b>heat treatment<...,Heat treating (or heat treatment) is a group o...


In [265]:
df['words_count'] = df.clean_summary.str.split().str.len()

In [269]:
df = df.drop(columns='summary')

In [274]:
df = df.rename({'clean_summary': 'summary'}, axis=1)

In [275]:
df

Unnamed: 0,title,summary,words_count
0,Casting,Casting is a manufacturing process in which a ...,1547
1,Centrifugal casting (industrial),Centrifugal casting or rotocasting is a castin...,784
2,Continuous casting,"Continuous casting, also called strand casting...",3824
3,Die casting,Die casting is a metal casting process that is...,3588
4,Evaporative-pattern casting,Evaporative-pattern casting is a type of casti...,351
...,...,...,...
304,Router (woodworking),The router is a power tool with a flat base an...,2571
305,Biscuit joiner,A biscuit joiner or biscuit jointer (or someti...,813
306,Vulcanization,Vulcanization (British: Vulcanisation) is a ra...,1023
307,Heat treating,Heat treating (or heat treatment) is a group o...,5153


In [277]:
np.argpartition(np.array([15, 20, -10, 50]), 2)

array([2, 0, 1, 3], dtype=int64)

In [278]:
np.argpartition(np.array([15, 20, -10, 50]), 3)

array([2, 0, 1, 3], dtype=int64)

In [280]:
np.argpartition(np.array([15, 20, -10, 50]), (1, 2))

array([2, 0, 1, 3], dtype=int64)

In [283]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

## Obsolete cells

In [251]:
def filter_hrefs(hrefs: List[str],
                 start_str: str,
                 end_str: str,
                 starts_with: str) -> List[str]:
    """
    Extract a sublist of hrefs that occur between two specific strings and starts with specific string.
    """
    start_index = hrefs.index(start_str)
    end_index = hrefs.index(end_str)
    hrefs = hrefs[start_index: end_index + 1]

    hrefs = [h for h in hrefs if h.startswith(starts_with)]

    return hrefs


hrefs = extract_hrefs(url)
hrefs = filter_hrefs(hrefs, start_str='/wiki/Casting',
                     end_str='/wiki/Bake-out', starts_with='/wiki')
hrefs[:5]

['/wiki/Casting',
 '/wiki/Centrifugal_casting_(industrial)',
 '/wiki/Continuous_casting',
 '/wiki/Die_casting',
 '/wiki/Evaporative-pattern_casting']