In [4]:
%matplotlib inline

In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
import json
import pickle
import random
import re
import requests
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Any
from umap import UMAP

In [13]:
SEED = 2023
random.seed(SEED)

# Classification of Manufacturing Processes

## Problem


## Objectives
Give ideas to design engineers by visualizing processes. Represent the manufacturing processes as points in 2D space. Examples include drilling, polishing, bending, extrusion est.

## Data

https://en.wikipedia.org/wiki/List_of_manufacturing_processes

https://en.wikipedia.org/wiki/List_of_ISO_standards

## Download DataUMAP

In [14]:
def extract_hyperlinks(url: str) -> pd.Series:
    """
    Extracts all hyperlinks from a web page.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    hyperlinks = [a_tag.get('href') for a_tag in soup.find_all('a')]

    return pd.Series(hyperlinks)


# List of Wikipedia pages to be scraped
source_pages = [
    'https://en.wikipedia.org/wiki/List_of_welding_processes',
    'https://en.wikipedia.org/wiki/List_of_manufacturing_processes',
    'https://en.wikipedia.org/wiki/Industrial_processes'
]

# Extract hyperlinks into a DataFrame
df = pd.DataFrame(columns=['hyperlink', 'source_page_id'])
for i, url in enumerate(source_pages):
    new_rows = pd.DataFrame({
        'hyperlink': extract_hyperlinks(url),
        'source_page_id': i,
    })
    df = pd.concat([df, new_rows], ignore_index=True)

# Print the number of extracted hyperlinks
for i, cnt in df.source_page_id.value_counts().to_dict().items():
    print(f'{cnt} extracted from {source_pages[i]}.')
print(f'Total: {df.shape[0]}.')

# Sample 4 rows from the DataFrame
df.sample(4)

976 extracted from https://en.wikipedia.org/wiki/Industrial_processes.
515 extracted from https://en.wikipedia.org/wiki/List_of_manufacturing_processes.
225 extracted from https://en.wikipedia.org/wiki/List_of_welding_processes.
Total: 1716.


Unnamed: 0,hyperlink,source_page_id
271,/w/index.php?title=List_of_manufacturing_proce...,1
1561,/wiki/Public_relations,2
164,/wiki/Gas_tungsten_arc_welding,0
1461,/wiki/Commercial_property,2


In [15]:
# Regex patter for irrelevant hyperlinks
IRRELEVANT_PREFIXES = ['Category:',
                       'File:',
                       'Help:',
                       'Special:',
                       'Talk:',
                       'Wikipedia:',
                       'Main_Page|Portal:',
                       'List_of']
PATTERN = fr'^/wiki/(?!{"|".join(IRRELEVANT_PREFIXES)}).*'

# Update the DataFrame
df = (df.
      drop_duplicates().
      dropna().

      # Drop irrelevant hyperlinks
      loc[lambda df_: df_['hyperlink'].str.contains(PATTERN)].

      # Generate article title later used to retrieve the article summary
      assign(title=lambda df_: df_['hyperlink'].str.replace(
          '/wiki/', '').str.replace('_', ' ').str.capitalize()).

      reset_index(drop=True)
      )

# Print the outcome after filtering
print(f'{df.shape[0]} hyperlinks remain after filtering.')
df.sample(4)

1172 hyperlinks remain after filtering.


Unnamed: 0,hyperlink,source_page_id,title
778,/wiki/Electronic_component,2,Electronic component
792,/wiki/Satellite_navigation_device#Consumer_app...,2,Satellite navigation device#consumer applications
826,/wiki/Commodity_chemicals,2,Commodity chemicals
570,/wiki/Vacuum_metalising,2,Vacuum metalising


In [141]:
# Optional: store the DataFrame
# df.to_csv('data/manufacturing_processes_links.csv', index=False)

In [16]:
def search_dict(my_dict: Dict[Any, Any], my_key: Any) -> Any:
    """
    Helper function. Recursively searches a nested dictionary for a given key and returns its value. 
    """
    for key, value in my_dict.items():
        if key == my_key:
            return value
        elif isinstance(value, dict):
            result = search_dict(value, my_key)
            if result is not None:
                return result
    return None


def retrieve_summary(title: str) -> str:
    """
    Retrieves a summary of a Wikipedia article based on a given article title.
    """
    api_url = f'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles={title}&format=json'
    response = requests.get(api_url)
    try:
        json_data = response.json()
        summary = search_dict(json_data['query'], 'extract')
    except json.JSONDecodeError:
        summary = ''

    return summary

In [144]:
# Download and preprocess the article summaries
# Caution! Long execution time!
HTML_PATTERN = re.compile('<.*?>')
df_new = (df.
          # Use small subset to debug (optional)
          # loc[:20].

          # Retrieve summaries
          assign(summary=lambda df_: df_['title'].apply(retrieve_summary)).

          # Drop rows where a summary was not acquired - all such rows start with the same characters
          loc[lambda df_: ~df_['summary'].str[:20].duplicated()].

          # Strip HTML
          assign(summary=lambda df_: df_['summary'].str.replace(HTML_PATTERN, '')).

          reset_index(drop=True)
          )

In [145]:
df_new.head()

Unnamed: 0,hyperlink,source_page_id,title,summary
0,/wiki/Carbon_Arc_Welding,0,Carbon arc welding,Carbon arc welding (CAW) is a process which pr...
1,/wiki/Flux_Cored_Arc_Welding,0,Flux cored arc welding,<!-- \nNewPP limit report\nParsed by mw1493\nC...
2,/wiki/Gas_Metal_Arc_Welding,0,Gas metal arc welding,"\n\n\nGas metal arc welding (GMAW), sometimes ..."
3,/wiki/Shielding_gas,0,Shielding gas,Shielding gases are inert or semi-inert gases ...
4,/wiki/Plasma_Arc_Welding,0,Plasma arc welding,Plasma arc welding (PAW) is an arc welding pro...


In [17]:
# Save locally (optional)
# df_new.to_csv('data/process_summaries.csv')

In [18]:
df_new = pd.read_csv('data/process_summaries.csv')
df_new.head()

Unnamed: 0.1,Unnamed: 0,hyperlink,source_page_id,title,summary
0,0,/wiki/Carbon_Arc_Welding,0,Carbon arc welding,Carbon arc welding (CAW) is a process which pr...
1,1,/wiki/Flux_Cored_Arc_Welding,0,Flux cored arc welding,<!-- \nNewPP limit report\nParsed by mw1493\nC...
2,2,/wiki/Gas_Metal_Arc_Welding,0,Gas metal arc welding,"\n\n\nGas metal arc welding (GMAW), sometimes ..."
3,3,/wiki/Shielding_gas,0,Shielding gas,Shielding gases are inert or semi-inert gases ...
4,4,/wiki/Plasma_Arc_Welding,0,Plasma arc welding,Plasma arc welding (PAW) is an arc welding pro...


In [19]:
df_new['summary'] = df_new['summary'].str.split().str[:200].str.join(' ')

In [20]:
vectoriser = SentenceTransformer("all-MiniLM-L12-v2")

# Ensure that the model vectorises up to 512 tokens
# vectoriser.max_seq_length = 512

embeddings = vectoriser.encode(df_new['summary'].tolist(), show_progress_bar=True)
print(embeddings.shape)

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

(530, 384)


In [93]:
# Serialize embeddings and save (optional)
# with open('data/embeddings.pickle', 'wb') as f:
#     pickle.dump(embeddings, f)

## Predictions

Two use cases.

In [85]:
unknown_process = ' finishing operation of high-precision holes performed with a multi-edge tool. High surface finish, superb hole quality, and close dimensional tolerance are achieved at high penetration rates and small depths of cut.'
unknown_process_embedding = vectoriser.encode(unknown_process)

cosine_similarities = cosine_similarity(X=embeddings, Y=unknown_process_embedding[np.newaxis,:])

top5 = np.argsort(cosine_similarities.flatten())[-5:][::-1]

df_new[['title', 'summary']].iloc[top5, :]

Unnamed: 0,title,summary
141,Laser drilling,Laser drilling is the process of creating thru...
118,Drilling,Drilling is a cutting process where a drill bi...
124,Sharpening,Sharpening is the process of creating or refin...
89,Notching,Notching is a metal-cutting process used on sh...
344,Surface finishing,Surface finishing is a broad range of industri...


In [91]:
desired_process = 'I want to produce small metal part with complex geometry. I dont like to use cutting processes. Cost needs to be low. The volume is high.'
desired_process_embedding = vectoriser.encode(desired_process)

cosine_similarities = cosine_similarity(X=embeddings, Y=desired_process_embedding[np.newaxis,:])

top10 = np.argsort(cosine_similarities.flatten())[-20:][::-1]

df_new[['title', 'summary']].iloc[top10, :]

Unnamed: 0,title,summary
48,Metal fabrication,Metal fabrication is the creation of metal str...
72,Forming (metalworking),"In metalworking, forming is the fashioning of ..."
52,Outline of metalworking,This article is a list of terms commonly used ...
33,Metalworking,Metalworking is the process of shaping and res...
50,Machining,Machining is a process in which a material (of...
212,Plasma cutting,Plasma cutting is a process that cuts through ...
49,Forming processes,Some of example of forming processes are: Forg...
178,Laser engineered net shaping,"Laser powder forming, also known by the propri..."
88,Cutting,Cutting is the separation or opening of a phys...
55,Continuous casting,"Continuous casting, also called strand casting..."


## Visualize embeddings

In [102]:
my_umap = UMAP(n_components=2)
umap_embeddings = my_umap.fit_transform(embeddings)

In [106]:
texts = pd.DataFrame({
    'text': df_new.title,
    'x': umap_embeddings[:, 0],
    'y': umap_embeddings[:, 1],
})

texts.head()

Unnamed: 0,text,x,y
0,Carbon arc welding,8.858342,6.32082
1,Flux cored arc welding,5.421583,2.265293
2,Gas metal arc welding,8.949512,6.420838
3,Shielding gas,8.898228,6.489328
4,Plasma arc welding,8.949606,6.551714


In [107]:
texts.to_csv("data/bulk_st.csv")