In [1]:
%matplotlib inline

In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
import json
import random
import re
import requests
from typing import List, Dict, Any

In [13]:
SEED = 2023
random.seed(SEED)

# Classification of Manufacturing Processes

## Problem


## Objectives
Give ideas to design engineers by visualizing processes. Represent the manufacturing processes as points in 2D space. Examples include drilling, polishing, bending, extrusion est.

## Data

https://en.wikipedia.org/wiki/List_of_manufacturing_processes

https://en.wikipedia.org/wiki/List_of_ISO_standards

## Download Data

In [91]:
def extract_hyperlinks(url: str) -> List[str]:
    """
    Extracts all hyperlinks from a web page and returns them as a list of strings.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    hyperlinks = [a_tag.get('href') for a_tag in soup.find_all('a')]

    return hyperlinks

In [92]:
def filter_hyperlinks(hyperlinks: List[str]) -> List[str]:
    """
    Filters a list of hyperlinks based on certain criteria.
    """
    # Drop duplicates
    valid_hyperlinks = list(set(hyperlinks))

    # Drop None values
    valid_hyperlinks = [h for h in valid_hyperlinks if h is not None]

    # Drop unrelated links
    PREFIXES = ['Category:',
                'File:',
                'Help:',
                'Special:',
                'Talk:',
                'Wikipedia:',
                'Main_Page|Portal:',
                'List_of']
    PATTERN = fr'^/wiki/(?!{"|".join(PREFIXES)}).*'
    valid_hyperlinks = [h for h in valid_hyperlinks if re.match(PATTERN, h)]

    return valid_hyperlinks

In [98]:
urls = [
    'https://en.wikipedia.org/wiki/List_of_welding_processes',
    'https://en.wikipedia.org/wiki/List_of_manufacturing_processes',
    'https://en.wikipedia.org/wiki/Industrial_processes'
]

# Collect hyperlinks
hyperlinks = []
for url in urls:
    hyperlinks.extend(extract_hyperlinks(url))
print(f'{len(hyperlinks)} hyperlinks extracted.')

1716 hyperlinks extracted.


In [99]:
# Filter hyperlinks
hyperlinks = filter_hyperlinks(hyperlinks)
print(f'{len(hyperlinks)} hyperlinks remain after filtering.')
print('Example: ', random.sample(hyperlinks, 3))

1111 hyperlinks remain after filtering.
Example:  ['/wiki/System_testing', '/wiki/Real_estate_agent', '/wiki/Hunting']


In [112]:
def hyperlinks_to_titles(s: pd.Series) -> pd.Series:
    """Converts a wikipedia hyperlinks to wikipedia titles."""
    return s.str.replace('/wiki/', '').str.replace('_', ' ')

In [116]:
df = pd.DataFrame({
    'hyperlink': pd.Series(hyperlinks),
})

df['title'] = hyperlinks_to_titles(df['hyperlink'])
df.head()

Unnamed: 0,hyperlink,title
0,/wiki/Foundry,Foundry
1,/wiki/Poultry_farming,Poultry farming
2,/wiki/Friction_stir_welding,Friction stir welding
3,/wiki/Car_dealership,Car dealership
4,/wiki/Lease,Lease


In [117]:
def recursive_dict_search(data: Dict[Any, Any], key: Any) -> Any:
    """
    Recursively searches a nested dictionary for a given key and returns its value. 
    """
    for k, v in data.items():
        if k == key:
            return v
        elif isinstance(v, dict):
            result = recursive_dict_search(v, key)
            if result is not None:
                return result
    return None


def retrieve_summary(title: str) -> str:
    """
    Retrieves a summary of a Wikipedia article based on a given article title.
    """
    api_url = f'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles={title}&format=json'
    response = requests.get(api_url)
    try:
        json_data = response.json()
        summary = recursive_dict_search(json_data['query'], 'extract')
    except json.JSONDecodeError:
        summary = ''

    return summary

In [118]:
df['summary'] = df['title'].apply(retrieve_summary)

df.head()


KeyboardInterrupt



In [256]:
def clean_summary(summary: str) -> str:
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [265]:
df['words_count'] = df.clean_summary.str.split().str.len()

## Obsolete cells

In [251]:
def filter_hrefs(hrefs: List[str],
                 start_str: str,
                 end_str: str,
                 starts_with: str) -> List[str]:
    """
    Extract a sublist of hrefs that occur between two specific strings and starts with specific string.
    """
    start_index = hrefs.index(start_str)
    end_index = hrefs.index(end_str)
    hrefs = hrefs[start_index: end_index + 1]

    hrefs = [h for h in hrefs if h.startswith(starts_with)]

    return hrefs


hrefs = extract_hrefs(url)
hrefs = filter_hrefs(hrefs, start_str='/wiki/Casting',
                     end_str='/wiki/Bake-out', starts_with='/wiki')
hrefs[:5]

['/wiki/Casting',
 '/wiki/Centrifugal_casting_(industrial)',
 '/wiki/Continuous_casting',
 '/wiki/Die_casting',
 '/wiki/Evaporative-pattern_casting']