# 02 Make Embeddings

## Imports and setup

In [1]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call is visible in this notebook - confirm this magic is still needed.
%matplotlib inline

In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle, textwrap, random, os

from sentence_transformers import SentenceTransformer, util

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

In [8]:
# Show up to 400 characters per cell so the long 'summary' texts are readable in previews.
pd.set_option('display.max_colwidth', 400)

In [9]:
# Fetch the NLTK stop-word corpus; the TF-IDF section reads nltk.corpus.stopwords from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hrist\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Single seed for every source of randomness used in this notebook.
SEED = 2023
random.seed(SEED)
# pandas' DataFrame.sample draws from NumPy's global RNG (not from `random`),
# so NumPy has to be seeded as well for the sampled previews to be repeatable.
np.random.seed(SEED)

## Load data

In [11]:
# Keep only the two columns used below: 'title' (shown in the search results)
# and 'summary' (the text that gets embedded).
df = (pd
      .read_csv('data/manufacturing_articles.csv')
      .loc[:,['title', 'summary']]
     )

display(df.shape)
# Pin the preview to SEED so the same two rows are shown on every run,
# regardless of how many random draws happened earlier in the kernel.
display(df.sample(2, random_state=SEED))

(527, 2)

Unnamed: 0,title,summary
426,Water industry,"The water industry provides drinking water and wastewater services (including sewage treatment) to residential, commercial, and industrial sectors of the economy. Typically public utilities operate water supply networks. The water industry does not include manufacturers and suppliers of bottled water, which is part of the beverage production and belongs to the food sector.\nThe water industry ..."
465,Professional services,"Professional services are occupations in the service sector requiring special training in liberal arts and pure sciences education or professional development education. Some professional services, such as architects, accountants, engineers, doctors, and lawyers require the practitioner to hold professional degrees or licenses and possess specific skills. Other professional services involve pr..."


## Calculate embeddings

### Sentence Transformer

In [39]:
output_filepath = 'data/transformer_embeddings.pickle'

# The model is created unconditionally: even when the article embeddings come
# from the cache, the search cells below still need it to encode the query.
# (Previously it was only defined in the `else` branch, so a fresh kernel with
# an existing cache file failed later with a NameError.)
transformer_vectoriser = SentenceTransformer("all-MiniLM-L12-v2")
transformer_vectoriser.max_seq_length = 512

# Load if embeddings already exist.
if os.path.exists(output_filepath):
    # NOTE: pickle.load can execute arbitrary code - only load a cache file
    # that was written by this notebook.
    with open(output_filepath, 'rb') as f:
        transformer_embeddings = pickle.load(f)
    print(f'Loaded embeddings from "{output_filepath}".')

# Otherwise:
else:
    # 1) calculate the encodings
    print('Start calculating embeddings...')
    transformer_embeddings = transformer_vectoriser.encode(df['summary'].tolist(), show_progress_bar=True)
    print(f'Completed! Embeddings shape:{transformer_embeddings.shape}')

    # 2) save them locally
    with open(output_filepath, 'wb') as f:
        pickle.dump(transformer_embeddings, f)
    print(f'Embeddings saved to "{output_filepath}".')

# The search functions map embedding rows back to `df` by position, so a cache
# built from a different version of the CSV would silently return wrong titles.
assert transformer_embeddings.shape[0] == len(df), (
    f'Embeddings ({transformer_embeddings.shape[0]} rows) do not match df ({len(df)} rows); '
    f'delete "{output_filepath}" and re-run this cell.'
)

print(f'Transformer embeddings shape: {transformer_embeddings.shape}')

Loaded embeddings from "data/transformer_embeddings.pickle".
Transformer embeddings shape: (527, 384)


### TF-IDF

In [109]:
output_filepath = 'data/tfidf_embeddings.pickle'

# Unlike the transformer model, a TF-IDF vectorizer has to be *fitted* on the
# corpus before it can transform a query, and the cached matrix alone cannot
# restore that fitted state. Fitting is cheap, so the vectorizer and the
# embeddings are always rebuilt together; this keeps `tfidf_vectorizer`
# defined on a fresh kernel and guarantees it matches `tfidf_embeddings`.
stopwords = (nltk
         .corpus
         .stopwords
         .words(fileids=['english',])
        )

tfidf_vectorizer = TfidfVectorizer(
    input = "content",
    analyzer = "word",
    min_df = 2,          # ignore terms that appear in fewer than 2 summaries
    max_df = 0.95,       # ignore terms that appear in more than 95% of summaries
    ngram_range = (1,2), # unigrams and bigrams
    stop_words = stopwords,
    sublinear_tf = True,
)

# fillna('') instead of dropna(): a missing summary becomes an all-zero row
# rather than being removed, so row i of the matrix always describes df.iloc[i]
# (the search functions look titles up by position).
tfidf_embeddings = tfidf_vectorizer.fit_transform(df['summary'].fillna('').tolist())

# Save the matrix locally, mirroring the transformer embeddings file.
with open(output_filepath, 'wb') as f:
    pickle.dump(tfidf_embeddings, f)
print(f'Embeddings saved to "{output_filepath}".')

Embeddings saved to "data/tfidf_embeddings.pickle".


## Semantic search

### Search functions

In [110]:
def transformer_semantic_search(
    query: str,
    top_n: int,
    df_data: pd.DataFrame,
    embeddings: np.ndarray,
    vectoriser,
) -> pd.DataFrame:
    """
    Perform semantic search to find the articles most similar to a search query,
    using Sentence Transformer embeddings.

    Args:
        query (str): Free-text description to search for.
        top_n (int): The number of most similar articles to return. Values
            larger than the number of articles return all of them; zero or
            negative values return an empty frame.
        df_data (pd.DataFrame): Articles, with a 'title' column. Row i must
            correspond to row i of `embeddings`.
        embeddings (np.ndarray): One embedding per article.
        vectoriser: Object whose `encode(query)` returns the query embedding
            as a 1-D array.

    Returns:
        pd.DataFrame: Columns 'title' and 'similarity', ordered from most to
        least similar, with a fresh 0..k-1 index.
    """
    query_embedding = vectoriser.encode(query)
    # cos_sim yields one column of scores (n_articles x 1); flatten it to a
    # plain 1-D array so it can be indexed and assigned as a column.
    similarities = np.asarray(
        util.cos_sim(embeddings, query_embedding[np.newaxis,:])
    ).ravel()
    # Take the front of a descending ordering. The earlier `[-top_n:]` slice
    # returned *every* row for top_n == 0, because `[-0:]` is the whole array.
    top_n_indexes = np.argsort(-similarities, kind='stable')[:max(top_n, 0)]

    print(f'Top {top_n} most similar articles using Sentence Transformer encodings: \n', '-'*40)

    return (df_data
             .iloc[top_n_indexes]
             .assign(similarity = similarities[top_n_indexes])
             .loc[:, ['title', 'similarity']]
             .reset_index(drop=True)
             )

In [111]:
def tfidf_semantic_search(
    query: str,
    top_n: int,
    df_data: pd.DataFrame,
    embeddings,
    vectoriser,
) -> pd.DataFrame:
    """
    Perform a search to find the articles most similar to a search query,
    using TF-IDF vectors.

    Args:
        query (str): Free-text description to search for.
        top_n (int): The number of most similar articles to return. Values
            larger than the number of articles return all of them; zero or
            negative values return an empty frame.
        df_data (pd.DataFrame): Articles, with a 'title' column. Row i must
            correspond to row i of `embeddings`.
        embeddings: One TF-IDF vector per article (array or sparse matrix, as
            accepted by `cosine_similarity`).
        vectoriser: Fitted object whose `transform([query])` returns the query
            vector in the same feature space as `embeddings`.

    Returns:
        pd.DataFrame: Columns 'title' and 'similarity', ordered from most to
        least similar, with a fresh 0..k-1 index.
    """
    query_embedding = vectoriser.transform([query,])
    # cosine_similarity yields one column of scores (n_articles x 1); flatten
    # it to a plain 1-D array so it can be indexed and assigned as a column.
    similarities = np.asarray(cosine_similarity(embeddings, query_embedding)).ravel()
    # Take the front of a descending ordering. The earlier `[-top_n:]` slice
    # returned *every* row for top_n == 0, because `[-0:]` is the whole array.
    top_n_indexes = np.argsort(-similarities, kind='stable')[:max(top_n, 0)]

    print(f'Top {top_n} most similar articles using TFIDF encodings: \n', '-'*40)

    return (df_data
             .iloc[top_n_indexes]
             .assign(similarity = similarities[top_n_indexes])
             .loc[:, ['title', 'similarity']]
             .reset_index(drop=True)
             )

### Expected result - hydroforming

In [112]:
def search_for(query):
    """
    Display the three best matches for `query` from both search back-ends.

    The Sentence Transformer search runs and is displayed first, the TF-IDF
    search second. Reads the notebook globals `df`, `transformer_embeddings`,
    `transformer_vectoriser`, `tfidf_embeddings` and `tfidf_vectorizer`.

    Args:
        query (str): Free-text description to search for.
    """
    top_n = 3

    transformer_hits = transformer_semantic_search(
        query, top_n, df[['title']], transformer_embeddings, transformer_vectoriser
    )
    display(transformer_hits)

    tfidf_hits = tfidf_semantic_search(
        query, top_n, df[['title']], tfidf_embeddings, tfidf_vectorizer
    )
    display(tfidf_hits)
    
# Free-text description of the process named in this section's heading (hydroforming).
# dedent strips the common indentation; the literal's line breaks stay part of the query.
query = textwrap.dedent("""
    Manufacturing process used to shape metal components using high-pressure fluid. 
    It involves placing a metal sheet or tube over a die and sealing it with a fluid-pressurized bladder. 
    As the fluid pressure increases, the metal deforms to take the shape of the die, 
    resulting in complex and seamless structures with enhanced strength and structural integrity.
""")

search_for(query)

Top 3 most similar articles using Sentence Transformer encodings: 
 ----------------------------------------


Unnamed: 0,title,similarity
0,Hydroforming,0.605506
1,Metal fabrication,0.596422
2,Explosive forming,0.592556


Top 3 most similar articles using TFIDF encodings: 
 ----------------------------------------


Unnamed: 0,title,similarity
0,Hot metal gas forming,0.124376
1,Screw thread,0.08456
2,Cold sizing,0.078514


### Expected result - sheet metal blank

In [113]:
# Free-text description of the process named in this section's heading (sheet metal blank).
query = textwrap.dedent("""
    manufacturing operation that involves cutting out flat shapes from a sheet of metal material.
    It is typically achieved by using a specialized tool and die set. 
    The process helps create precise and identical parts, and the cut-out pieces, known as blanks, 
    can be further processed or used as-is for various applications
""")

search_for(query)

Top 3 most similar articles using Sentence Transformer encodings: 
 ----------------------------------------


Unnamed: 0,title,similarity
0,Metal fabrication,0.597248
1,Cutting,0.532732
2,Blanking and piercing,0.52007


Top 3 most similar articles using TFIDF encodings: 
 ----------------------------------------


Unnamed: 0,title,similarity
0,Impact extrusion,0.107972
1,Drawing (manufacturing),0.083101
2,Cutting,0.076372


### Expected result - metal spinning

In [114]:
# Free-text description of the process named in this section's heading (metal spinning).
query = textwrap.dedent("""
    fabrication technique that utilizes rotational forces to shape a workpiece into a desired form. 
    It involves the use of a lathe-like machine where a malleable material is rotated at high speed while gradually
    formed into a three-dimensional shape. 
    This process enables the production of curved and symmetrical objects with excellent precision and surface finish.
""")

search_for(query)

Top 3 most similar articles using Sentence Transformer encodings: 
 ----------------------------------------


Unnamed: 0,title,similarity
0,Laminated object manufacturing,0.513743
1,Metal fabrication,0.508376
2,Metal spinning,0.502959


Top 3 most similar articles using TFIDF encodings: 
 ----------------------------------------


Unnamed: 0,title,similarity
0,Flattening,0.109792
1,Abrasive jet machining,0.099775
2,Refining,0.092425


In [None]:
# Requirements-style query: it states a sealing problem rather than describing one named process.
query = textwrap.dedent("""
    I want to seal two metal surfaces - casted aluminum box and an aluminum cover.
    I want tight water-proof seal, no need to seal pressure. However, it needs to be robust,
    last long (e.g. 10 years), and be relatively cheap and fast to assemble.
""")
search_for(query)

Top 3 most similar articles using Sentence Transformer encodings: 
 ----------------------------------------


Unnamed: 0,title,similarity
0,Hydroforming,0.334074
1,Metallizing,0.329335
2,Coating,0.3147


Top 3 most similar articles using TFIDF encodings: 
 ----------------------------------------


Unnamed: 0,title,similarity
0,Ultrasonic welding,0.03174
1,Etching,0.026784
2,Permanent mold casting,0.025766
