# YAKE: Keyword Extraction and HS-Code Prediction

In [None]:
pip install git+https://github.com/LIAAD/yake

Collecting git+https://github.com/LIAAD/yake
  Cloning https://github.com/LIAAD/yake to /tmp/pip-req-build-dzf520kd
  Running command git clone --filter=blob:none --quiet https://github.com/LIAAD/yake /tmp/pip-req-build-dzf520kd
  Resolved https://github.com/LIAAD/yake to commit 374fc1c1c19eb080d5b6115cbb8d4a4324392e54
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting segtok (from yake==0.4.8)
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting jellyfish (from yake==0.4.8)
  Downloading jellyfish-1.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: yake
  Building wheel for yake (setup.py) ... [?25l[?25hdone
  Created wheel for yake: filename=yake-0.4.8-py2.py3-none-any.whl size=62570 sha256=a476bdba5f75bdb24c5cf24e1cad25f32aa35d9f3e84a98291e20ba8d585fe68
  Stored in directory: /tm

In [None]:
import yake
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
# Shared YAKE extractor created with the library's default settings; it is
# reused below to pull keywords out of the concatenated descriptions.
kw_extractor = yake.KeywordExtractor()


In [None]:
# Path to the CSV holding the labelled records. The cells below read its
# 'HS_Code' and 'Description' columns.
file_path = 'main_dataset.csv'

# Load the CSV into a Pandas DataFrame
df = pd.read_csv(file_path)

# Evaluation subset: capped at the number of available rows so a small file
# does not raise, and seeded so the reported metrics are reproducible.
test_df = df.sample(n=min(2000, len(df)), random_state=42)

In [None]:
# Distinct HS codes present in the dataset (set order, i.e. arbitrary).
hs_code_values = df['HS_Code'].tolist()
unique_code = list(set(hs_code_values))
print("unique code number: ", len(unique_code))

unique code number:  520


In [None]:
# create a hscode - keyword map
import json

# Join each code's descriptions in a single groupby pass instead of
# re-filtering the whole DataFrame once per code. Missing descriptions are
# dropped (a NaN would make " ".join raise) and values are coerced to str.
# Row order inside each group is preserved, so the joined text is unchanged.
descriptions_by_code = (
    df.dropna(subset=['Description'])
    .groupby('HS_Code')['Description']
    .agg(lambda texts: " ".join(map(str, texts)))
    .to_dict()
)

code_keyword_map = {}
for code in tqdm(unique_code):
    concat_description = descriptions_by_code.get(code, "")
    if concat_description:
        # Keep only the first 8 entries returned by the extractor.
        keywords = kw_extractor.extract_keywords(concat_description)[:8]
    else:
        # No usable description text for this code.
        keywords = []
    # Each entry's first element is the keyword text; store it lowercased.
    code_keyword_map[code] = [keyword[0].lower() for keyword in keywords]

# Saving the dictionary to a JSON file
with open('code_keyword_map.json', 'w', encoding='utf-8') as file:
    json.dump(code_keyword_map, file)

print("Saved code_keyword_map to code_keyword_map.json")

100%|██████████| 520/520 [02:40<00:00,  3.24it/s]


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def jaccard_similarity(sentence1, sentence2):
    """Return the Jaccard similarity between the word sets of two sentences.

    Sentences are tokenised on whitespace; the score is
    ``len(intersection) / len(union)`` of the two word sets.

    Args:
        sentence1: First sentence (str).
        sentence2: Second sentence (str).

    Returns:
        A float in [0, 1]. When neither sentence contains a word the union is
        empty and 0.0 is returned instead of dividing by zero.
    """
    words1 = set(sentence1.split())
    words2 = set(sentence2.split())

    union = words1 | words2
    if not union:
        # Both inputs are empty or whitespace only: nothing to compare.
        return 0.0

    return len(words1 & words2) / len(union)

def calculate_accuracy(list1, list2):
    """Return the fraction of positions at which the two lists hold equal items.

    Raises:
        ValueError: If the lists differ in length.
    """
    total = len(list1)
    if total != len(list2):
        raise ValueError("The two lists must have the same length")

    # Count position-wise matches.
    hits = 0
    for left, right in zip(list1, list2):
        if left == right:
            hits += 1

    return hits / total

def calculate_cosine_similarity(sentence1, sentence2):
    """Return the cosine similarity of two sentences' word-count vectors.

    A fresh CountVectorizer is fitted on just the two sentences, so the
    vocabulary is whatever tokens the vectorizer extracts from them.

    Args:
        sentence1: First sentence (str).
        sentence2: Second sentence (str).

    Returns:
        The similarity as a float. If the vectorizer cannot extract a single
        token from either sentence (it raises ValueError for an empty
        vocabulary), 0.0 is returned.
    """
    # Create a CountVectorizer to count the word frequencies
    vectorizer = CountVectorizer()

    try:
        # Vectorize the sentences
        vectors = vectorizer.fit_transform([sentence1, sentence2]).toarray()
    except ValueError:
        # Empty vocabulary: neither sentence produced a token, so there is
        # nothing in common to measure.
        return 0.0

    # Calculate cosine similarity
    cos_sim = cosine_similarity(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1))

    # cosine_similarity returns a 1x1 matrix here; unwrap the single value
    return cos_sim[0][0]

Cosine Similarity: 0.5163977794943223


In [None]:
import re

# Markers stripped by clean(). The text is lowercased and digit-free by the
# time these run, so "not adjacent to a letter" is expressed with [a-z]
# lookarounds. That keeps words such as "widths" or "barcode" intact, which a
# bare "hs" / "code" substring match would mangle.
_HS_MARKER_PATTERNS = [
    # "hs" as its own token, or glued in front of "code" as in "hscode"
    re.compile(r"(?<![a-z])hs(?:(?=code(?![a-z]))|(?![a-z]))"),
    # "h.s." (may be followed directly by "code", as in "h.s.code")
    re.compile(r"(?<![a-z])h\.s\."),
    # "code" as its own token
    re.compile(r"(?<![a-z])code(?![a-z])"),
]


def clean(input_str):
    """Normalise a description before it is compared with keyword lists.

    Steps: lowercase the text, delete every run of digits, remove the
    "hs", "h.s." and "code" markers, then strip surrounding whitespace.
    Punctuation and inner whitespace are left untouched.

    Args:
        input_str: Raw description. Anything that is not a str (e.g. a
            missing value) is treated as empty text.

    Returns:
        The cleaned, lowercased string ("" for non-string input).
    """
    if not isinstance(input_str, str):
        return ""

    # Lowercase, then remove all numbers
    input_str = input_str.lower()
    input_str = re.sub(r'\d+', '', input_str)

    # Remove the HS-code markers (whole tokens only)
    for pattern in _HS_MARKER_PATTERNS:
        input_str = pattern.sub('', input_str)

    # Return the modified string
    return input_str.strip()

# Example usage:
string = "This is a test string123 with hs, h.s., and code inside."
print(clean(string))

this is a test string with , , and  inside.


In [None]:
# you can use calculate_cosine_similarity or jaccard_similarity
def find_most_matching_keyword(description, code_keyword_map):
    """Return the code whose keyword list is most similar to the description.

    Args:
        description: Raw description text; it is passed through clean() once.
        code_keyword_map: Mapping of code -> list of keyword strings.

    Returns:
        The code with the highest similarity, or None when no code scores
        above 0. On ties the code that comes first in the map wins, because
        only a strictly greater score replaces the current best.
    """
    # Cleaning does not depend on the code, so do it once outside the loop.
    cleaned_description = clean(description)

    best_match = None
    max_similarity = 0
    for code, keyword_list in code_keyword_map.items():
        similarity = calculate_cosine_similarity(cleaned_description, " ".join(keyword_list))
        if similarity > max_similarity:
            max_similarity = similarity
            best_match = code
    return best_match


# find top 5 best code by similarity metrics
def find_alternate_code(description, code_keyword_map, top_n=5):
    """Return the best-scoring codes for a description with their similarities.

    Args:
        description: Raw description text; it is passed through clean() once.
        code_keyword_map: Mapping of code -> list of keyword strings.
        top_n: Maximum number of codes to return (default 5).

    Returns:
        A dict of code -> similarity holding at most ``top_n`` entries, ordered
        from highest to lowest similarity. Codes with a similarity of 0 are
        left out, so the dict can be empty. Ties keep the order of the map
        (the sort is stable).
    """
    # Cleaning does not depend on the code, so do it once outside the loop.
    cleaned_description = clean(description)

    alternate = {}

    # Loop through each code and its associated keyword list
    for code, keyword_list in code_keyword_map.items():
        similarity = calculate_cosine_similarity(cleaned_description, " ".join(keyword_list))

        # If there's some similarity, add to the alternate dictionary
        if similarity > 0:
            alternate[code] = similarity

    # Sort by similarity in descending order and keep the top entries
    ranked = sorted(alternate.items(), key=lambda item: item[1], reverse=True)
    sorted_alternate = dict(ranked[:top_n])

    return sorted_alternate



# Score every description against every code once. find_alternate_code returns
# its matches ordered by descending similarity with ties kept in map order, so
# its first key is exactly the code find_most_matching_keyword would return
# (and it is empty exactly when that function returns None). Deriving the
# prediction from it avoids a second full pass over code_keyword_map per row.
alternates = test_df['Description'].apply(lambda x: find_alternate_code(x, code_keyword_map))

test_df['Prediction'] = alternates.apply(lambda alt: next(iter(alt), None))
test_df['KeyWord'] = test_df['HS_Code'].apply(lambda x: " ".join(code_keyword_map[x]) if not pd.isna(x) else None)
test_df['Prediction_Keyword'] = test_df['Prediction'].apply(lambda x: " ".join(code_keyword_map[x]) if not pd.isna(x) else None)
test_df['Alternate'] = alternates

# only keep high confidence
# test_df = test_df[test_df['Alternate'].apply(lambda x: max(x.values()) >= 0.3)]
# drop na (this also removes rows for which no code scored above 0)
test_df = test_df.dropna()

test_df

Unnamed: 0,HS_Code,Description,Prediction,KeyWord,Prediction_Keyword,Alternate
15268,560122,FOODGRADE OTHER MANUFACTURER TOBACCO 56012210 ...,520622,goods acrylic yarn piece goods acrylic textile...,kgs freight prepaid contamination free cotton ...,"{550320: 0.12121212121212122, 520544: 0.117647..."
34232,540793,"FURNISHING FABRICS . H.S.CODE NO: 54079300, 55...",551692,yarns yarns yarns yarns synthetic yarns synthe...,furnishing fabrics commercial furnishing fabri...,"{540793: 0.2857142857142857, 540792: 0.25, 551..."
10307,551521,"WOMENS MAN-MADE FIBERS, KNIT SWEATER50% NYLON4...",611780,chair acrylic stool lucite chair acrylic acryl...,wood packing material solid wood packing wood ...,"{611780: 0.16666666666666666, 551521: 0.16, 61..."
10284,551521,"WOMENS MAN-MADE FIBERS, KNIT SWEATER50% NYLON4...",611780,chair acrylic stool lucite chair acrylic acryl...,wood packing material solid wood packing wood ...,"{611780: 0.16666666666666666, 551521: 0.16, 61..."
7926,540742,FABRIC 49 ROLLS (3 PLTS) HS CODE: 5407.42 AMS#...,540771,color multicam lot nylon color multicam percen...,woven fabric rolls fabric rolls fabric rolls g...,"{551321: 0.15384615384615385, 540771: 0.142857..."
...,...,...,...,...,...,...
42688,550942,FABRIC HS 55094200,551614,fiber hs code kynol fiber black hs code kevlar...,fabric woven fabric fabric satin stripe woven ...,"{511130: 0.3333333333333333, 551633: 0.25, 540..."
23575,560392,NONWOVEN FABRICS HS:56039290,551692,woven fabric hs-code spunlace nonwoven fabric ...,furnishing fabrics commercial furnishing fabri...,"{560392: 0.2, 540793: 0.16666666666666666, 540..."
11585,551612,100/ 100 POLYESTER CURTAIN FABRICS HTS CODE: 5...,551519,polyester curtain fabrics fabrics hts code cur...,polyester fabrics polyester polyester woven fa...,"{540771: 0.25, 551612: 0.25, 540753: 0.2, 5111..."
11861,551613,WOVEN FABRICS OF ARTIFICIAL STAPLE FIBERS,560314,grass yarnsartificial grass yarnsartificial gr...,woven fabrics laminated woven coated fabrics c...,"{551519: 0.3, 551613: 0.3, 520952: 0.3, 550820..."


In [None]:
# Repeats the dropna() already applied at the end of the previous cell, so it
# leaves test_df unchanged when the cells are run in order.
test_df = test_df.dropna()

In [None]:
calculate_accuracy(test_df['Prediction'].tolist(), test_df['HS_Code'].tolist())

0.42520491803278687

In [None]:
from sklearn.metrics import precision_score, f1_score

def calculate_precision_f1(y_true, y_pred):
    """Return ``(precision, f1)`` for the given labels, both macro-averaged.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    return (
        precision_score(y_true, y_pred, average="macro"),
        f1_score(y_true, y_pred, average='macro'),
    )

# Ground truth (keywords of the actual HS code) goes first and the prediction
# second, matching calculate_precision_f1(y_true, y_pred). With the arguments
# the other way round the value printed as "Precision" is really the recall.
precision, f1 = calculate_precision_f1(test_df['KeyWord'].tolist(), test_df['Prediction_Keyword'].tolist())

print("Precision:", precision)
print("F1 Score:", f1)

# Other/Improving

In [None]:
import pandas as pd

def rank_labels_by_accuracy(predicted_labels, true_labels):
    """Build a per-label accuracy table, best-predicted labels first.

    Args:
        predicted_labels: Predicted label for each sample.
        true_labels: Ground-truth label for each sample (same length).

    Returns:
        A DataFrame with one row per distinct true label and the columns
        "Label", "Accuracy" (correct / occurrences of that label) and
        "Success Rate" (the same figures as "<correct> out of <occurrences>"),
        sorted by "Accuracy" in descending order.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(predicted_labels) != len(true_labels):
        raise ValueError("Both lists must have the same length.")

    totals = {}  # true label -> number of samples carrying it
    hits = {}    # label -> number of samples predicted correctly
    for guess, actual in zip(predicted_labels, true_labels):
        totals[actual] = totals.get(actual, 0) + 1
        if guess == actual:
            hits[guess] = hits.get(guess, 0) + 1

    # One row per true label, in order of first appearance
    rows = [
        [label, hits.get(label, 0) / total, f"{hits.get(label, 0)} out of {total}"]
        for label, total in totals.items()
    ]

    ranking = pd.DataFrame(rows, columns=["Label", "Accuracy", "Success Rate"])
    return ranking.sort_values(by="Accuracy", ascending=False)

# Example usage:
# NOTE(review): the toy `predicted` / `true` lists below are not passed to the
# call; the ranking shown is computed from the real test_df columns.
predicted = ["A", "B", "A", "C", "B", "A", "C", "C"]
true = ["A", "B", "B", "C", "A", "A", "B", "C"]

rank_labels_by_accuracy(test_df['Prediction'].tolist(), test_df['HS_Code'].tolist())


In [None]:
# Misclassified rows, derived here so this cell does not depend on a
# `wrong_df` that is only created in a later cell (it does not exist yet when
# the notebook is run top to bottom).
wrong_df = test_df[test_df['HS_Code'] != test_df['Prediction']]

# Filter rows where HS_Code is not in Alternate.keys()
filtered_df = wrong_df[~wrong_df.apply(lambda row: row['HS_Code'] in row['Alternate'], axis=1)]
filtered_df

In [None]:
# NOTE(review): `precision_recall` is not defined or imported in the cells
# visible here; unless it comes from elsewhere this raises NameError on a
# fresh kernel. Confirm its origin or replace the call.
print(precision_recall(test_df['Prediction'].tolist(), test_df['HS_Code'].tolist()))

In [None]:
test_df

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def calc_precision(extr_keywords, ref_keywords):
    """Return the share of distinct extracted tokens that also occur in the reference.

    Both arguments are strings that are split on single spaces; duplicate
    tokens count once.
    """
    extracted = set(extr_keywords.split(' '))
    reference = set(ref_keywords.split(' '))
    return len(extracted & reference) / len(extracted)
def avg_precision(ex_kw_list, ref_kw_list):
    """Return the mean calc_precision over paired keyword strings, rounded to 3 decimals.

    The sum runs over the zipped pairs and is divided by len(ex_kw_list).
    """
    total = sum(
        calc_precision(extracted, reference)
        for extracted, reference in zip(ex_kw_list, ref_kw_list)
    )
    return round(total / len(ex_kw_list), 3)
def avg_cosine_sim(extr_sentence_embeddings, ref_sentence_embeddings):
    """Return the mean pairwise cosine similarity, rounded to 3 decimals.

    Embeddings are paired by position (extracted vs. reference) and the summed
    similarity is divided by len(extr_sentence_embeddings).
    """
    total = sum(
        cosine_similarity([extracted], [reference])[0][0]
        for extracted, reference in zip(extr_sentence_embeddings, ref_sentence_embeddings)
    )
    return round(total / len(extr_sentence_embeddings), 3)

print(avg_precision(test_df['Prediction_Keyword'].tolist(), test_df['KeyWord'].tolist()))

0.563


In [None]:
# Subsample for the embedding comparison: capped at the rows left in test_df
# (sampling more rows than exist raises) and seeded for reproducibility.
dfff = test_df.sample(n=min(1000, len(test_df)), random_state=42)

In [None]:
# Load the SentenceTransformer model named 'bert-base-nli-mean-tokens'; it is
# used in the following cells to encode the keyword strings.
model = SentenceTransformer('bert-base-nli-mean-tokens')


In [None]:
# `model` is already loaded by the previous cell; constructing it a second
# time here would only repeat the model initialisation.
extr_sentence_embeddings = model.encode(test_df['Prediction_Keyword'].tolist())
ref_sentence_embeddings = model.encode(test_df['KeyWord'].tolist())

Downloading (…)821d1/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading (…)d1/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)01e821d1/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)821d1/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1e821d1/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

KeyboardInterrupt: ignored

In [None]:
avg_cosine_sim(extr_sentence_embeddings, ref_sentence_embeddings)

0.778

In [None]:
# NOTE(review): this overwrites the embeddings computed from the full test_df
# with ones for the `dfff` subsample; no later cell visible here reads them,
# so re-run avg_cosine_sim afterwards if the subsample score is wanted.
extr_sentence_embeddings = model.encode(dfff['Prediction_Keyword'].tolist())
ref_sentence_embeddings = model.encode(dfff['KeyWord'].tolist())

# Analyze Wrong Results

In [None]:
# Misclassified rows. Taken as an explicit copy because a column is added to
# this frame below; assigning to a filtered slice of test_df would trigger
# pandas' SettingWithCopyWarning.
wrong_df = test_df[test_df['HS_Code'] != test_df['Prediction']].copy()

def check_hs_code(row):
    """Tell whether the true code is among the alternates without being the top one.

    Args:
        row: Mapping-like row providing 'HS_Code' and 'Alternate', where
            'Alternate' is a dict of code -> similarity.

    Returns:
        True when row['HS_Code'] is a key of row['Alternate'] and is not the
        key with the highest similarity; False otherwise, including when
        'Alternate' is empty.
    """
    alternates = row['Alternate']
    if not alternates:
        # max() would raise on an empty dict; with no alternates there is
        # nothing the true code could be part of.
        return False

    max_key = max(alternates, key=alternates.get)
    return row['HS_Code'] in alternates and row['HS_Code'] != max_key

# Flag each misclassified row whose true code still shows up among its alternates
is_in_alternate = wrong_df.apply(check_hs_code, axis=1)
wrong_df['Is_In_Alternate'] = is_in_alternate

# Share of flagged rows, expressed as a percentage
percentage = 100 * wrong_df['Is_In_Alternate'].mean()

print(f"{percentage:.2f}% of values in 'HS_Code' are keys in 'Alternate' but not the highest one.")
