In [21]:
# import library for manipulating data
import numpy as np
import pandas as pd

import re # regex
import ast

from datetime import datetime

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def removeSpecial(text):
    """Strip punctuation/symbols from ``text`` and normalise its whitespace.

    The input is coerced to ``str`` first. Every character that is neither
    a word character nor whitespace is deleted, runs of whitespace are
    collapsed into a single space, and the result is trimmed at both ends.
    """
    without_symbols = re.sub(r'[^\w\s]', '', str(text))
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.strip()

def removeSpecialNumbers(text):
    """Drop digit-bearing words and special characters from ``text``.

    Steps, in order:
      1. every word that contains at least one digit (e.g. ``500g``,
         ``6``) is replaced by a space;
      2. every remaining character that is neither a word character nor
         whitespace is deleted;
      3. runs of whitespace are collapsed to one space and the result is
         trimmed.

    The input is coerced to ``str`` (as ``removeSpecial`` does), so
    non-string values such as numbers or ``None`` no longer raise
    ``TypeError``.
    """
    # Digit-bearing words must go first: once punctuation is deleted,
    # neighbouring fragments such as "6-20-30" would fuse into one word.
    without_numbers = re.sub(r'\w*\d\w*', ' ', str(text))
    without_symbols = re.sub(r'[^\w\s]', '', without_numbers)
    return re.sub(r'\s+', ' ', without_symbols).strip()

# Indonesian stop-word list from NLTK, read by the tokenizers below.
stop_words = set(stopwords.words('indonesian'))
def tokenize_word(string):
    """Return the distinct non-stop-word tokens of ``string``.

    The value is stringified, lower-cased, cleaned with ``removeSpecial``
    and tokenized with NLTK. Duplicates are removed through a set, so the
    order of the returned list is not meaningful.
    """
    normalized = removeSpecial(str(string).lower())
    unique_tokens = set(word_tokenize(normalized))
    kept = []
    for token in unique_tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def tokenize_word_number(string):
    """Return the distinct tokens of ``string`` without numbers, units or stop words.

    The value is stringified, lower-cased and cleaned with
    ``removeSpecialNumbers`` (which drops every word containing a digit),
    then tokenized with NLTK. Tokens found in ``stop_words`` or in the
    local unit/generic word list are discarded. Duplicates are removed
    through a set, so the order of the returned list is not meaningful.
    """
    # Kept local on purpose: the previous version called
    # stop_words.update(...) on the shared module-level set, so after the
    # first call tokenize_word() silently started dropping these words
    # too and its output depended on call order.
    unit_words = {'pupuk', 'gram', 'lt', 'liter', 'litre', 'gr', 'kg', 'ml', 'l', 'ltr'}
    str_clean = removeSpecialNumbers(str(string).lower())
    tokenized = word_tokenize(str_clean)
    return [word for word in set(tokenized)
            if word not in stop_words and word not in unit_words]

def jaccard_similarity(a, b):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two token collections.

    Both arguments are converted to sets, so duplicated tokens no longer
    inflate the union (the previous ``len(a) + len(b)`` arithmetic was only
    correct for duplicate-free inputs).

    Args:
        a: Iterable of hashable tokens.
        b: Iterable of hashable tokens.

    Returns:
        A float in [0, 1]. Two empty inputs yield ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    set_a, set_b = set(a), set(b)
    union = set_a | set_b
    if not union:
        # Nothing to compare: treat as "no similarity" rather than crash.
        return 0.0
    return len(set_a & set_b) / len(union)

# Load the prepared product reference table and expose it as a list of
# per-row dicts (one dict per product, keyed by column name).
# NOTE(review): unpickling can execute arbitrary code - only load this
# file from a trusted source.
df_pupuk = pd.read_pickle('data_pupuk_all_ready.pkl').reset_index()
descriptions = list(df_pupuk.T.to_dict().values())

def getSimilarProduct(pos, threshold=.2):
    """Find the catalogue entry most similar to a PoS product name.

    Matching is done in two stages over the module-level ``descriptions``
    list:

    1. Screening - an entry is a candidate when the Jaccard similarity
       between the digit-free tokens of ``pos`` and the entry's
       ``'description_nonumber'`` tokens is greater than zero.
    2. Ranking - candidates are scored on the full token lists
       (``tokenize_word(pos)`` vs. ``'description_clean'``); the highest
       strictly positive score wins, and the earliest entry wins ties.

    Args:
        pos: Raw product name.
        threshold: Minimum second-stage similarity required to classify
            the result as ``'match'``. Defaults to 0.2.

    Returns:
        dict with keys
          ``'time'``            - ``datetime.today()`` at call time,
          ``'product'``         - the ``pos`` argument, unchanged,
          ``'classification'``  - ``'match'`` or ``'not match'``,
          ``'best_match'``      - the winning entry of ``descriptions``,
                                  or ``None`` when nothing scored above 0,
          ``'similarity'``      - the winning second-stage score (0 when
                                  there is no winner).
    """
    product_name_clean = tokenize_word(pos)
    product_name_nonumber = tokenize_word_number(pos)

    # Start from "no match" so the result is always defined. Previously,
    # candidates that all scored 0 in the ranking stage left the result
    # variable unbound and the function crashed.
    best_similarity = 0
    best_match = None

    # With no digit-free tokens nothing can pass the screening stage, and
    # skipping the scan also avoids dividing by zero against an entry
    # whose own token list is empty.
    candidates = descriptions if product_name_nonumber else []
    for desc in candidates:
        similarity_lv1 = jaccard_similarity(product_name_nonumber, desc['description_nonumber'])
        if similarity_lv1 <= 0:
            continue

        similarity_lv2 = jaccard_similarity(product_name_clean, desc['description_clean'])
        if similarity_lv2 > best_similarity:
            best_similarity = similarity_lv2
            # Keep the entry itself instead of re-indexing ``descriptions``
            # by its 'index' value, which is only valid when that column
            # happens to equal the list position.
            best_match = desc

    is_match = best_match is not None and best_similarity >= threshold
    return {'time': datetime.today(),
            'product': pos,
            'classification': 'match' if is_match else 'not match',
            'best_match': best_match,
            'similarity': best_similarity}

In [11]:
from main import getSimilarProduct

In [12]:
import pandas as pd

# Draw one random row from the PoS export and use its product name as the
# query string (unseeded, so each run picks a different product).
df_pos = (
    pd.read_excel('Product Name from PoS Transactions.xlsx')
    .sample()
    .reset_index()
)
pos = df_pos.loc[0, 'Product Name']
print(pos)

gandasil B 500g


In [15]:
# Reload the product reference table as a list of per-row dicts and
# inspect a single entry.
# NOTE(review): unpickling can execute arbitrary code - only load this
# file from a trusted source.
df_pupuk = pd.read_pickle('data_pupuk_all_ready.pkl').reset_index()
descriptions = list(df_pupuk.T.to_dict().values())

descriptions[1374]

{'index': 1374,
 'Product SKU': 'Gandasil B NPK 6-20-30 (Tepung)',
 'Brand': 'Gandasil B',
 'Type': 'NPK 6-20-30 (Tepung)',
 'Formula': '6-20-30',
 'new_sku': 1.0,
 'description': 'Gandasil B NPK 6-20-30 (Tepung) Gandasil B NPK 6-20-30 (Tepung) 6-20-30',
 'description_clean': ['npk', 'b', '62030', 'gandasil', 'tepung'],
 'description_nonumber': ['npk', 'b', 'tepung', 'gandasil']}

In [23]:
# Match the PoS product name sampled above against the reference table.
# NOTE(review): the saved output below uses the key 'match_to', while the
# getSimilarProduct defined in this notebook returns 'best_match' - re-run
# the cell to confirm which version produced it.
getSimilarProduct(pos)

{'time': datetime.datetime(2023, 11, 21, 7, 27, 48, 518479),
 'product': 'gandasil B 500g',
 'classification': 'match',
 'match_to': {'index': 1374,
  'Product SKU': 'Gandasil B NPK 6-20-30 (Tepung)',
  'Brand': 'Gandasil B',
  'Type': 'NPK 6-20-30 (Tepung)',
  'Formula': '6-20-30',
  'new_sku': 1.0,
  'description': 'Gandasil B NPK 6-20-30 (Tepung) Gandasil B NPK 6-20-30 (Tepung) 6-20-30',
  'description_clean': ['npk', 'b', '62030', 'gandasil', 'tepung'],
  'description_nonumber': ['npk', 'b', 'tepung', 'gandasil']},
 'similarity': 0.3333333333333333}

In [25]:
# Negative example: the saved output below classifies this name as
# 'not match' with similarity 0.
getSimilarProduct('shampoo')

{'time': datetime.datetime(2023, 11, 21, 7, 28, 2, 914037),
 'product': 'shampoo',
 'classification': 'not match',
 'match_to': None,
 'similarity': 0}

In [27]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# How many synthetic rows to generate
num_rows = 10

# Calendar window the random timestamps are drawn from
start_date = pd.to_datetime('2023-01-01')
end_date = pd.to_datetime('2023-12-31')

# Width of that window in whole days
date_range = (end_date - start_date).days


def _random_offset(max_days):
    """Return a random timedelta of 0..max_days days plus a random time of day."""
    return timedelta(days=random.randint(0, max_days),
                     hours=random.randint(0, 23),
                     minutes=random.randint(0, 59),
                     seconds=random.randint(0, 59))


# One random timestamp (date and time) per row
random_dates = [start_date + _random_offset(date_range) for _ in range(num_rows)]

# Companion column of random integers in [0, 100)
random_values = np.random.randint(0, 100, size=num_rows)

# Assemble both columns into a DataFrame and show it
data = {'RandomDateTime': random_dates, 'RandomValue': random_values}
df = pd.DataFrame(data)

print(df)

       RandomDateTime  RandomValue
0 2023-06-10 15:56:54            6
1 2023-11-18 06:37:54            1
2 2023-06-29 12:44:10           99
3 2023-11-02 12:43:33           29
4 2023-09-07 09:58:21           67
5 2023-12-04 21:56:42            3
6 2023-07-11 19:44:49           54
7 2023-03-02 14:22:07           80
8 2023-07-15 09:35:16           32
9 2023-09-27 11:49:16           63


In [28]:
# Load the pickled similarity results and preview five random rows.
# NOTE(review): presumably produced by an earlier batch run of the matcher -
# confirm its provenance; unpickling is only safe for trusted files.
df_result = pd.read_pickle('data_result_similarities.pkl')
df_result.sample(5)

Unnamed: 0_level_0,pos_name,pos_rawtext_clean,pos_rawtext_nonumber,similarity_lv1,match_id,match_text,match_raw_clean,match_raw_nonumber,similarity_lv2
pos_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5221,Lannate 40 SP 100 grm,"['lannate', '40', 'grm', '100', 'sp']","['grm', 'lannate', 'sp']",0.166667,4907.0,GREAT 40 SP ukuran 100 gram,"['40', 'pestisida', 'ukuran', 'great', 'gram',...","['sp', 'ukuran', 'pestisida', 'great']",0.333333
26979,basmilang500ml,['basmilang500ml'],[],,,,,,
19868,Seltima@500ml,['seltima500ml'],['seltima'],,,,,,
21077,sprayer msa 2 in 1,"['1', 'in', 'sprayer', '2', 'msa']","['sprayer', 'msa', 'in']",0.166667,3489.0,Pomi 4 in 1 Organik Cair,"['pomi', '1', 'in', 'organik', 'cair', '4']","['pomi', 'organik', 'cair', 'in']",0.222222
12434,bablas 5,"['bablas', '5']",['bablas'],,,,,,
