In [6]:
import json
import csv
from difflib import SequenceMatcher

# Load the JSON file
json_file_path = 'product-item.json'
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Function to calculate similarity
def get_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

# Load the CSV file and create a list of products
csv_file_path = 'danawa-products.csv'
products = []
with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Combine fields into a single string
        combined_string = f"{row['name']} {row['quantity']} {row['amount']}"
        products.append(combined_string)

# Open the output file
output_file_path = 'matching_results.txt'
with open(output_file_path, 'w', encoding='utf-8') as output_file:

    # Match each community title with the best product from the CSV
    for entry in data:
        community_title = entry['community-title']
        mall_title = entry['mall-title']
        mall_product_titles = entry['mall-product-titles']

        # Add mall_title to the mall_product_titles list
        combined_titles = mall_product_titles + [mall_title]

        # Find the most similar product title in the JSON data
        best_match_json = None
        highest_similarity_json = 0

        for title in combined_titles:
            similarity = get_similarity(community_title, title)
            if (similarity > highest_similarity_json):
                highest_similarity_json = similarity
                best_match_json = title

        # Now match the best JSON product with the best product in the CSV
        best_match_csv = None
        highest_similarity_csv = 0

        for product in products:
            similarity = get_similarity(best_match_json, product)
            if similarity > highest_similarity_csv:
                highest_similarity_csv = similarity
                best_match_csv = product

        # Match community title directly with the products in the CSV
        best_match_community_csv = None
        highest_similarity_community_csv = 0

        for product in products:
            similarity = get_similarity(community_title, product)
            if similarity > highest_similarity_community_csv:
                highest_similarity_community_csv = similarity
                best_match_community_csv = product

        # Write the most similar item to the output file
        output_file.write(f"Community Title: {community_title}\n")
        output_file.write(f"Most Similar Mall Title: {best_match_json}\n")
        output_file.write(f"Best Matching Danawa Product (Mall): {best_match_csv} - Similarity: {highest_similarity_csv:.2f}\n")
        output_file.write(f"Best Matching Danawa Product (Community): {best_match_community_csv} - Similarity: {highest_similarity_community_csv:.2f}\n")
        output_file.write("\n")

print(f"Matching results have been saved to {output_file_path}")


KeyboardInterrupt: 

In [5]:
import json
import csv
from difflib import SequenceMatcher

# Load the JSON file
json_file_path = '/Users/juwonkim/Desktop/지름알림/단어수 찾기/product-item.json'
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Function to calculate similarity
def get_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

# Load the CSV file and create a list of products
#csv_file_path = 'match-community-mall-danawa/danawa-products.csv'
csv_file_path ='/Users/juwonkim/Desktop/지름알림/단어수 찾기/danawa-products.csv'
products = []
with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Combine fields into a single string
        combined_string = f"{row['name']} {row['quantity']} {row['amount']}"
        products.append(combined_string)

# Limit the number of entries to process
num_entries_to_process = 5  # Adjust this value to control how many entries to process

# Match each community title with the best product from the CSV
for i, entry in enumerate(data):
    if i >= num_entries_to_process:
        break

    community_title = entry['community-title']
    mall_title = entry['mall-title']
    mall_product_titles = entry['mall-product-titles']

    # Add mall_title to the mall_product_titles list
    combined_titles = mall_product_titles + [mall_title]

    # Find the most similar product title in the JSON data
    best_match_json = None
    highest_similarity_json = 0

    for title in combined_titles:
        similarity = get_similarity(community_title, title)
        if similarity > highest_similarity_json:
            highest_similarity_json = similarity
            best_match_json = title

    # Now match the best JSON product with the best product in the CSV
    best_match_csv = None
    highest_similarity_csv = 0

    for product in products:
        similarity = get_similarity(best_match_json, product)
        if similarity > highest_similarity_csv:
            highest_similarity_csv = similarity
            best_match_csv = product

    # Print the results to the console
    print(f"Community Title: {community_title}")
    print(f"Most Similar Mall Title: {best_match_json}")
    print(f"Best Matching Danawa Product (Mall): {best_match_csv} - Similarity: {highest_similarity_csv:.2f}")
    print("\n")

print("Matching results have been displayed.")


Community Title: 한성기업 크래미 몬스터크랩 72g X 15개 (12,900원/무료)
Most Similar Mall Title: 한성기업 크래미 몬스터크랩 72g X 15개
Best Matching Danawa Product (Mall): 크래미 몬스터크랩 72g 15개 - Similarity: 0.83


Community Title: 리바이스 여자 노와이어 브라/팬티 (8,010/4,010원/무료배송)
Most Similar Mall Title: (20%쿠폰) Levis리바이스 여성 와이어/ 노와이어 브라팬티 특가 무료배송
Best Matching Danawa Product (Mall): 바이오스톤 워싱 섬머 쿨 와이드 데님 팬츠  섬머블랙 - Similarity: 0.33


Community Title: 산리오 점풍 라텍스 키즈 쿨매트+베개세트 (14,690/무료)
Most Similar Mall Title: 산리오 정품 쿨매트+베개세트 냉감매트 안전인증필 쿨링매트 냉감패드 시나모롤 쿠로미 마이멜로디
Best Matching Danawa Product (Mall): 산리오 캐릭터즈 윙크 데스크패드  마이멜로디 - Similarity: 0.40


Community Title: 비쵸비 5P 125g X 6박스 외 다수품목  (개당 1830원 / 무료배송)
Most Similar Mall Title: 비쵸비 5P 125g X 6박스
Best Matching Danawa Product (Mall): 비쵸비 5개입 125g 6개 - Similarity: 0.75


Community Title: 무결점 43인치 구글 스마트TV(199,000원/무료배송)
Most Similar Mall Title: HP43G3 무결점 프리즘 바이런 43인치 베젤리스 4K 구글OS 안드로이드 스마트TV
Best Matching Danawa Product (Mall): 바이런 HP43G3 무결점  이동형 스탠드 - Similarity: 0.42


Matching r

In [7]:
import json
from collections import Counter
import re

# Load the JSON file
json_file_path = 'product-item.json'
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Function to count words in a text
def count_words(text):
    # Convert to lowercase, remove punctuation, and split into words
    words = re.findall(r'\b\w+\b', text.lower())
    return Counter(words)

# Initialize a Counter to track word frequencies
word_frequencies = Counter()

# Traverse JSON data to accumulate word frequencies
def traverse_json(data):
    if isinstance(data, dict):
        for value in data.values():
            traverse_json(value)
    elif isinstance(data, list):
        for item in data:
            traverse_json(item)
    elif isinstance(data, str):
        word_frequencies.update(count_words(data))

# Start traversing the JSON structure
traverse_json(data)

# Print word frequencies
print("Word Frequencies:")
for word, count in word_frequencies.most_common():
    print(f"{word}: {count}")


Word Frequencies:
1: 3152
x: 2217
총: 1043
2개: 739
무료: 674
1kg: 670
10: 642
5입: 632
증정: 586
2: 568
3개: 519
대용량: 510
해외직구: 510
아이허브: 509
세트: 495
할인: 480
100: 456
1박스: 455
배홍동: 453
15: 450
4개: 447
2kg: 442
4입: 438
중복쿠폰: 421
무선: 418
모니터: 411
캠핑: 410
신세계푸드: 397
10개: 375
화이트: 372
미니: 371
남성: 366
3: 366
5: 357
8입: 353
신라면: 349
국내산: 342
제로: 341
500g: 338
무배: 337
네이처하이크: 336
블랙: 333
짜파게티: 327
프리미엄: 325
게이밍: 320
500ml: 310
ips: 301
20개: 300
너구리: 296
여성: 292
휴대용: 285
4k: 279
안성탕면: 271
355ml: 270
티셔츠: 270
국산: 269
오리지널: 264
10입: 258
블루투스: 257
캡슐: 253
1개: 248
190ml: 248
쿠폰: 247
쫄쫄면: 243
20: 237
0: 237
무결점: 234
전용: 232
무료배송: 230
5kg: 229
삼성: 229
포스트: 228
msi: 226
100g: 223
반팔: 220
2개x: 219
오늘출발: 217
정품: 215
골라담기: 211
2팩: 209
ua: 208
여름: 206
uhd: 206
오렌지: 204
ssd: 204
200g: 203
2박스: 202
남녀공용: 202
언더아머: 201
방수: 199
오리온: 196
4: 195
24캔: 193
케이블: 193
6개월분: 192
김치: 189
텐트: 189
6개: 188
4봉: 188
카프리썬: 187
가방: 186
300g: 184
샤오미: 184
2종: 181
접이식: 181
코카콜라: 181
추가: 179
섬유유연제: 178
23: 177
3kg: 173
200ml: 173
24개

In [8]:
import json
from collections import defaultdict, Counter
from difflib import SequenceMatcher
import re

# Load the JSON file
json_file_path = 'product-item.json'
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Function to calculate similarity
def get_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

# Function to count words in a text
def count_words(text):
    words = re.findall(r'\b\w+\b', text.lower())
    return Counter(words)

# Group products by similarity threshold
def cluster_titles(titles, threshold=0.7):
    clusters = []
    while titles:
        current_title = titles.pop(0)
        cluster = [current_title]
        similar_titles = []

        for title in titles:
            if get_similarity(current_title, title) >= threshold:
                cluster.append(title)
                similar_titles.append(title)

        # Remove clustered titles from the original list
        for title in similar_titles:
            titles.remove(title)

        clusters.append(cluster)
    return clusters

# Extract titles from JSON
titles = []
for entry in data:
    titles.append(entry['community-title'])
    titles.append(entry['mall-title'])
    titles.extend(entry['mall-product-titles'])

# Cluster the titles
clusters = cluster_titles(titles)

# Analyze word frequencies in each cluster
cluster_word_frequencies = []
for cluster in clusters:
    combined_text = ' '.join(cluster)
    word_freq = count_words(combined_text)
    cluster_word_frequencies.append(word_freq)

# Print cluster results and word frequencies
for i, word_freq in enumerate(cluster_word_frequencies):
    print(f"Cluster {i+1} Word Frequencies:")
    for word, count in word_freq.most_common():
        print(f"{word}: {count}")
    print("\n")


Cluster 1 Word Frequencies:
한성기업: 5
크래미: 5
몬스터크랩: 4
72g: 4
x: 4
15개: 4
무료: 2
12: 1
900원: 1
스노우치즈72g: 1
12개: 1
14: 1
940원: 1


Cluster 2 Word Frequencies:
쿠폰가13770원: 1
한성기업: 1
크래미: 1
몬스터크랩: 1
72g: 1
x: 1
15개: 1


Cluster 3 Word Frequencies:
게맛살: 9
한성: 9
크래미: 9
x: 9
6개: 9
180g: 6
오리지널: 3
몬스터크랩: 3
142g: 3
와일드: 3


Cluster 4 Word Frequencies:
한성기업: 10
크래미: 10
x: 10
15개: 6
스노우치즈: 4
72g: 4
12개: 4
h: 3
90g: 3
와일드: 3
블랙: 3
트러플: 3
70g: 3


Cluster 5 Word Frequencies:
x: 12
한성기업: 6
크래미: 6
몬스터크랩72g: 6
10개: 6
게맛살: 3
h90g: 3
2개: 3
스노우치즈72g: 3
4개: 3


Cluster 6 Word Frequencies:
한성기업: 5
와일드: 5
크래미: 5
x: 5
6개: 5
쉬림프: 3
150g: 3
블랙: 2
트러플: 2
140g: 2


Cluster 7 Word Frequencies:
간식: 3
한성: 3
크래미: 3
치즈볼: 3
48g: 3
x: 3
20개: 3


Cluster 8 Word Frequencies:
한성기업: 6
명란: 6
톡톡: 6
떡갈비: 6
x: 6
10개: 6
핫바: 3
70g: 3
120g: 3


Cluster 9 Word Frequencies:
한성기업: 14
x: 14
4개: 14
부어스트: 11
비바크: 6
260g: 5
도이치: 3
310g: 3
미니윈너: 3
210g: 3
콤비: 3
파티: 3
330g: 3
직화: 2
할라피뇨: 2


Cluster 10 Word Frequencies:
후랑크: 7
70g: 7
x: 7
20개

In [9]:
import json
from collections import defaultdict, Counter
from difflib import SequenceMatcher
import re

# Load the JSON file
json_file_path = 'product-item.json'
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
# 유사도 계산 함수
def get_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

# 단어를 세는 함수
def count_words(text):
    words = re.findall(r'\b\w+\b', text.lower())
    return Counter(words)

# 제품명을 클러스터로 묶는 함수
def cluster_titles(titles, threshold=0.7):
    clusters = []
    while titles:
        current_title = titles.pop(0)
        cluster = [current_title]
        similar_titles = []

        for title in titles:
            if get_similarity(current_title, title) >= threshold:
                cluster.append(title)
                similar_titles.append(title)

        for title in similar_titles:
            titles.remove(title)

        clusters.append(cluster)
    return clusters

# JSON 데이터에서 제목 추출
titles = []
for entry in data:
    titles.append(entry['community-title'])
    titles.append(entry['mall-title'])
    titles.extend(entry['mall-product-titles'])

# 제목 클러스터링
clusters = cluster_titles(titles)

# 각 클러스터에서 중요 단어 추출
cluster_important_words = []
for cluster in clusters:
    combined_text = ' '.join(cluster)
    word_freq = count_words(combined_text)
    
    # 중요 단어를 빈도가 높은 순으로 추출
    # 예를 들어, 빈도가 2 이상인 단어를 중요 단어로 간주
    important_words = {word: count for word, count in word_freq.items() if count > 1}
    cluster_important_words.append(important_words)

# 클러스터 결과 및 중요 단어 출력
for i, important_words in enumerate(cluster_important_words):
    print(f"Cluster {i+1} Important Words:")
    for word, count in important_words.items():
        print(f"{word}: {count}")
    print("\n")

Cluster 1 Important Words:
한성기업: 5
크래미: 5
몬스터크랩: 4
72g: 4
x: 4
15개: 4
무료: 2


Cluster 2 Important Words:


Cluster 3 Important Words:
게맛살: 9
한성: 9
크래미: 9
오리지널: 3
180g: 6
x: 9
6개: 9
몬스터크랩: 3
142g: 3
와일드: 3


Cluster 4 Important Words:
한성기업: 10
크래미: 10
h: 3
90g: 3
x: 10
15개: 6
스노우치즈: 4
72g: 4
12개: 4
와일드: 3
블랙: 3
트러플: 3
70g: 3


Cluster 5 Important Words:
한성기업: 6
게맛살: 3
크래미: 6
몬스터크랩72g: 6
x: 12
10개: 6
h90g: 3
2개: 3
스노우치즈72g: 3
4개: 3


Cluster 6 Important Words:
한성기업: 5
와일드: 5
크래미: 5
쉬림프: 3
150g: 3
x: 5
6개: 5
블랙: 2
트러플: 2
140g: 2


Cluster 7 Important Words:
간식: 3
한성: 3
크래미: 3
치즈볼: 3
48g: 3
x: 3
20개: 3


Cluster 8 Important Words:
한성기업: 6
명란: 6
톡톡: 6
떡갈비: 6
핫바: 3
70g: 3
x: 6
10개: 6
120g: 3


Cluster 9 Important Words:
한성기업: 14
비바크: 6
부어스트: 11
260g: 5
x: 14
4개: 14
도이치: 3
310g: 3
미니윈너: 3
210g: 3
콤비: 3
파티: 3
330g: 3
직화: 2
할라피뇨: 2


Cluster 10 Important Words:
한성기업: 6
배즙숙성: 5
숯불구이맛: 5
후랑크: 7
70g: 7
x: 7
20개: 7
고추장: 2
불고기맛: 2
직화구이: 2
핫바: 2


Cluster 11 Important Words:
핫바: 3
한성: 3
맵사이신: 3
화끈구이: