In [34]:
import pandas as pd

df = pd.read_excel('../85data_final.xlsx', index_col=0)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76906 entries, 0 to 76905
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   hs code      76906 non-null  int64 
 1   description  76906 non-null  object
 2   reason       76906 non-null  object
 3   product      76906 non-null  object
 4   name_des     76906 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.5+ MB


In [36]:
df.drop(['description', 'reason', 'product'], axis =1, inplace=True)

In [37]:
df = df.groupby('hs code')['name_des'].sum().reset_index()

In [38]:
df.head(10)

Unnamed: 0,hs code,name_des
0,8501,ELECTRIC MOTORS GEARS AND GEARING ELECTRIC ...
1,8502,GENERATORS ELECTRIC GENERATING SETS Suction ...
2,8503,SUPPORTED CATALYSTS SECTIONS GENERATOR PARTS...
3,8504,LOADERS BASE FOR LOADING TERMINAL PAYMENT MOBI...
4,8505,MAGNETS AS STALKS SPRINGS POLYPROPYLENE OF...
5,8506,ELECTRICAL CELLS BUTTONS CONTAINING MANGANES...
6,8507,LITHIUM ACCUMULATORS Lenovo ThinkPad Battery ...
7,8508,VACUUM CLEANERS WITH ELECTRIC MOTOR ELECTRIC...
8,8509,AUTOMATIC GOODS VENDING MACHINES SOAPS ELECTR...
9,8510,HAIR CLIPPERS WITH ELECTRIC MOTOR GOODS PUT ...


In [39]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')  # pos_tag를 위한 데이터 다운로드

headings = []
keyword = []
cnt = 0

# Initialize WordNet Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

for i in df['name_des']:
    # 텍스트 전처리: 소문자 변환, 토큰화, 불용어 제거
    tokens = word_tokenize(i.lower())
    tagged_tokens = nltk.pos_tag(tokens)  # 단어 토큰에 품사 태깅 추가

    # 품사를 고려하여 단어의 원형 복원 (명사와 동사만 고려)
    filtered_tokens = []
    for word, tag in tagged_tokens:
        if (tag.startswith('NN') or tag.startswith('VB')) and word not in stop_words:  # 명사 또는 동사인 경우
            lemma = lemmatizer.lemmatize(word, pos='n' if tag.startswith('NN') else 'v')
            filtered_tokens.append(lemma)
        elif word.isalpha() and word not in stop_words:
            filtered_tokens.append(word)

    # TF-IDF 벡터화
    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_matrix = tfidf.fit_transform([' '.join(filtered_tokens)])

    # TF-IDF 행렬에서 가장 중요한 단어 추출
    feature_names = tfidf.get_feature_names_out()

    # TF-IDF 값이 높은 단어 순으로 정렬
    dense = tfidf_matrix.todense()
    keywords = sorted(zip(dense[0].tolist()[0], feature_names), reverse=True)

    # 상위 키워드를 출력하거나 리스트로 가져올 수 있습니다.
    top_keywords = [word for score, word in keywords[:200]]  # 상위 5개의 키워드를 가져옴
    print(f"{df['hs code'][cnt]} 키워드:", top_keywords)

    headings.append(df['hs code'][cnt])
    keyword.append(top_keywords)
    cnt += 1

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


8501 키워드: ['motor', 'electric', 'dc', 'power', 'electric motor', 'dc motor', 'output', 'mm', 'voltage', 'generator', 'current', 'gear', 'solar', 'housing', 'use', 'product', 'phase', 'engine', 'less', 'supply', 'motor electric', 'direct', 'drive', 'ac', 'speed', 'diameter', 'motor output', 'cable', 'approx', 'motor dc', 'classify', 'electrical', 'direct current', 'vehicle', 'good', 'battery', 'motor power', 'multi', 'cell', 'device', 'rpm', 'magnet', 'part', 'classify electric', 'mount', 'panel', 'control', 'maximum', 'main', 'multi phase', 'dimension', 'ac motor', 'supply voltage', 'connection', 'shaft', 'kw', 'vdc', 'synchronous', 'diameter mm', 'energy', 'machine', 'plastic', 'motor vehicle', 'consist', 'call', 'usb', 'permanent', 'two', 'combination', 'permanent magnet', 'system', 'approx mm', 'outer', 'subheading', 'unit', 'exceed', 'taric', 'rotor', 'form', 'charge', 'dc generator', 'function', 'speed rpm', 'outer diameter', 'charger', 'watt', 'actuator', 'accord', 'power supply'

In [41]:
data = {'hs_code': headings, 'keyword': keyword}
df = pd.DataFrame(data)

In [42]:
df.head()

Unnamed: 0,hs_code,keyword
0,8501,"[motor, electric, dc, power, electric motor, d..."
1,8502,"[generator, power, electric, generate, engine,..."
2,8503,"[motor, electric, electric motor, part, steel,..."
3,8504,"[power, converter, voltage, use, electric, dc,..."
4,8505,"[magnet, permanent, permanent magnet, metal, m..."


In [43]:
df.to_excel('85des_keywords_200.xlsx')