In [10]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [6]:
# Load the CSV in which every label returned by Google Vision has been
# hand-classified as either "menu" or "vibe" (column `vibe_menu`).
df = pd.read_csv(
    './에프스토리_rank_분위기맛집.csv',
    index_col=[0],  # the first CSV column holds the row index
)
df

Unnamed: 0,label,count,vibe_menu
98,Food,34,menu
238,Tree,28,vibe
190,Room,28,vibe
60,Cuisine,27,menu
72,Dish,26,menu
...,...,...,...
116,Handrail,1,vibe
117,Handwriting,1,vibe
119,Head,1,vibe
120,Holiday,1,vibe


In [11]:
# Split into train / test sets: the label text is the feature and the
# hand-assigned vibe/menu class is the target. 10% is held out, with a fixed
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['label'],
    df['vibe_menu'],
    test_size=0.1,
    random_state=1,
)
# Display the size of each part as a sanity check.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(233, 26, 233, 26)

In [12]:
# Text-classification pipeline: TF-IDF features (default settings) feeding a
# multinomial Naive Bayes classifier with alpha=0.1.
clf = Pipeline(
    steps=[
        ('vect', TfidfVectorizer()),
        ('clf', MultinomialNB(alpha=0.1)),
    ]
)

In [13]:
# Fit the pipeline on the training labels. The labels are cast to str first
# (presumably to guard against non-string cells such as NaN -- TODO confirm).
train_texts = X_train.values.astype("str")
model = clf.fit(train_texts, y_train)

In [14]:
# Predict the class of every held-out label. The input gets the same str cast
# that the training labels receive before fitting, so both splits reach the
# vectorizer in the same form; without it a non-string cell (e.g. NaN read
# from the CSV) in the test split would be passed through un-cast.
y_pred = model.predict(X_test.values.astype("str"))

In [15]:
# Eyeball the first five true classes next to the first five predictions.
list(y_test.head(5)), list(y_pred[:5])

(['vibe', 'vibe', 'menu', 'menu', 'menu'],
 ['vibe', 'vibe', 'vibe', 'vibe', 'menu'])

In [16]:
# Per-class precision / recall / F1 on the held-out split. The report is a
# pre-formatted string, so it is printed rather than shown as a cell value.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

        menu       1.00      0.38      0.55         8
        vibe       0.78      1.00      0.88        18

    accuracy                           0.81        26
   macro avg       0.89      0.69      0.71        26
weighted avg       0.85      0.81      0.78        26



In [17]:
# Identity mapping from each class name to its display name.
# NOTE(review): not referenced by any other cell visible here -- confirm it is still needed.
categories = {name: name for name in ('vibe', 'menu')}

In [32]:
# Arbitrarily chosen labels used to try the trained model by hand.
contents = ['urban area', 'tree', 'dish', 'room']

In [33]:
# Show the category predicted for each hand-picked test label.
# The result is stored under its own name instead of rebinding `df`: `df` is
# the training frame loaded from the CSV, and overwriting it would make the
# train/test split cell fail if it were re-run afterwards (the `vibe_menu`
# column would be gone).
datas = {
    'label': contents,
    'category': model.predict(contents),
}
predictions_df = pd.DataFrame(datas)
predictions_df

Unnamed: 0,label,category
0,urban area,vibe
1,tree,vibe
2,dish,menu
3,room,vibe
