In [442]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import skmultilearn
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
import logging
from gensim.models import word2vec

In [243]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, hamming_loss, classification_report

In [244]:
# Resolve the data locations relative to the current working directory.
# os.path.join picks the separator of the host OS, so the notebook no longer
# depends on Windows-style '\\' paths (on Windows the resulting strings are
# the same as before).
CRAWLER_PATH = os.path.join(os.getcwd(), 'crawling_result')
DATA_IN_PATH = os.path.join(CRAWLER_PATH, 'youtuber_text')

# The first CSV column becomes the row index (index_col=0).
train_data = pd.read_csv(
    os.path.join(CRAWLER_PATH, 'train_data.csv'),
    index_col=0,
    engine='python',
    encoding="utf-8",
)

train_data.head()

Unnamed: 0,review,/m/025zzc,/m/02ntfj,/m/0b1vjn,/m/02hygl,/m/04q1x3q,/m/01sjng,/m/0403l3g,/m/021bp2,/m/022dc6,/m/03hf_rm
UC-Zedn7a_RJyb5hUQ-aGZog,서양권 인물의 성씨로 쓰인다 자세한 내용은 머독인터넷 방송인 문서를의 번째 문단을의...,True,False,False,False,False,False,True,False,False,False
UC1dG3vI9FfHnH3YgyeKUz_A,말부터 시작한 아프리카tv의 리그 오브 레전드 bj 챌린저 정글러다 콘텐츠로 강의방...,True,False,False,False,False,False,True,False,False,False
UC1MO5uem_t8lRgvIBF9u83w,게임 영상 투고 및 게임 번역을 주로 하는 유튜버ai설이 나돌 정도로 과묵하고 묵묵...,True,True,False,False,False,False,True,False,False,False
UC1q4Ihlv_YhLELw-ijE0Diw,마인크래프트를 주로 하는 팀 샐러드 소속의 유튜버이자 트위치 스트리머마인애플은 마인...,True,False,False,False,False,False,True,False,False,False
UC2FDVyrQnjoZCUyk9fmqd9g,중순부터 본격적으로 방송을 시작한 크로아티아 출신 인터넷 방송인 일명 푸른 눈의 팟...,True,False,False,False,False,False,False,False,False,False


In [245]:
# Separate the label columns (everything after the first column) from the
# raw text stored in the 'review' column.
train_labels = train_data.iloc[:, 1:]
train_text = train_data['review'].tolist()

### TFIDF

In [432]:
# Character-level TF-IDF over 1- to 3-grams, limited to 10 features.
tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    sublinear_tf=True,
    min_df=0.0,
    max_features=10,
)
# NOTE(review): `X` is reassigned by the word2vec section further down, so
# this TF-IDF matrix is not what the classifiers are trained on.
X = tfidf.fit_transform(train_text)
X.shape

(100, 10)

### word2vec

In [444]:
# Hyperparameters consumed by the Word2Vec training cell.
downsampling = 0.001    # passed as `sample`
context = 10            # passed as `window`
num_workers = 4         # passed as `workers`
min_word_count = 40     # passed as `min_count`
num_features = 300      # passed as `size`

# Show INFO-level log records formatted as "timestamp : level : message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [448]:
# Whitespace-tokenise every review into a list of tokens.
sentences = [review.split() for review in train_text]

labels = train_labels.values

print("Training model...")
# NOTE(review): `size=` looks like the gensim 3.x keyword (gensim 4 renamed it
# to `vector_size`) - confirm against the installed version.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

2020-12-21 05:16:39,722 : INFO : collecting all words and their counts
2020-12-21 05:16:39,728 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-12-21 05:16:39,847 : INFO : collected 97499 word types from a corpus of 322129 raw words and 100 sentences
2020-12-21 05:16:39,847 : INFO : Loading a fresh vocabulary


Training model...


2020-12-21 05:16:39,908 : INFO : effective_min_count=40 retains 935 unique words (0% of original 97499, drops 96564)
2020-12-21 05:16:39,909 : INFO : effective_min_count=40 leaves 111112 word corpus (34% of original 322129, drops 211017)
2020-12-21 05:16:39,911 : INFO : deleting the raw counts dictionary of 97499 items
2020-12-21 05:16:39,917 : INFO : sample=0.001 downsamples 63 most-common words
2020-12-21 05:16:39,923 : INFO : downsampling leaves estimated 97523 word corpus (87.8% of prior 111112)
2020-12-21 05:16:39,927 : INFO : estimated required memory for 935 words and 300 dimensions: 2711500 bytes
2020-12-21 05:16:39,928 : INFO : resetting layer weights
2020-12-21 05:16:40,126 : INFO : training model with 4 workers on 935 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2020-12-21 05:16:40,229 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-21 05:16:40,230 : INFO : worker thread finished; awaiting finish of 2 more threads


In [449]:
# Persist the trained Word2Vec model under a relative file name that encodes
# the hyperparameters used above (300 features, min count 40, window 10).
model_name = "300features_40minwords10context"
model.save(model_name)

2020-12-21 05:17:26,198 : INFO : saving Word2Vec object under 300features_40minwords10context, separately None
2020-12-21 05:17:26,199 : INFO : not storing attribute vectors_norm
2020-12-21 05:17:26,202 : INFO : not storing attribute cum_table
2020-12-21 05:17:26,226 : INFO : saved 300features_40minwords10context


In [456]:
def get_features(words, model, num_features):
    """Average the word vectors of the in-vocabulary tokens of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : object with a ``wv`` attribute
        Trained embedding model. ``model.wv`` must support ``token in wv``
        and ``wv[token]``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``num_features`` holding the mean vector of all
        tokens found in the vocabulary. If no token is in the vocabulary the
        result is an all-zero float32 vector instead of NaN.
    """
    # Index the keyed vectors directly: `model[w]` is deprecated and emits a
    # DeprecationWarning on every lookup.
    wv = model.wv
    feature_vector = np.zeros(num_features, dtype=np.float32)
    num_words = 0

    for w in words:
        if w in wv:
            num_words += 1
            feature_vector = np.add(feature_vector, wv[w])

    # Avoid 0/0 (NaN features plus a RuntimeWarning) for documents that
    # contain no known token; NaNs would break the downstream classifiers.
    if num_words == 0:
        return feature_vector
    return np.divide(feature_vector, num_words)

In [457]:
def get_dataset(reviews, model, num_features):
    """Build a feature matrix with one averaged word vector per document.

    Parameters
    ----------
    reviews : iterable of iterables of str
        Tokenised documents.
    model : object
        Embedding model forwarded unchanged to ``get_features``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per document and ``num_features`` columns.
        An empty ``reviews`` yields an array of shape ``(0, num_features)``.
    """
    rows = [get_features(tokens, model, num_features) for tokens in reviews]

    if not rows:
        # np.stack raises ValueError on an empty sequence; return an empty,
        # correctly shaped matrix instead.
        return np.zeros((0, num_features), dtype=np.float32)

    return np.stack(rows)

In [477]:
# Build the document feature matrix: one averaged word2vec vector per
# tokenised review. This replaces the value previously bound to `X`.
X = get_dataset(sentences, model, num_features)

  feature_vector = np.add(feature_vector, model[w])


In [478]:
# Inspect the feature matrix returned by get_dataset.
X

array([[ 0.02854413, -0.13910031, -0.10310345, ...,  0.02278697,
        -0.10987526, -0.04159285],
       [-0.01004763, -0.0394398 ,  0.01493916, ...,  0.08809038,
        -0.10209024, -0.02917306],
       [-0.04095584, -0.03055699,  0.01647139, ...,  0.10322094,
        -0.12138531, -0.02858691],
       ...,
       [-0.04654637, -0.01385875,  0.00698293, ...,  0.09984439,
        -0.12048138, -0.02880032],
       [ 0.03087607, -0.12843002, -0.07048663, ...,  0.06692038,
        -0.3457006 , -0.01524172],
       [-0.05248855,  0.01018768, -0.01950465, ...,  0.08926157,
        -0.13405801, -0.0255207 ]], dtype=float32)

In [479]:
# Hold out a fraction of the samples for evaluation; the fixed seed is passed
# as random_state to train_test_split.
TEST_SPLIT = 0.2
RANDOM_SEED = 42

y = train_labels

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_SEED,
    test_size=TEST_SPLIT,
)

### Models

In [481]:
def build_model(model, mlb_estimator, xtrain, ytrain, xtest, ytest):
    """Fit a multi-label wrapper around a base classifier and score it.

    Parameters
    ----------
    model : object
        Base classifier instance handed to ``mlb_estimator``.
    mlb_estimator : callable
        Called as ``mlb_estimator(model)``; must return an object with
        ``fit`` and ``predict``, where ``predict`` returns something that
        offers ``.toarray()``.
    xtrain, ytrain : training features and labels.
    xtest, ytest : evaluation features and labels.

    Returns
    -------
    dict
        ``"accuracy"`` holds ``accuracy_score(ytest, predictions)`` and
        ``"hamming score"`` holds ``hamming_loss(ytest, predictions)``.
        Despite its key name the latter is a loss, so lower is better.

    Side effects
    ------------
    Prints the predictions as a dense array.
    """
    wrapped = mlb_estimator(model)
    wrapped.fit(xtrain, ytrain)

    predicted = wrapped.predict(xtest)
    print(predicted.toarray())

    return {
        "accuracy": accuracy_score(ytest, predicted),
        "hamming score": hamming_loss(ytest, predicted),
    }

In [482]:
# Binary Relevance wrapper around a Gaussian Naive Bayes base classifier;
# build_model prints the predictions and returns the metrics dict.
clf_binary_model = build_model(GaussianNB(), BinaryRelevance, X_train, Y_train, X_test, Y_test)

[[ True  True False False False False False False False False]
 [ True  True  True False False False False False False  True]
 [ True  True  True False False False False False False  True]
 [ True False  True False False False  True False False False]
 [ True  True  True False False  True False False False  True]
 [ True  True  True False False False False False False  True]
 [False False False False False False  True False False False]
 [ True  True  True False False  True False False False  True]
 [ True  True  True False False  True False False False  True]
 [ True False False False False False  True False False False]
 [ True  True  True False False  True False False False  True]
 [ True False False False False  True  True False False  True]
 [ True False False False False False  True False False False]
 [ True  True  True False False False False False False False]
 [ True False False False False False  True False False  True]
 [ True  True False False False False False False False

In [483]:
# Metrics dict returned by build_model for the Binary Relevance run.
clf_binary_model

{'accuracy': 0.2, 'hamming score': 0.215}

In [484]:
# Classifier Chain wrapper around a Gaussian Naive Bayes base classifier;
# build_model prints the predictions and returns the metrics dict.
clf_chain_model = build_model(GaussianNB(), ClassifierChain, X_train, Y_train, X_test, Y_test)

[[1. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 1. 0. 0. 0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 1. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 1. 0. 0. 1.]
 [1. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 1. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 1. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 1. 0. 0. 0.]]


In [485]:
# Metrics dict returned by build_model for the Classifier Chain run.
clf_chain_model

{'accuracy': 0.15, 'hamming score': 0.115}

In [486]:
# Label Powerset wrapper around a Gaussian Naive Bayes base classifier;
# build_model prints the predictions and returns the metrics dict.
clf_labelP_model = build_model(GaussianNB(), LabelPowerset, X_train, Y_train, X_test, Y_test)

[[1 1 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 0]
 [1 1 0 0 0 0 1 0 0 0]
 [1 0 1 0 0 0 1 0 0 0]
 [1 0 0 0 0 0 1 0 0 0]
 [1 0 1 0 0 0 1 0 0 0]
 [1 0 0 0 0 0 1 0 0 1]
 [1 0 0 0 0 0 1 0 0 0]
 [1 0 1 0 0 0 1 0 0 0]
 [1 0 0 0 0 0 1 0 0 1]
 [1 1 0 0 0 0 1 0 0 0]
 [1 0 1 0 0 0 1 0 0 0]
 [1 0 0 0 0 0 1 0 0 0]
 [1 0 1 0 0 0 1 0 0 0]
 [1 0 0 0 0 0 1 0 0 0]]


In [487]:
# Metrics dict returned by build_model for the Label Powerset run.
clf_labelP_model

{'accuracy': 0.25, 'hamming score': 0.125}