# Youtube

## Load

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./database/youtube.csv', dtype={'description':str})

In [3]:
df.tail()

Unnamed: 0,publishedAt,title,viewCount,likeCount,commentCount,duration,description
3908,2018-10-05T02:23:28Z,빅토리의5번째주식단타매매영상,47,1,0,PT1M1S,하나제약
3909,2018-10-04T15:55:58Z,빅토리의4번째주식단타매매영상,55,0,0,PT16M1S,비츠로시스
3910,2018-10-04T11:03:20Z,빅토리의3번째주식단타매매영상,18,0,0,PT3M9S,한국내화
3911,2018-10-04T10:48:14Z,빅토리의2번째주식단타매매영상,97,1,0,PT6M40S,아난티
3912,2018-10-04T10:06:05Z,빅토리의1번째주식단타매매영상,284,4,0,PT3M12S,현성바이탈


## KoNLPy

In [4]:
!pip install konlpy



In [5]:
from konlpy.tag import Okt

In [6]:
okt = Okt()

In [7]:
df['title'] = df['title'].apply(lambda x : okt.morphs(x, stem=True))
df['title'] = df['title'].apply(lambda x : ' '.join(x))

In [8]:
df.tail()

Unnamed: 0,publishedAt,title,viewCount,likeCount,commentCount,duration,description
3908,2018-10-05T02:23:28Z,빅토리 의 5 번 째 주 식단 타 매 매 영상,47,1,0,PT1M1S,하나제약
3909,2018-10-04T15:55:58Z,빅토리 의 4 번 째 주 식단 타 매 매 영상,55,0,0,PT16M1S,비츠로시스
3910,2018-10-04T11:03:20Z,빅토리 의 3 번 째 주 식단 타 매 매 영상,18,0,0,PT3M9S,한국내화
3911,2018-10-04T10:48:14Z,빅토리 의 2 번 째 주 식단 타 매 매 영상,97,1,0,PT6M40S,아난티
3912,2018-10-04T10:06:05Z,빅토리 의 1 번 째 주 식단 타 매 매 영상,284,4,0,PT3M12S,현성바이탈


## X and y
X : ```title```  
y : ```viewcount```

In [9]:
import numpy as np

In [10]:
X = np.array(df['title'])
y = np.array(df['viewCount'])
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (3913,)
y shape: (3913,)


In [11]:
print(len(X))
print(len(y))

3913
3913


X의 데이터와 데이터타입

In [12]:
print(X[-1])

빅토리 의 1 번 째 주 식단 타 매 매 영상


In [13]:
type(X[-1])

str

y의 데이터와 데이터타입

In [14]:
print(y[-1])

284


In [15]:
type(y[-1])

numpy.int64

```y``` 라벨 중앙값(median)으로 binary classification의 threshold를 정합니다.

In [16]:
threshold = np.median(y)
print(threshold)

2498.0


In [17]:
y = np.where(y>=threshold,1,0)
print("Positive Label:", len(y[y == 1]))
print("Negative Label:", len(y[y == 0]))

Positive Label: 1957
Negative Label: 1956


수정된 y의 데이터와 데이터타입

In [18]:
print(y[-1])

0


In [19]:
type(y[-1])

numpy.int32

## Train and Test

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test,y_train, y_test = train_test_split(X, y, 
                                                   random_state=42, 
                                                   test_size = 0.3)
print("X 훈련데이터 shape:", X_train.shape)
print("y 훈련데이터 shape:", y_train.shape)
print("X 실험데이터 shape:", X_test.shape)
print("y 실험데이터 shape:", y_test.shape)

X 훈련데이터 shape: (2739,)
y 훈련데이터 shape: (2739,)
X 실험데이터 shape: (1174,)
y 실험데이터 shape: (1174,)


## Vectorization
```CountVectorizer```를 사용하거나 ```TfidfVectorizer```를 사용합니다.

#### CountVector

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train = cv.fit_transform(X_train).toarray()
print("단어사전의 단어개수:", len(cv.vocabulary_))
print("X 훈련데이터 shape:", X_train.shape)
input_length = len(cv.vocabulary_)

단어사전의 단어개수: 5171
X 훈련데이터 shape: (2739, 5171)


#### TF-IDFVector

In [23]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer()
# X_train = tfidf.fit_transform(X_train).toarray()
# print("단어사전의 단어개수:", len(tfidf.vocabulary_))
# print("X 훈련데이터 shape:", X_train.shape)
# input_length = len(tfidf.vocabulary_)

적용결과

In [24]:
len(X_train[0])

5171

In [25]:
print(X_train[0])

[0 0 0 ... 0 0 0]


In [26]:
len(X_train)

2739

In [27]:
print(X_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Model

In [28]:
from keras.optimizers import Adagrad
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense

In [29]:
model = Sequential()
model.add(Dense(4, input_shape=(input_length,), activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(4, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

In [30]:
optimizer = Adagrad(learning_rate=0.3)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4)                 20688     
                                                                 
 dropout (Dropout)           (None, 4)                 0         
                                                                 
 dense_1 (Dense)             (None, 4)                 20        
                                                                 
 dropout_1 (Dropout)         (None, 4)                 0         
                                                                 
 dense_2 (Dense)             (None, 1)                 5         
                                                                 
Total params: 20,713
Trainable params: 20,713
Non-trainable params: 0
_________________________________________________________________


#### Learning

In [32]:
model.fit(X_train, y_train, epochs=20, batch_size=50)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1bc6caa46a0>

#### Evaluation

In [33]:
X_test = cv.transform(X_test).toarray()

In [34]:
from sklearn import metrics
_, accuracy = model.evaluate(X_test, y_test)



In [35]:
print(accuracy)

0.8347529768943787


Confusion Matrix

In [36]:
y_pred = model.predict(X_test)
confusion_matrix = metrics.confusion_matrix(y_test, np.rint(y_pred))



In [37]:
print(confusion_matrix)

[[466 151]
 [ 43 514]]


#### Save

In [38]:
from joblib import dump

In [39]:
dump(cv, "youtube_scaler.pkl")

['youtube_scaler.pkl']

In [40]:
from tensorflow import keras

In [41]:
model.save("youtube_model.h5")

## Prediction

#### load

In [42]:
from joblib import load

In [43]:
scaler = load("./database/youtube_scaler.pkl")

In [44]:
from tensorflow import keras

In [45]:
model = keras.models.load_model("./database/youtube_model.h5")

#### text 1

In [46]:
text = "인공지능으로 유튜브 조회수 예측하기 #2"

In [47]:
text = scaler.transform([text]).toarray()
print(text)

[[0 0 0 ... 0 0 0]]


In [48]:
prediction = model.predict(text)
print(prediction)

[[0.06669771]]


In [49]:
prediction = np.rint(prediction)
print(prediction)

[[0.]]


#### text 2

In [50]:
text = "hlb"

In [51]:
text = scaler.transform([text]).toarray()
print(text)

[[0 0 0 ... 0 0 0]]


In [52]:
prediction = model.predict(text)
print(prediction)

[[0.9973022]]


In [53]:
prediction = np.rint(prediction)
print(prediction)

[[1.]]
