# Youtube

## Load

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./database/youtube.csv')

In [3]:
df.tail()

Unnamed: 0,publishedAt,title,viewCount,likeCount,commentCount,duration,description
3803,2018-10-05T02:23:28Z,빅토리의5번째주식단타매매영상,47,1,0,PT1M1S,하나제약
3804,2018-10-04T15:55:58Z,빅토리의4번째주식단타매매영상,55,0,0,PT16M1S,비츠로시스
3805,2018-10-04T11:03:20Z,빅토리의3번째주식단타매매영상,18,0,0,PT3M9S,한국내화
3806,2018-10-04T10:48:14Z,빅토리의2번째주식단타매매영상,97,1,0,PT6M40S,아난티
3807,2018-10-04T10:06:05Z,빅토리의1번째주식단타매매영상,284,4,0,PT3M12S,현성바이탈


## KoNLPy

In [4]:
!pip install konlpy



In [5]:
from konlpy.tag import Okt

In [6]:
okt = Okt()

In [7]:
df['title'] = df['title'].apply(lambda x : okt.morphs(x, stem=True))
df['title'] = df['title'].apply(lambda x : ' '.join(x))

In [8]:
df.tail()

Unnamed: 0,publishedAt,title,viewCount,likeCount,commentCount,duration,description
3803,2018-10-05T02:23:28Z,빅토리 의 5 번 째 주 식단 타 매 매 영상,47,1,0,PT1M1S,하나제약
3804,2018-10-04T15:55:58Z,빅토리 의 4 번 째 주 식단 타 매 매 영상,55,0,0,PT16M1S,비츠로시스
3805,2018-10-04T11:03:20Z,빅토리 의 3 번 째 주 식단 타 매 매 영상,18,0,0,PT3M9S,한국내화
3806,2018-10-04T10:48:14Z,빅토리 의 2 번 째 주 식단 타 매 매 영상,97,1,0,PT6M40S,아난티
3807,2018-10-04T10:06:05Z,빅토리 의 1 번 째 주 식단 타 매 매 영상,284,4,0,PT3M12S,현성바이탈


## X and y
X : ```title```  
y : ```viewcount```

In [9]:
import numpy as np

In [10]:
X = np.array(df['title'])
y = np.array(df['viewCount'])
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (3808,)
y shape: (3808,)


```y``` 라벨 중앙값(median)으로 binary classification의 threshold를 정합니다.

In [11]:
threshold = np.median(y)
print(threshold)

2626.0


In [12]:
y = np.where(y>=threshold,1,0)
print("Positive Label:", len(y[y == 1]))
print("Negative Label:", len(y[y == 0]))

Positive Label: 1904
Negative Label: 1904


## Train and Test

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test,y_train, y_test = train_test_split(X, y, 
                                                   random_state=42, 
                                                   test_size = 0.3)
print("X 훈련데이터 shape:", X_train.shape)
print("y 훈련데이터 shape:", y_train.shape)
print("X 실험데이터 shape:", X_test.shape)
print("y 실험데이터 shape:", y_test.shape)

X 훈련데이터 shape: (2665,)
y 훈련데이터 shape: (2665,)
X 실험데이터 shape: (1143,)
y 실험데이터 shape: (1143,)


## Vectorization
```CountVectorizer```를 사용하거나 ```TfidfVectorizer```를 사용합니다.

#### CountVector

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train = cv.fit_transform(X_train).toarray()
print("단어사전의 단어개수:", len(cv.vocabulary_))
print("X 훈련데이터 shape:", X_train.shape)

단어사전의 단어개수: 5159
X 훈련데이터 shape: (2665, 5159)


#### TF-IDFVector

In [16]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer()
# X_train = tfidf.fit_transform(X_train).toarray()
# print("단어사전의 단어개수:", len(tfidf.vocabulary_))
# print("X 훈련데이터 shape:", X_train.shape)

## Model

In [17]:
from keras.optimizers import Adagrad
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense

In [18]:
model = Sequential()
model.add(Dense(4, input_dim=len(cv.vocabulary_), activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(4, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

In [19]:
optimizer = Adagrad(learning_rate=0.3)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4)                 20640     
                                                                 
 dropout (Dropout)           (None, 4)                 0         
                                                                 
 dense_1 (Dense)             (None, 4)                 20        
                                                                 
 dropout_1 (Dropout)         (None, 4)                 0         
                                                                 
 dense_2 (Dense)             (None, 1)                 5         
                                                                 
Total params: 20,665
Trainable params: 20,665
Non-trainable params: 0
_________________________________________________________________


#### Learning

In [21]:
model.fit(X_train, y_train, epochs=20, batch_size=50)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2a7680a7ac0>

#### Evaluation

In [22]:
X_test = cv.transform(X_test).toarray()

In [23]:
from sklearn import metrics
_, accuracy = model.evaluate(X_test, y_test)



In [24]:
print(accuracy)

0.8223971724510193


Confusion Matrix

In [25]:
y_pred = model.predict(X_test)
confusion_matrix = metrics.confusion_matrix(y_test, np.rint(y_pred))



In [26]:
print(confusion_matrix)

[[431 125]
 [ 78 509]]


#### Save

In [27]:
from joblib import dump

In [28]:
dump(cv, "youtube_scaler.pkl")

['youtube_scaler.pkl']

In [29]:
dump(model, "youtube_model.pkl")

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\dense_2
......vars
.........0
.........1
...layers\dropout
......vars
...layers\dropout_1
......vars
...metrics\mean
......vars
.........0
.........1
...metrics\mean_metric_wrapper
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-01-18 16:02:42         2336
metadata.json                                  2023-01-18 16:02:42           64
variables.h5                                   2023-01-18 16:02:42       103032


['youtube_model.pkl']

## Prediction

In [30]:
from joblib import load

In [31]:
scaler = load("./database/youtube_scaler.pkl")
model = load("./database/youtube_model.pkl")

Keras model archive loading:
File Name                                             Modified             Size
config.json                                    2023-01-18 14:55:42         2336
metadata.json                                  2023-01-18 14:55:42           64
variables.h5                                   2023-01-18 14:55:42       103032
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\dense_2
......vars
.........0
.........1
...layers\dropout
......vars
...layers\dropout_1
......vars
...metrics\mean
......vars
.........0
.........1
...metrics\mean_metric_wrapper
......vars
.........0
.........1
...vars


In [32]:
text = "인공지능으로 유튜브 조회수 예측하기 #2"

In [33]:
text = scaler.transform([text])

In [34]:
prediction = model.predict(text)



In [35]:
print(prediction)

[[0.06948145]]


In [36]:
prediction = np.rint(prediction)

In [37]:
print(prediction)

[[0.]]
