In [None]:
!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from konlpy.tag import Mecab
import re

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
################################ 전처리및 형태소분석##########################
train = pd.read_csv("/content/drive/MyDrive/인공지능/open/news_train.csv")
test = pd.read_csv("/content/drive/MyDrive/인공지능/open/news_test.csv")
def text_preprocessing(text_list):
    
    stopwords = ['을', '를', '이', '가', '은', '는', 'null'] #불용어 설정
    tokenizer = Mecab() #형태소 분석 Mecab 함수
    token_list = []
    
    for text in text_list:
        txt = re.sub('[^가-힣a-z]', ' ', text) #한글과 영어 소문자
        token = tokenizer.morphs(txt) #형태소 분석
        token = [t for t in token if t not in stopwords or type(t) != float] 
        token_list.append(token)
        
    return token_list, tokenizer


In [5]:
train['token'], mecab = text_preprocessing(train['content'])


In [6]:
test['token'], mecab2= text_preprocessing(test['content'])


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
#########################vectorization##########################
def text2sequence(train_text, max_len=100):
    
    tokenizer = Tokenizer() #vectorizing
    tokenizer.fit_on_texts(train_text) #fit
    train_X_seq = tokenizer.texts_to_sequences(train_text) 
    vocab_size = len(tokenizer.word_index) + 1 
    print('vocab_size : ', vocab_size)
    X_train = pad_sequences(train_X_seq, maxlen = max_len) #최대 길이 padding
    
    return X_train, vocab_size, tokenizer
train_y = train['info']
train_x, vocab_size, vectorizer = text2sequence(train['token'], max_len = 100)
test_X_seq=vectorizer.texts_to_sequences(test['token'])
test_x= pad_sequences(test_X_seq, maxlen = 100)
vocab_size = len(vectorizer.word_index) + 1
vocab_size

vocab_size :  35905


35905

In [8]:
import gensim
word2vec = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz", binary = True)

In [None]:
########################   임베딩   ##########################
embedding_matrix = np.zeros((vocab_size, 300)) #300차원의 임베딩 매트릭스 생성

for index, word in enumerate(vectorizer.word_index): #vocabulary에 있는 토큰들을 하나씩 넘겨줍니다.
    if word in word2vec: #넘겨 받은 토큰이 word2vec에 존재하면(이미 훈련이 된 토큰이라는 뜻)
        embedding_vector = word2vec[word] #해당 토큰에 해당하는 vector를 불러오고
        embedding_matrix[index] = embedding_vector #해당 위치의 embedding_mxtrix에 저장합니다.
    else:
        print("word2vec에 없는 단어입니다.")

In [11]:
embedding_matrix

array([[-0.06494141, -0.04272461,  0.16601562, ...,  0.02539062,
         0.10986328,  0.29882812],
       [-0.03491211, -0.08642578,  0.14648438, ..., -0.08886719,
         0.15039062,  0.21777344],
       [-0.046875  , -0.03637695,  0.09423828, ..., -0.00408936,
         0.1328125 ,  0.17773438],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [10]:
########################  모델링  ##########################

from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D,SpatialDropout1D,LSTM,Dropout
#model = tf.keras.Sequential()
#model.add(Embedding(vocab_size,128, input_length = 100) )
model = tf.keras.Sequential()
model.add(Embedding(vocab_size, 300, weights = [embedding_matrix], input_length = 100))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu', kernel_regularizer = tf.keras.regularizers.l2(0.001)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'binary_crossentropy'])

In [None]:
model.summary()

In [None]:
hist=model.fit(
    train_x,
    train_y,
    epochs=7,
    verbose=1,
    validation_split = 0.2)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7

In [None]:
import matplotlib.pyplot as plt   

plt.plot(hist.history['accuracy'])
plt.plot(hist.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['accuracy', 'val_accuracy'], loc='upper left')

In [None]:
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['loss', 'val_loss'], loc='upper left')

In [None]:

sample_submission = pd.read_csv("/content/drive/MyDrive/인공지능/open/sample_submission.csv")

pred_test = model.predict(test_x)
sample_submission.loc[:,'info'] = np.where(pred_test> 0.5, 1,0).reshape(-1)


In [None]:
sample_submission.loc[:,["id","info"]].to_csv("/content/drive/MyDrive/인공지능/LSTM_word2vec.csv", index = False)
sample_submission