In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labeled news titles from the mounted Google Drive.
csv_path = "/content/drive/MyDrive/명지대학교SW경진대회/데이터/news_title_labeling.csv"
df = pd.read_csv(csv_path, encoding="UTF-8")

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
titles, labels = df["제목"], df["라벨링"]
train_data, test_data, train_labels, test_labels = train_test_split(
    titles, labels, test_size=0.3, random_state=42
)

# Report the size of each split.
for caption, subset in (("훈련 데이터 개수:", train_data), ("테스트 데이터 개수:", test_data)):
    print(caption, len(subset))

훈련 데이터 개수: 4568
테스트 데이터 개수: 1958


In [None]:
!pip install konlpy

In [None]:
import konlpy
from konlpy.tag import Okt

# Tokens that are discarded after morphological analysis.
stopwords = ["의", "가", "이", "은", "들", "는", "좀", "잘", "걍", "과", "도", "를", "으로", "자", "에", "와", "한", "하다"]
okt = Okt()


def _tokenize_titles(sentences):
    """Tokenize each sentence with Okt and drop stopwords.

    Args:
        sentences: Iterable of raw title strings.

    Returns:
        A list with one list of stemmed morphemes per input sentence, with
        every token found in ``stopwords`` removed.
    """
    return [
        [word for word in okt.morphs(sentence, stem=True) if word not in stopwords]
        for sentence in sentences
    ]


# Apply the identical preprocessing to the training and test titles.
X_train = _tokenize_titles(train_data)
X_test = _tokenize_titles(test_data)

In [None]:
from keras.preprocessing.text import Tokenizer

# Integer-encode the tokenized titles; the vocabulary is fitted on the training set only.
max_words = 35000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train, X_test = (tokenizer.texts_to_sequences(tokens) for tokens in (X_train, X_test))

In [None]:
import numpy as np

def _one_hot_labels(labels):
    """One-hot encode sentiment labels.

    The column order is (-1, 0, 1), i.e. -1 -> [1, 0, 0], 0 -> [0, 1, 0],
    1 -> [0, 0, 1].

    Args:
        labels: Sequence (e.g. a pandas Series) of labels drawn from {-1, 0, 1}.

    Returns:
        Integer array of shape (len(labels), 3).

    Raises:
        ValueError: If any label is not -1, 0 or 1. Silently skipping such rows
            would leave the targets shorter than, and misaligned with, the
            corresponding inputs.
    """
    classes = np.array([-1, 0, 1])
    values = np.asarray(labels)
    # Row i, column j is True when labels[i] equals classes[j].
    matches = values[:, None] == classes[None, :]
    recognized = matches.any(axis=1)
    if not recognized.all():
        examples = values[~recognized][:5].tolist()
        raise ValueError(
            "Unexpected label value(s) (expected -1, 0 or 1): {}".format(examples)
        )
    return matches.astype(int)


# One-hot encode the train and test labels so the model can be trained as a 3-class classifier.
y_train = _one_hot_labels(train_labels)
y_test = _one_hot_labels(test_labels)

In [None]:
!pip install tensorflow

In [None]:
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Common length that every encoded title is brought to.
max_len = 30

# Pad both splits to max_len so they can be batched as fixed-size arrays.
X_train, X_test = (pad_sequences(seqs, maxlen=max_len) for seqs in (X_train, X_test))

In [None]:
# Define the classifier: embedding -> LSTM -> 3-way softmax.
model = Sequential([
    Embedding(max_words, 100),
    LSTM(128),
    Dense(3, activation="softmax"),
])

# Compile and train, holding out 10% of the training data for validation.
model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=10,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate on the held-out test set; index 1 of the result is the accuracy metric
# (index 0 is the loss).
test_scores = model.evaluate(X_test, y_test)
test_accuracy_pct = test_scores[1] * 100
print("\n 테스트 정확도: {:.2f}%".format(test_accuracy_pct))


 테스트 정확도: 81.26%


In [None]:
# Use the trained model to label every title.
# Take an explicit copy of the needed columns so that assigning new columns to
# `df` later does not trigger pandas' SettingWithCopyWarning.
df = df[["날짜", "종목명", "제목"]].copy()
title = df["제목"]

def analyze_emotion(title):
    """Predict the sentiment label of a single news title.

    The title goes through the same steps as the training data: Okt
    morphological analysis with stemming, stopword removal, integer encoding
    with the fitted tokenizer, and padding to ``max_len``.

    Args:
        title: Raw title string.

    Returns:
        The predicted label as a string: "-1", "0" or "1".
    """
    # Preprocess and tokenize the input.
    title_tokens = okt.morphs(title, stem=True)
    title_tokens = [word for word in title_tokens if word not in stopwords]
    title_sequences = tokenizer.texts_to_sequences([title_tokens])
    title_padded = pad_sequences(title_sequences, maxlen=max_len)

    # Predict class probabilities. verbose=0 suppresses the per-call progress
    # bar, which would otherwise be printed once for every row this is applied to.
    class_probabilities = model.predict(title_padded, verbose=0)
    predicted = np.argmax(class_probabilities)

    # Output index order follows the one-hot encoding used for training:
    # index 0 -> -1, index 1 -> 0, index 2 -> 1.
    emotions = ["-1", "0", "1"]
    predicted_emotion = emotions[predicted]

    return predicted_emotion

# Drop incomplete rows and rows whose date does not contain "-" BEFORE predicting,
# so the model is never run on rows that are discarded anyway and never sees a
# missing title. The explicit copy avoids SettingWithCopyWarning on the
# assignments below.
df = df.dropna()
df = df[df['날짜'].str.contains('-')].copy()
df["날짜"] = pd.to_datetime(df["날짜"])

# Predict a sentiment label for every remaining title and store it as an integer.
df["감정"] = df["제목"].apply(analyze_emotion).astype(int)

In [None]:
# Write the labeled titles to a UTF-8 CSV file, omitting the DataFrame index column.
df.to_csv("뉴스기사제목_감정분석.csv", encoding="UTF-8", index=False)