In [1]:
from bs4 import BeautifulSoup
import requests
import re
import os
import pandas as pd

In [2]:
# Placeholder: replace 'your directory' with the folder that holds the input
# files, since the later cells open them by relative path.
os.chdir('your directory')

In [3]:
# aggregated.csv: file holding every collected article link together with its body text.
raw_data = pd.read_csv('aggregated.csv', dtype=str, usecols=['date','title','link','content'])

# Build the string row-number column from the actual row count rather than a
# hard-coded 2500, so a file of any length gets a complete 'index' column
# (a longer file previously ended up with NaN in the extra rows).
index = pd.DataFrame({'index': list(range(len(raw_data)))}, dtype=str)
raw_data.insert(0, 'index', index['index'])

# Article bodies only; this is what the sentiment scoring consumes.
content_df = raw_data['content']

In [4]:
from nltk import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords

# NLTK English stop words, held as a set so membership checks are cheap.
stops = set(stopwords.words("english"))

Loughran and McDonald (2011) Financial Dictionary 사용: 긍정 및 부정 어휘 

In [5]:
def _read_word_list(path):
    """Read a text file into a single 'word_list' column and return that column as a list."""
    # quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text.
    frame = pd.read_csv(path, names=['word_list'], quoting=3)
    return frame['word_list'].tolist()


# Positive and negative word lists used for sentiment matching.
dic_pos = _read_word_list("fin_pos.txt")
dic_neg = _read_word_list("fin_neg.txt")

날짜 클렌징 작업 

In [6]:
def cleansing_date(row):
    """Extract the calendar date from a raw date string and return it as 'YYYY-MM-DD'.

    Parameters
    ----------
    row : str
        Raw date text, e.g. 'Mar 24, 2020 at 18:52 | Julianne Geiger'.

    Returns
    -------
    str
        The date formatted as '%Y-%m-%d', e.g. '2020-03-24'.

    Raises
    ------
    ValueError
        If no '<month> <day>, <year>' fragment is present, or the month name
        cannot be parsed as an abbreviated or full month name.
    """
    from datetime import datetime

    # Take only the first date-like fragment (joining several matches would
    # produce an unparseable string) and accept 1- or 2-digit days.
    match = re.search(r'[a-zA-Z]+\s\d{1,2},\s\d{4}', row)
    if match is None:
        raise ValueError('no date found in {!r}'.format(row))
    only_date = match.group(0)

    # Abbreviated month ('Mar') first, then the full name ('March').
    for fmt in ('%b %d, %Y', '%B %d, %Y'):
        try:
            parsed = datetime.strptime(only_date, fmt)
        except ValueError:
            continue
        return parsed.strftime('%Y-%m-%d')

    raise ValueError('unrecognised date {!r} in {!r}'.format(only_date, row))

In [7]:
# Clean every raw date string, keeping the original row order.
new_date = [cleansing_date(raw) for raw in raw_data['date']]

In [8]:
def calculate_sentiment(text):
    """Score one article body against the positive and negative word lists.

    The score is (positives - negatives) / (positives + negatives), counted
    over the tokens that remain after cleansing and stop-word removal, so it
    lies in [-1, 1].

    Parameters
    ----------
    text : str
        Raw article body.

    Returns
    -------
    float or str
        ``0.0`` when no token matches either word list; otherwise the score
        formatted as a two-decimal string such as ``'-0.33'``. The mixed
        return type is kept for backward compatibility; convert with
        ``float`` or ``pd.to_numeric`` before doing arithmetic.
    """
    # --- Cleansing ---
    # Drop the trailing 250 characters (presumably page boilerplate -- TODO confirm).
    text = text[:-250]
    # Remove everything from 'googletag' to the end of that line when a ';' follows.
    text = re.sub(r'(?=googletag)(.*)(?=;).+', '', text)
    # Keep letters and whitespace only. Raw string: '\s' in a plain literal is
    # an invalid escape sequence and triggers a warning on recent Python.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    text = ' '.join(text.split())   # collapse whitespace runs to single spaces
    # Cap the length; the original note says bodies average about 3500 characters.
    text = text[0:3499]

    # --- Tokenising and stop-word removal ---
    # Tokenise each sentence once. Tokenising the full text inside the
    # per-sentence loop would duplicate every word once per sentence.
    word_list = [
        token
        for sentence in sent_tokenize(text)
        for token in word_tokenize(sentence)
        if token not in stops
    ]

    # --- Sentiment index ---
    # Sets give constant-time membership; the counts match the list-based version.
    pos_words = set(dic_pos)
    neg_words = set(dic_neg)
    pos_count = sum(1 for word in word_list if word in pos_words)
    neg_count = sum(1 for word in word_list if word in neg_words)

    total_sent = pos_count - neg_count
    total_length = pos_count + neg_count

    # No token matched either word list.
    if total_length == 0:
        return 0.00

    sent_index = total_sent / total_length
    return '{:.2f}'.format(sent_index)

In [9]:
# Score every article body. Iterating the Series itself, rather than looking
# up labels 0..2499, keeps this cell correct for any number of rows.
sent_list = [calculate_sentiment(body) for body in content_df]

In [10]:
# One row per article: cleaned date plus sentiment score.
# No helper 'index' column is built: a fixed-length one would be dropped again
# immediately and would make construction fail for any other row count.
sent_df = pd.DataFrame({'date': new_date, 'sentiment': sent_list})
sent_df = sent_df.sort_values('date', ascending=True)  # ascending by date
sent_df['sentiment'] = pd.to_numeric(sent_df['sentiment'])  # scores may be float or str; make them numeric

일별기준 총 감성지수 계산  

In [11]:
# Group rows by date, then sum the per-article scores within each day and
# turn 'date' back into an ordinary column.
sent_df_grouped = sent_df.groupby('date')
sent_df_grouped_sum = sent_df_grouped.sum().reset_index(level=['date'])

# Save
#sent_df_grouped_sum.to_csv('daily_sent.csv', sep=',', index=False, encoding='utf-8')

In [12]:
# Preview the first ten daily totals.
sent_df_grouped_sum.head(10)

Unnamed: 0,date,sentiment
0,2015-12-14,0.0
1,2015-12-15,-0.53
2,2015-12-16,0.09
3,2015-12-18,-0.95
4,2015-12-21,-2.19
5,2015-12-23,-0.33
6,2015-12-28,-2.09
7,2015-12-29,-1.0
8,2016-01-01,-0.78
9,2016-01-02,-0.16
