In [7]:
import subprocess
from flask import Flask, request, jsonify
import numpy as np
import tensorflow as tf

flask_process = subprocess.Popen(['python3', 'apps.py'])

print(f"Flask app is running with PID {flask_process.pid}")

Flask app is running with PID 37102


In [8]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("kakaobank/kf-deberta-base")
model = AutoModelForMaskedLM.from_pretrained("kakaobank/kf-deberta-base")

python3: can't open file '/home/ubuntu/code/DE_ML/HITIT_Server/apps.py': [Errno 2] No such file or directory


KeyboardInterrupt: 

# Tokenize하기

In [None]:
input_text = "서유준의 매출이 하락했으며 장래가 불투명합니다"
tokens = tokenizer.tokenize(input_text) #['서유','##준','##의','매출','##이','하락','##했', ##으며','장래',...] 이런식으로

inputs = tokenizer(input_text, return_tensors="pt") #카카오 그 모델에 등록된 단어와 매칭시켜서 숫자로 

model_output = model(**inputs)

# [MASK]된 단어 예측하기

In [None]:
# 모델을 사용하여 예측 logits 계산

input_text = "매출이 상승했으며 [MASK]가 유망합니다"
tokens = tokenizer.tokenize(input_text)
print(tokens)

inputs = tokenizer(input_text, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# 토큰의 가장 가능성 높은 예측값 가져오기
predicted_token_id = torch.argmax(logits[0, 3]).item()  # [MASK] 위치의 토큰 (여기서 3번째 위치)
predicted_token = tokenizer.convert_ids_to_tokens(predicted_token_id)

# # [MASK] 토큰을 예측된 토큰으로 대체하여 문장 복원
restored_text = input_text.replace("[MASK]", predicted_token)
print(f"Restored text: {restored_text}")

['매출', '##이', '상승', '##했', '##으며', '[MASK]', '가', '유망', '##합니다']
Restored text: 매출이 상승했으며 상승가 유망합니다


# 감정분석하기

In [4]:
from transformers import pipeline
import warnings
warnings.filterwarnings("ignore")
# 감정 분석 파이프라인 로드
sentiment_model = pipeline(model="WhitePeak/bert-base-cased-Korean-sentiment")
classifier = pipeline("text-classification", model="matthewburke/korean_sentiment")

In [32]:
custom_tweet = "증권사 출범 앞둔 우리금융, 전 계열사 조직 진단 나선다"
preds = classifier(custom_tweet, return_all_scores=True)
preds[0][0]['label'], preds[0][1]['label'] = "Negative", "Positive"

flat_data = [item for sublist in preds for item in sublist]
max_score_dict = max(flat_data, key=lambda x: x['score'])

print(max_score_dict)

{'label': 'Positive', 'score': 0.5479901432991028}


In [15]:

is_positive = preds[0][1]['score'] > 0.5
is_positive
def get_sentiment2(text):
    result = classifier(text, return_all_scores=True)
    print(result)
    is_positive = preds[0][1]['score'] > 0.5
    result = {}
    if is_positive == True:
        result['label'] = 'POSITIVE'
    else:
        result['label'] = "NEGATIVE"
    result['score'] = preds[0][1]['score']
    return result
get_sentiment2("야 기분 좋다~")

[[{'label': 'LABEL_0', 'score': 0.03761640191078186}, {'label': 'LABEL_1', 'score': 0.9623836278915405}]]


{'label': 'POSITIVE', 'score': 0.9731518626213074}

In [12]:
def get_sentiment(text):
    result = sentiment_model(text)
    for elem in result :
        if elem['label'] == "LABEL_1" :
            elem['label'] = "POSITIVE"
        else :
            elem['label'] = "NEGATIVE"
        elem['score'] = round(elem['score'],5)
    
    # print(f"Sentiment analysis result: {result}")
    return result[0]

get_sentiment("야 기분 나쁘다")

{'label': 'NEGATIVE', 'score': 0.99221}

# 뉴스데이터 크롤링하기

In [3]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

from pykrx import stock
from pykrx import bond

current_date = datetime.now().strftime("%Y%m%d")
tickers = stock.get_market_ticker_list(current_date)
print(tickers)
# print(current_date_formatted)

['095570', '006840', '027410', '282330', '138930', '001460', '001465', '001040', '079160', '00104K', '000120', '011150', '011155', '001045', '097950', '097955', '000480', '000590', '012030', '016610', '005830', '000990', '139130', '001530', '000210', '000215', '375500', '37550L', '37550K', '007340', '004840', '155660', '069730', '017860', '017940', '365550', '383220', '007700', '114090', '078930', '006360', '001250', '007070', '078935', '012630', '039570', '089470', '294870', '009540', '267250', '267270', '443060', '010620', '322000', '042670', '267260', '329180', '097230', '014790', '003580', '204320', '060980', '011200', '035000', '003560', '175330', '234080', '001060', '001067', '001065', '096760', '105560', '432320', '002380', '344820', '009070', '009440', '119650', '092220', '003620', '016380', '001390', '033180', '015590', '001940', '025000', '092230', '000040', '044450', '030200', '033780', '058850', '058860', '093050', '003550', '034220', '051900', '051905', '373220', '003555',

In [4]:
def naver_news_crawling(keyword_list, day):
    result_object = {}  # 결과를 저장할 객체

    for keyword in keyword_list:
        url = f"https://search.naver.com/search.naver?where=news&query={keyword}&sm=tab_opt&sort=0&photo=0&field=0&pd=3&ds={day}&de={day}&docid=&related=0&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so%3Ar%2Cp%3Afrom20240101to20240101&is_sug_officeid=0&office_category=0&service_area=0"
        response = requests.get(url, headers={'Content-Type': 'application/json'})
        
        soup = BeautifulSoup(response.content, 'html.parser')
        result = []
        for item in soup.select('.news_wrap'):
            press_element = item.select_one('a.info.press')
            # print()
            press = item.select_one('a.info.press').get_text().replace(' 선정', '').strip()
            anchor = item.select_one('a.news_tit')
            title = anchor.get_text().strip()
            url = anchor['href']
            dsc = item.select_one('.news_dsc').get_text().strip()

            result.append({
                'press': press,
                'title': title,
                'url': url,
                'dsc': dsc,
            })

        result_object[keyword] = result  # 키워드를 키로 사용하여 결과 저장
    return result_object  # 객체 반환

In [10]:
keyword_list = ["삼성전자"]
day = "2024.06.24"
results = naver_news_crawling(keyword_list, day)
print(results)
#뉴스통해서 받은 제목과 기사내용을 바탕으로 감정분석하기
for keyword, articles in results.items():
    print(f"Keyword: {keyword}")
    
    for article in articles:
        print(f"Description: {article['dsc']}")
        print(article['dsc'])
        returned = get_sentiment(article['dsc'])
        print(returned)
        print("\n------------------------------\n")

{'삼성전자': [{'press': '연합뉴스', 'title': '삼성전자, 중소·중견기업에 특허 231건 무상 제공', 'url': 'https://www.yna.co.kr/view/AKR20240624034500003?input=1195m', 'dsc': "삼성전자가 중소·중견기업들에 활용 가치가 높은 특허 231건을 무상으로 제공한다. 산업통상자원부는 24일 '2024년도 산업통상자원부·삼성전자 기술나눔 공고'를 내고 특허를 제공받을 중소·중견기업을 모집한다고 밝혔다. 기술나눔은 동반 성장 차원에서 대기업이나 공공기관이 가진 미활용 우수..."}, {'press': '세계일보언론사', 'title': '다가오는 ‘여름 보너스’ 시즌… 삼성전자 등 중간배당 시행', 'url': 'http://www.segye.com/newsView/20240624502035?OutUrl=naver', 'dsc': '삼성전자를 비롯한 국내 69개 상장사가 6월 말 중간배당을 계획을 밝혔다. 24일 한국거래소에 따르면 지난 20일까지 6월 말 기준 중간배당(분기배당 포함) 권리주주 확정을 위해 주주명부폐쇄 결정을 공시한 12월 결산법인은 코스피 49개, 코스닥 20개 등 총 69개사다. 사진=연합뉴스 유가증권시장에서는...'}, {'press': 'SBS', 'title': "LG·삼성전자, 미 컨슈머리포트 선정 '올해 최고의 TV' 휩쓸어", 'url': 'https://news.sbs.co.kr/news/endPage.do?news_id=N1007695100&plink=ORI&cooper=NAVER', 'dsc': "▲ 2024년형 LG 올레드 에보(G4)를 체험하는 모습 LG전자와 삼성전자가 미국의 유력 소비자 매체인 컨슈머리포트가 발표한 올해 최고의 TV 부문을 휩쓸었습니다. 24일 업계에 따르면 컨슈머리포트가 최근 발표한 '전문가가 평가한 올해 최고의 TV' 8종 중 4종이 LG 올레드(OLED·유기발광다이오드)..."}, {'press': '뉴시스', 'title

In [None]:
connection.close()

In [10]:
from mysqlconnect import connect_to_mysql

connection = connect_to_mysql()
cursor = connection.cursor()
cursor.execute("select stock_code,name from stocks_products where (DATE(sentiment_update) IS NULL OR DATE(sentiment_update) <> DATE(NOW()))")

rows = cursor.fetchall()
stocks = [ elem[:2] for elem in rows] #(종목코드, 종목명 가져와보리기) DB에서

current_date = datetime.now().strftime("%Y.%m.%d")

for elem in stocks:
    results = naver_news_crawling([elem[1]],current_date)
    print(f"index : {stocks.index(elem)}/{len(stocks)}, {keyword}")
    try : 
        for keyword, articles in results.items():
            positive, negative = 0, 0
            for article in articles:          
                # print(f"Description: {article['dsc']}")
                returned = get_sentiment(article['dsc'])
                # print(returned)
                if returned['label'] == "NEGATIVE":
                    positive += 1
                else :
                    negative += 1
            # print(f"positive : {positive}, negative : {negative}")
            sentiment = 1 if positive > negative else 0
            
            query = f"""\
            UPDATE stocks_products 
            SET sentiment = {sentiment} ,
            sentiment_update = NOW()
            where stock_code = {elem[0]}
            """
            # cursor.execute(query)
            connection.commit()
    except:
        print(f"PASS {keyword}")
        continue
        
connection.close()
# for ticker in stock.get_market_ticker_list():
#         종목 = stock.get_market_ticker_name(ticker)
#         print(종목)

USER: hitit-user, HOST: hitit-db-mydata.c9oy8g6q0v76.ap-northeast-2.rds.amazonaws.com, PORT: 3306, DATABASE: ml
Connected to MySQL database
index : 0/2615, 삼성전자
index : 1/2615, 상상인증권
index : 2/2615, 백광산업
index : 3/2615, 삼성제약
index : 4/2615, SG글로벌
index : 5/2615, KG케미칼
index : 6/2615, 태원물산
index : 7/2615, 세아베스틸지주
index : 8/2615, 대한전선
index : 9/2615, 현대해상
index : 10/2615, BYC
index : 11/2615, 삼부토건
PASS 현대차증권
index : 12/2615, 현대차증권
index : 13/2615, SK증권
index : 14/2615, 동양
index : 15/2615, DI동일
index : 16/2615, 안국약품
index : 17/2615, 조비
index : 18/2615, 제일연마
index : 19/2615, 금양
index : 20/2615, 케이비아이동국실업
index : 21/2615, 종근당홀딩스
index : 22/2615, 대상
index : 23/2615, 신영증권


In [None]:
# 모바일 내에서 검색한 뉴스들의 제목 & 내용 가져옴
def mobile_news_crawling(keyword_list, day):
    result_object = {}  # 결과를 저장할 객체

    for keyword in keyword_list:
        url = f"https://m.search.naver.com/search.naver?where=m_news&query={keyword}&sm=mtb_opt&sort=0&photo=0&field=0&pd=3 \
        &ds={day}&de={day}&docid=&related=0&mynews=0&office_type=0&office_section_code=0&news_office_checked=&is_sug_officeid=0 \
        &office_category=0&service_area=0"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        result = []
        for item in soup.select('.list_news .bx'):
            
            press = item.select_one('a.info.press').get_text().replace(' 선정', '').strip()
            anchor = item.select_one('a.news_tit')
    
            title = anchor.get_text().strip()
            url = anchor['href']
            dsc = item.select_one('.news_dsc').get_text().strip()
            result.append({
                'press': press,
                'title': title,
                'url': url,
                'dsc': dsc,
            })

        result_object[keyword] = result  # 키워드를 키로 사용하여 결과 저장
    
    return result_object  # 객체 반환


In [None]:
from datetime import datetime
current_date_formatted = datetime.now().strftime("%Y.%m.%d")

2024.06.20


In [None]:
import requests
#네이버 모바일에서의 뉴스 포맷을 크롤링
def get_mobile_dsc(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    result = []
    for item in soup.find('article', id='dic_area'):
        result.append(item.get_text(strip=True))
    
    return result
    
mobile_result = mobile_news_crawling(['삼성전자'], "2024.06.01")
get_mobile_dsc("https://n.news.naver.com/article/262/0000017491?sid=101")

print()




In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random

In [None]:
def fetch_page(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response
    except requests.RequestException as e:
        print(f"Error fetching page: {url}", e)
        return None

def decode_html(response):
    content_type = response.headers.get("content-type")
    charset = "UTF-8"
    if "charset=" in content_type:
        charset = content_type.split("charset=")[1]
    return response.content.decode(charset)

def fetch_main(url, start_page, end_page):
    response = fetch_page(url)
    if response:
        data = decode_html(response)
        soup = BeautifulSoup(data, 'html.parser')
        title = "뉴스 리스트영역"
        name = "news"; 
        
        news_src = soup.find('iframe', {'title': title, 'name': name})['src']
        news_src = news_src.replace("&page=&clusterId=","")

        results = []
        for page in range(start_page, end_page + 1):
            page_url = f"{base_url}{news_src}&page={page}"
            results.extend(fetch_news(page_url, end_page))
            
        return results
    
#url 하나 당 뉴스 쫘라락 있는데 그거 가져오기
def fetch_news(url, end_page): 
    response = fetch_page(url)
    if response:
        data = decode_html(response)
        soup = BeautifulSoup(data, 'html.parser')
        results = []
        table_rows = soup.select("table.type5 tr")
            
        for row in table_rows:
            cells = row.find_all('td')
            if len(cells) == 3:
                title = cells[0].text.strip()
                link = base_url + cells[0].a['href']
                provider = cells[1].text.strip()
                date = cells[2].text.strip()
                results.append({
                    "title": title,
                    "link": link,
                    "provider": provider,
                    "date": date,
                })
        return results
    return []



In [None]:
code = "005930"
url = f"https://finance.naver.com/item/news.nhn?code={code}"

returned_data = fetch_main(url, 1, 5)
for data in returned_data :
    print(data['title'])
    print(data['link'])

NameError: name 'base_url' is not defined

In [None]:
import socket

def get_local_ip():
    try:
        hostname = socket.gethostname()
        local_ip = socket.gethostbyname(hostname)
        return local_ip
    except socket.error as e:
        print(f"Error fetching local IP: {e}")
        return None

local_ip = get_local_ip()
print(f"Local IP: {local_ip}")

Local IP: 172.31.13.91


In [None]:
from transformers import pipeline
from flask import Flask, request, jsonify
import numpy as np
import tensorflow as tf
import joblib
# 감정 분석 파이프라인 로드

def get_sentiment(text):
    result = sentiment_model(text)
    for elem in result :
        if elem['label'] == "LABEL_1" :
            elem['label'] = "POSITIVE"
        else :
            elem['label'] = "NEGATIVE"
        elem['score'] = round(elem['score'],5)

    print(f"Sentiment analysis result: {result}")
    return result


app = Flask(__name__)

@app.route('/sentiment', methods=['POST'])
def predict():
    global sentiment_model
    try:
        data = request.json['data']
        sentiment_result = get_sentiment(data)
        
        print(sentiment_result)
        return jsonify({'sentiment analysis': sentiment_result })
    except Exception as e:
        return jsonify({'error': str(e)})

if __name__ == "__main__":
    global sentiment_model
    sentiment_model = pipeline(model="WhitePeak/bert-base-cased-Korean-sentiment")
    # print(f'Model loaded from {model_path}')

    from waitress import serve
    serve(app, host="0.0.0.0", port=8080)

Sentiment analysis result: [{'label': 'NEGATIVE', 'score': 0.98365}]
[{'label': 'NEGATIVE', 'score': 0.98365}]
