## 뉴스기사 신뢰도 평가

### Mecab 설치

In [None]:
!apt-get update 
!apt-get install g++ openjdk-8-jdk 
!pip install konlpy JPype1-py3
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

### 모듈 불러오기

In [2]:
import logging
import re

import requests
from bs4 import BeautifulSoup
from konlpy.tag import Mecab

### 신뢰도 평가지표 점수 계산 함수

In [3]:
def byline_score(doc):
    """Score how much author information an article's bylines carry.

    Args:
        doc: Mapping that may hold a ``"bylines"`` entry. The entry is either
            a list of byline dicts or a single byline dict; a byline dict may
            contain the keys ``"name"`` and/or ``"email"``.

    Returns:
        1 when both a name and an email are present, 0.8 when only a name is
        present, 0 when only an email is present, -1 when neither is present.
    """
    # Use the literal key for both the membership test and the lookup so the
    # function does not depend on a global that is defined elsewhere.
    bylines = doc["bylines"] if "bylines" in doc else None
    if not bylines:
        bylines = []
    elif isinstance(bylines, dict):
        # A single byline dict was passed instead of a list of them.
        bylines = [bylines]

    has_name = any("name" in bl for bl in bylines)
    has_email = any("email" in bl for bl in bylines)

    if has_name:
        # real name + email, or name only
        return 1 if has_email else 0.8
    # email only, or nothing at all
    return 0 if has_email else -1


# Article length (the norms differ by category and media type)
def content_len_score(contentLength, avg, sd):
    """Grade the article length against a mean and standard deviation.

    Lengths up to ``avg`` score 0. Above that, the score rises with each
    additional half standard deviation, reaching 1 beyond ``avg + 2.5 * sd``.
    """
    if contentLength <= avg:
        return 0

    # (inclusive upper bound, score) pairs, checked in ascending order.
    bands = (
        (avg + 0.5 * sd, 0.165),
        (avg + sd, 0.33),
        (avg + 1.5 * sd, 0.495),
        (avg + 2.0 * sd, 0.66),
        (avg + 2.5 * sd, 0.835),
    )
    for upper, score in bands:
        if contentLength <= upper:
            return score
    return 1


# Number of quotations
def num_quotes_score(numQuotes):
    """Return ``numQuotes / 15`` for counts below 15, otherwise 1."""
    if numQuotes < 15:
        return numQuotes / 15
    return 1


# Title length
def title_len_score(lenTitle):
    """Return 0 when the title length is at most 45, otherwise -1."""
    if lenTitle <= 45:
        return 0
    return -1


# Question marks / exclamation marks in the title
def num_title_puncs_score(numTitlePuncs):
    """Penalize punctuation in the title: 0 for none, -0.5 for one, else -1."""
    if numTitlePuncs == 0:
        return 0
    return -0.5 if numTitlePuncs == 1 else -1


# Number of cited figures
def num_numberts_score(numNumbers, avg, sd):
    """Grade the count of cited numbers against a mean and standard deviation.

    Counts below ``avg`` score 0, below ``avg + 0.5 * sd`` score 0.33, below
    ``avg + sd`` score 0.66, and anything else scores 1.
    """
    # (exclusive upper bound, score) pairs, checked in ascending order.
    bands = (
        (avg, 0),
        (avg + 0.5 * sd, 0.33),
        (avg + sd, 0.66),
    )
    for upper, score in bands:
        if numNumbers < upper:
            return score
    return 1


# Number of images
def image_count_score(imageCount):
    """Score the image count; three images score highest.

    Counts of 1..5 map to 0.33, 0.66, 1, 0.66, 0.33; every other count
    (zero, negative, or six and above) scores 0.
    """
    if imageCount <= 0:
        return 0
    by_count = {1: 0.33, 2: 0.66, 3: 1, 4: 0.66, 5: 0.33}
    return by_count.get(imageCount, 0)


# Average sentence length
def avg_sentence_len_score(avgSentenceLength, avg, sd):
    """Return -1 when the average sentence length reaches ``avg + sd``, else 0."""
    threshold = avg + sd
    if avgSentenceLength >= threshold:
        return -1
    return 0


# Number of adverbs in the title
def title_adverb_count_score(numTitleAdverbs):
    """Penalize adverbs in the title: -0.5 for one, -1 for two or more, else 0."""
    if numTitleAdverbs >= 2:
        return -1
    if numTitleAdverbs == 1:
        return -0.5
    return 0


# Average number of adverbs per sentence
def avg_adverb_cps_score(avgAdverbsPerSentence, avg, sd):
    """Return -1 when the per-sentence adverb average reaches ``avg + 2 * sd``, else 0."""
    threshold = avg + 2 * sd
    if avgAdverbsPerSentence >= threshold:
        return -1
    return 0


# Ratio of quotation length
def quote_percent_score(quotePercent):
    """Penalize a high quotation ratio.

    Values below 0.5 score 0, values below 0.8 score -0.5, anything else -1.
    """
    if quotePercent < 0.5:
        return 0
    return -0.5 if quotePercent < 0.8 else -1


### 기사의 Byline 전처리

In [4]:
# Email-like token: local part, "@", then a domain-like tail.
RE_EMAIL = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+[a-zA-Z0-9-.]+')
# Run of email-local-part characters, optionally followed by "@".
RE_ALPHA_NUMERIC_IDENTIFIER = re.compile(r'[a-zA-Z0-9_.+-]+@?')
# Korean job titles that mark a byline.
RE_JOURNALIST_JOB = re.compile(r'인턴기자|기자|특파원|논설위원|논설실장')
# Broadcaster name, then (after whitespace) the text up to "입"; group 2 is
# stored as the name by BylineExtractor.analyze_tv.
RE_TV_JOURNALIST = re.compile(r'(MBC|SBS|KBS|YTN).*?\s(.+)입')
# YTN variant: group 1 is the text before "[", group 2 the identifier after it.
RE_YTN_JOURNALIST = re.compile(r'YTN.*?\s(.+?)\[([a-zA-Z0-9-]*)')
# Characters that separate several bylines written on one line.
RE_BYLINE_DIV_CHARS = re.compile(r'[·/|\[\]]')

# FIXME: place names should be handled separately.
RE_COMPANY_NAMES = re.compile(r'아시아투데이|한경닷컴|충청일보|중부매일|포항|디지털뉴스국')

# The backslash is written as "\\" so the literal holds no invalid escape
# sequence; the resulting string still contains a single "\" before "]".
S_PUNCTUATION_CHARS = """!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~‘’“”▶▷■▦△▲◇"""
# NOTE(review): "\xeb" is the letter "ë", not a whitespace character --
# confirm whether it is intentional here and in RE_SPACE_CHAR.
S_WHITE_SPACES = " \t\xa0\xeb\u3000\u2028\u2029"

RE_SPACE_CHAR = re.compile('( |\t|\xa0|\xeb|\u3000|\u2028|\u2029)+')

# Find and return the part of text that looks reasonably like an email.
def find_likely_email(text):
    """Return the first RE_EMAIL match in *text*, or None when there is none."""
    found = RE_EMAIL.search(text)
    if found is None:
        return None
    return found.group(0)


# Find and return a word in text made of consecutive letters, digits, "_", "." etc.
# Used when something looks like an email but the "@" is missing (ID only).
def find_alpha_numeric(text):
    """Return the first RE_ALPHA_NUMERIC_IDENTIFIER match in *text*, or None."""
    found = RE_ALPHA_NUMERIC_IDENTIFIER.search(text)
    if found is None:
        return None
    return found.group(0)


# Find and return a Korean job title from which a byline can be inferred
# (the titles listed in RE_JOURNALIST_JOB, e.g. "기자", "특파원").
def find_journalist_job(text):
    """Return the first RE_JOURNALIST_JOB match in *text*, or None."""
    found = RE_JOURNALIST_JOB.search(text)
    if found is None:
        return None
    return found.group(0)


# Could the given text be a byline?
def can_be_byline(text):
    """Return True when *text* contains an email-like token or a job title."""
    return bool(find_likely_email(text) or find_journalist_job(text))


def split_get_left_right(long_text, split_word, nostrip=False):
    """Split *long_text* around the last occurrence of *split_word*.

    Returns:
        ``(left, right)`` with *split_word* itself removed. Both parts are
        stripped unless *nostrip* is true. ``(None, None)`` when *split_word*
        does not occur in *long_text*.
    """
    pos = long_text.rfind(split_word)
    if pos < 0:
        return (None, None)

    before = long_text[:pos]
    after = long_text[pos + len(split_word):]
    if not nostrip:
        before, after = before.strip(), after.strip()
    return (before, after)



def head_to_tail_until_punctuation_char(text):
    """Return the leading part of *text* before its first punctuation character.

    Trailing whitespace of the result is removed. When *text* has no
    punctuation the whole right-stripped text is returned. Empty or None
    input yields None.
    """
    if not text:
        return None
    # Index of the first punctuation character, or len(text) when absent.
    cut = next(
        (pos for pos, ch in enumerate(text) if ch in S_PUNCTUATION_CHARS),
        len(text),
    )
    return text[:cut].rstrip()


def tail_to_head_until_punctuation_char(text):
    """Return the trailing part of *text* after its last punctuation character.

    Leading whitespace of the result is removed. When *text* has no
    punctuation the whole left-stripped text is returned. Empty or None
    input yields None.
    """
    if not text:
        return None
    # Walk backwards to locate the last punctuation character.
    for pos in range(len(text) - 1, -1, -1):
        if text[pos] in S_PUNCTUATION_CHARS:
            return text[pos + 1:].lstrip()
    return text.lstrip()


# "홍길동기자" --> "홍길동 기자"
def normalize_name_with_job(namepart, jobname):
    """Join a reporter name and a job title into a normalized byline name.

    Args:
        namepart: Text found in front of the job title.
        jobname: The job title itself (e.g. "기자").

    Returns:
        The combined name, or None when either argument is empty or when no
        name remains in front of the job title.
    """
    if not namepart or not jobname:
        return None

    # "아시아투데이 박아람 기자" --> "박아람 기자"
    company = RE_COMPANY_NAMES.search(namepart)
    if company:
        namepart = namepart.replace(company.group(0), '').strip()
        if not namepart:
            # Only the outlet name preceded the job title, so there is no
            # reporter name to return.
            return None

    if namepart[-1] in S_WHITE_SPACES:
        # There is already a space in front of the job title, so no special
        # spacing logic is needed.
        namepart = namepart.strip()
        if not namepart:
            return None  # nothing in front of the job title
        return namepart + ' ' + jobname

    if RE_SPACE_CHAR.search(namepart):
        # Something like "홍길동 수습" + "기자": the last word may be part of
        # a new job title, so attach the job without inserting a space.
        return namepart + jobname

    # Something like "홍길동기자": insert the missing space.
    return namepart + ' ' + jobname


class BylineExtractor(object):
    """Pull a reporter name and/or email address out of one byline string.

    The outcome is stored in ``self.components``, a dict that may hold the
    keys ``"name"`` and ``"email"``. The analyzed text is kept in
    ``self.original_text``.
    """

    GENERAL = 1  # handled by analyze()
    TV = 2       # handled by analyze_tv()

    def __init__(self, text, type=GENERAL):
        """Analyze *text* immediately with the strategy selected by *type*."""
        if type == BylineExtractor.TV:
            self.analyze_tv(text)
        else:
            self.analyze(text)

    def analyze(self, text):
        """Extract name/email from a general byline.

        When an email-like token is present it is stored and the name is
        searched around it; otherwise a job title must be present and an
        "@"-less identifier right after it may be accepted as the email.
        """
        self.original_text = text
        self.components = dict()

        maybe_email = find_likely_email(text)
        if maybe_email:
            self.components["email"] = maybe_email
            left, right = split_get_left_right(text, maybe_email)

            # Prefer a job title in front of the email; fall back to one
            # behind it.
            maybe_job = find_journalist_job(left)
            if maybe_job:
                maybe_name = left
                redundant_job = find_journalist_job(right)
            else:
                redundant_job = None
                maybe_job = find_journalist_job(right)
                maybe_name = right if maybe_job else None

            if redundant_job:
                # Doubtful case: job titles on both sides of the email.
                logging.warning("redundant journalist job found in '%s'", text)

            if maybe_name:
                maybe_name, name_right = split_get_left_right(maybe_name, maybe_job, nostrip=True)
                maybe_name = tail_to_head_until_punctuation_char(maybe_name)
                name_right = head_to_tail_until_punctuation_char(name_right)
                name = normalize_name_with_job(maybe_name, maybe_job)
                if name:
                    # Text between the job title and the email is kept as part
                    # of the name (e.g. "증시분석 전문기자 로봇 ET").
                    if name_right and name_right.strip():
                        name = name + name_right
                    self.components["name"] = name
                    return

            # A name usually precedes the email but no job title was usable.
            # Treat the text as a name when it is at most 4 characters long.
            name = tail_to_head_until_punctuation_char(left or right)
            if name:
                if len(name) <= 4:
                    self.components["name"] = name
                elif name[-1] == "팀":
                    # Text ending in "팀" is also accepted (e.g. "산업경제팀").
                    self.components["name"] = name

            return

        # No email (e.g. "홍길동 기자" or "홍길동 기자 hong").
        # A job title such as "기자" must be visible in this case.
        maybe_job = find_journalist_job(text)
        if maybe_job:
            left, right = split_get_left_right(text, maybe_job, nostrip=True)
            maybe_name = tail_to_head_until_punctuation_char(left)
            name = normalize_name_with_job(maybe_name, maybe_job)
            if name:
                self.components["name"] = name
                maybe_email = find_alpha_numeric(right)
                if maybe_email:
                    # Only punctuation or whitespace may sit between the job
                    # title and the email candidate.
                    between = right[:(right.find(maybe_email))]
                    for ch in between:
                        if ch not in S_PUNCTUATION_CHARS and ch not in S_WHITE_SPACES:
                            return
                    # Accepted as an email even though it has no "@".
                    self.components["email"] = maybe_email

    def analyze_tv(self, text):
        """Extract name/email from a TV-style byline using the TV/YTN patterns."""
        # Keep the analyzed text, as analyze() does.
        self.original_text = text
        self.components = dict()
        m1 = RE_TV_JOURNALIST.search(text)
        if m1:
            self.components["name"] = m1.group(2)
            return

        m2 = RE_YTN_JOURNALIST.search(text)
        if m2:
            self.components["name"] = m2.group(1)
            maybe_email = m2.group(2)
            if maybe_email:
                self.components["email"] = maybe_email

    def get_name(self):
        """Return the extracted name, or None when no name was found."""
        return self.components.get("name")

    def get_email(self):
        """Return the extracted email, or None when no email was found."""
        return self.components.get("email")

    def get_component_count(self):
        """Return how many components (name, email) were extracted."""
        return len(self.components)

    def get_result(self):
        """Return the components dict itself (not a copy)."""
        return self.components



# For now, only the case of two or more reporters on one line gets special handling.
class BylineAnalyzer(object):
    """Split a byline line into individual bylines and analyze each of them.

    One BylineExtractor is created per detected byline and kept in
    ``self.extractors``. The *type* argument is forwarded to every extractor.
    """

    def __init__(self, text, type=BylineExtractor.GENERAL):
        self.extractors = list()
        jobs = RE_JOURNALIST_JOB.findall(text)
        if len(jobs) <= 1:
            # The common case: the line seems to hold a single byline.
            self.extractors.append(BylineExtractor(text, type))
            return

        # Look for a divider such as "/" or "·" between two adjacent job
        # titles; `text` is consumed from the left as bylines are cut off.
        for prev_job, next_job in zip(jobs, jobs[1:]):
            left = text.find(prev_job) + len(prev_job)
            right = text.find(next_job, left)
            m = RE_BYLINE_DIV_CHARS.search(text[left:right])
            if not m:
                break
            divpos = left + m.start()
            self.extractors.append(BylineExtractor(text[:divpos], type))
            text = text[divpos + 1:]

        # Whatever remains is the last byline.
        self.extractors.append(BylineExtractor(text, type))

    def get_bylines(self):
        """Return the component dicts of all extractors that found something."""
        return [
            extr.get_result()
            for extr in self.extractors
            if extr.get_component_count() > 0
        ]

### 샘플 기사로 신뢰도 측정

In [5]:
mecab = Mecab()
urls = [
    "https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=103&oid=015&aid=0004611620",
    "https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=103&oid=081&aid=0003219740",
    "https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=103&oid=028&aid=0002562380",
]

# Loop-invariant settings.
DOCKEY_BYLINES = "bylines"
headers = {"user-agent": "Mozilla/5.0"}
tag_list = ['MAG', 'MAJ']  # POS tags counted as adverbs

for url in urls:
    # Fail fast on a hung connection or an HTTP error instead of parsing an
    # error page.
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    body = soup.select_one('#articleBodyContents')
    imgs = body.find_all("img")

    # The 2nd and 3rd lines of the inline script are removed from the body
    # text (presumably because they show up in get_text() -- confirm).
    script = body.find("script", attrs={"type": "text/javascript"}).get_text()
    script_lines = script.split('\n')
    pt1 = script_lines[1]
    pt2 = script_lines[2]
    body = body.get_text()
    body = body.replace(pt1, "")
    body = body.replace(pt2, "")
    title = soup.select_one('#articleTitle').get_text()
    byline = soup.select_one('.b_text').get_text().replace("\n", "").strip()
    body = body.strip()

    # Average sentence length; fragments shorter than 2 characters are ignored.
    sentences = [s for s in body.split('.') if len(s) >= 2]
    sentence_lens = [len(s) for s in sentences]
    # Guard against an empty body so the average does not divide by zero.
    average_sentence_len = sum(sentence_lens) / len(sentence_lens) if sentence_lens else 0

    # Average number of adverbs per sentence.
    cnt_adverb = [
        sum(1 for _, tag in mecab.pos(sentence) if tag in tag_list)
        for sentence in sentences
    ]
    avgAdverbCountPerSentence = sum(cnt_adverb) / len(cnt_adverb) if cnt_adverb else 0

    # Adverbs in the title. Computed in one expression so the value is always
    # defined, even when the title yields no tokens.
    titleAdverbCount = sum(1 for _, tag in mecab.pos(title) if tag in tag_list)

    # "!" and "?" in the title.
    titleNumPuncs = title.count('!') + title.count('?')

    # Quote-related figures can be obtained through an API; the quote count,
    # number count and quote ratio below are hard-coded placeholders.

    # Compute the individual scores.
    by = BylineExtractor(byline)
    doc = {
        # byline_score iterates over a list of byline dicts.
        DOCKEY_BYLINES: [by.get_result()]
    }

    score_byline = byline_score(doc)
    score_contentLength = content_len_score(len(body), 791.92, 660.64)
    score_quoteCount = num_quotes_score(38)
    score_titleLength = title_len_score(len(title))
    score_titlePuncCount = num_title_puncs_score(numTitlePuncs=titleNumPuncs)
    score_numberCount = num_numberts_score(14, 0, 1)
    score_imageCount = image_count_score(imageCount=len(imgs))
    score_avgSentenceLength = avg_sentence_len_score(average_sentence_len, 70, 50)
    score_titleAdverbCount = title_adverb_count_score(numTitleAdverbs=titleAdverbCount)
    score_avgAdverbCountPerSentence = avg_adverb_cps_score(avgAdverbCountPerSentence, 1, 2)
    # NOTE(review): 43.22 looks like a percentage, while quote_percent_score
    # compares against 0.5 / 0.8 -- confirm the intended unit.
    score_quotePercent = quote_percent_score(43.22)

    # Readability
    journal_read = (
        score_byline * 0.001
        + score_contentLength * 0.003
        + score_quoteCount * 0.001
        + score_titleLength * 1
        + score_titlePuncCount * 1.002
        + score_numberCount * 1.354
        + score_imageCount * 1.5
        + score_avgSentenceLength * 1.5
        + score_titleAdverbCount * 2.466
        + score_avgAdverbCountPerSentence * 0.5
    )
    # Transparency
    journal_clear = (
        score_byline * 4.498
        + score_contentLength * 3.003
        + score_quoteCount * 4.5
        + score_titlePuncCount * 3.619
        + score_numberCount * 1.454
        + score_imageCount * 1
        + score_quotePercent * 0.001
    )
    # Factuality
    journal_truth = (
        score_byline * 4.493
        + score_contentLength * 3.503
        + score_quoteCount * 3.501
        + score_titlePuncCount * 0.001
        + score_numberCount * 0.502
        + score_imageCount * 1.5
        + score_titleAdverbCount * 0.5
        + score_avgAdverbCountPerSentence * 1.5
        + score_quotePercent * 1
    )
    # Usefulness
    journal_useful = (
        score_byline * 3.494
        + score_contentLength * 3.498
        + score_quoteCount * 2.001
        + score_numberCount * 1.956
        + score_imageCount * 1
    )
    # Balance
    journal_balance = (
        score_byline * 2.996
        + score_contentLength * 3.002
        + score_quoteCount * 3
        + score_titlePuncCount * 1.501
        + score_titleAdverbCount * 0.501
        + score_avgAdverbCountPerSentence * 1
        + score_quotePercent * 1
    )
    # Diversity
    journal_variety = (
        score_byline * 0.998
        + score_contentLength * 4.994
        + score_quoteCount * 2.501
        + score_titleLength * 0.5
        + score_numberCount * 1.953
        + score_imageCount * 1
        + score_avgSentenceLength * 0.5
        + score_quotePercent * 0.5
    )
    # Originality
    journal_original = (
        score_byline * 4.494
        + score_contentLength * 4.492
        + score_quoteCount * 3.501
        + score_titlePuncCount * 3.09
        + score_numberCount * 1.823
        + score_imageCount * 1.501
    )
    # Importance
    journal_important = (
        score_byline * 2.495
        + score_contentLength * 3.503
        + score_quoteCount * 3.5
        + score_numberCount * 1.002
        + score_imageCount * 0.5
    )
    # Depth
    journal_deep = (
        score_byline * 4.496
        + score_contentLength * 4.995
        + score_quoteCount * 3.501
        + score_numberCount * 1.336
        + score_imageCount * 1
        + score_quotePercent * 1
    )
    # Sensationalism
    journal_yellow = (
        score_byline * 4.491
        + score_titleLength * 3.5
        + score_titlePuncCount * 3.501
        + score_titleAdverbCount * 3.5
        + score_avgAdverbCountPerSentence * 3.5
        + score_quotePercent * 3.5
    )

    journalSum = (
        journal_read + journal_clear + journal_truth + journal_useful + journal_balance
        + journal_variety + journal_original
        + journal_important + journal_deep + journal_yellow
    )

    print(title, "\n score :", journalSum)
    print("-" * 50)

'오징어 게임' 벌써 전세계 8200만명 봤다…10일 연속 1위 
 score : 62.32782999999999
--------------------------------------------------
오징어 게임, 결국 인도 ‘발리우드’ 뚫었다…전세계 1위 
 score : 72.55552
--------------------------------------------------
오징어 게임, 넷플릭스 세계 제패 
 score : 84.12319500000001
--------------------------------------------------
