# News Sentiment Information

Display a sentiment polarity pie chart and a trend chart.

# Load Data

In [84]:
import pandas as pd

In [85]:
# Load the pipe-delimited news dataset. `df` is kept as a module-level variable
# because filter_df_via_content() reads it directly instead of taking it as an argument.
df = pd.read_csv('./news_for_django.csv',sep='|')

In [86]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,item_id,title,category,content,link,date,photo_link,tokens_v2,top_key_freq,summary,sentiment
0,0,_20250327_1,台股重挫308點 失守22000關卡,焦點,美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日...,https://tw.news.yahoo.com/https://tw.stock.yah...,2025-03-27,https://s.yimg.com/ny/api/res/1.2/3qEveVGKp070...,"['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...","[('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...",暫無,0.36


In [87]:
df.shape

(216, 12)

In [88]:
df.date.max()

'2025-03-27'

In [89]:
df.date.min()

'2025-03-23'

# Step 1: Filter news articles using the following function

In [90]:
from datetime import datetime, timedelta

In [91]:
from datetime import datetime, timedelta
# Search for keywords in the "content" column.
# This function uses the df.content column, while filter_dataFrame() uses df.tokens_v2.
# This function was previously named filter_dataFrame_fullText(). 此函數名稱之前是用filter_dataFrame_fullText()
def filter_df_via_content(query_keywords, cond, cate, weeks):
    """Filter the module-level news DataFrame ``df`` by period, category and keywords.

    Keywords are matched as plain substrings of the ``content`` column.

    Args:
        query_keywords: Iterable of keyword strings to look for.
        cond: ``'and'`` keeps rows containing every keyword, ``'or'`` keeps rows
            containing at least one. Any other value skips keyword filtering.
        cate: Category to keep; the special value ``"全部"`` disables the
            category filter.
        weeks: Length of the look-back window, in weeks, counted back from
            the latest date found in ``df``.

    Returns:
        A DataFrame holding the rows of ``df`` that satisfy all conditions.
    """
    # With no rows, df.date.max() is NaN and strptime() below would fail.
    if df.empty:
        return df.copy()

    # The window ends at the newest article date in the data set.
    end_date = df.date.max()

    # Dates are 'YYYY-MM-DD' strings, so they can be compared as strings.
    window_start = (
        datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)
    ).strftime('%Y-%m-%d')
    # Never start before the earliest article: take the later of the two dates.
    start_date = max(window_start, df.date.min())

    # (1) period condition
    condition = (df.date >= start_date) & (df.date <= end_date)

    # (2) category condition ("全部" needs no category filtering)
    if cate != "全部":
        condition = condition & (df.category == cate)

    # (3) keyword condition
    if cond == 'and':
        combine = all
    elif cond == 'or':
        combine = any
    else:
        combine = None  # unrecognised cond: no keyword filtering

    if combine is not None:
        def contains_keywords(text):
            # A missing content value is not a str; treat it as containing
            # no keyword instead of raising TypeError on `qk in text`.
            return combine(
                isinstance(text, str) and qk in text for qk in query_keywords
            )

        condition = condition & df.content.apply(contains_keywords)

    # condition is a boolean mask aligned with df
    return df[condition]


# Step 2: Sentimental polarity score計算整體情緒分數(影響力)

Count the number of articles with positive, negative, and neutral (objective) polarity.

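Sentiment scores range from 0 to 1: a score ≥ 0.6 is positive, a score ≤ 0.4 is negative, and anything in between is neutral.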

In [92]:
# sentimental polarity score
def get_article_sentiment(df_query):
    """Tally articles by sentiment polarity and give each tally as a whole percentage.

    A score ``>= 0.6`` counts as positive (``'pos'``), ``<= 0.4`` as negative
    (``'neg'``) and everything else as neutral/objective (``'obj'``).

    Args:
        df_query: DataFrame with a numeric ``sentiment`` column.

    Returns:
        A tuple ``(sentiCount, sentiPercnt)`` of dicts keyed by ``'pos'``,
        ``'neg'`` and ``'obj'``. ``sentiCount`` holds article counts;
        ``sentiPercnt`` holds percentages rounded down to whole numbers
        (all zeros when ``df_query`` has no rows).
    """
    sentiCount = {'pos': 0, 'neg': 0, 'obj': 0}  # number of articles
    for senti in df_query.sentiment:
        if senti >= 0.6:
            sentiCount['pos'] += 1
        elif senti <= 0.4:
            sentiCount['neg'] += 1
        else:
            # In-between scores; a NaN score also lands here because both
            # comparisons above evaluate to False for it.
            sentiCount['obj'] += 1

    numberOfArticle = len(df_query)
    if numberOfArticle == 0:
        # No articles: report 0% rather than dividing by zero.
        sentiPercnt = dict.fromkeys(sentiCount, 0)
    else:
        # Integer arithmetic avoids float truncation errors such as
        # int(29 / 100 * 100) == 28.
        sentiPercnt = {
            polar: count * 100 // numberOfArticle
            for polar, count in sentiCount.items()
        }
    return sentiCount, sentiPercnt


In [93]:
# Example query: one keyword, 'or' matching, a 12-week look-back window,
# and the "全部" category value, which disables category filtering.
query_keywords = ['黃珊珊']
cond='or'
weeks=12
cate='全部'
df_query = filter_df_via_content(query_keywords, cond, cate, weeks)
# Counts and whole-number percentages per polarity for the matched articles.
sentiCount, sentiPercnt = get_article_sentiment(df_query)

In [94]:
sentiCount

{'pos': 0, 'neg': 0, 'obj': 0}

In [95]:
sentiPercnt

{'pos': 0, 'neg': 0, 'obj': 0}

In [96]:
'正向:{}%, 中立:{}%, 負向:{}%'.format(str(sentiPercnt['pos']), str(sentiPercnt['obj']),str(sentiPercnt['neg']))

'正向:0%, 中立:0%, 負向:0%'

In [97]:
[sentiPercnt[p] for p in ['pos', 'obj', 'neg']]

[0, 0, 0]

# Step 3: Count positive and negative articles and get line chart data

Count the number of articles with positive and negative sentiment polarity.

This is the data for drawing daily-basis line chart on homepage.

    The data format for the line chart is as follows.

    [{'x': '2020-03-05', 'y': 1},
    {'x': '2020-03-06', 'y': 0},
    {'x': '2020-03-07', 'y': 0},
    {'x': '2020-03-08', 'y': 0},
    {'x': '2020-03-09', 'y': 0},
    {'x': '2020-03-10', 'y': 5},
    {'x': '2020-03-11', 'y': 5},
    {'x': '2020-03-12', 'y': 7},
    {'x': '2020-03-13', 'y': 62},
    {'x': '2020-03-14', 'y': 29}]

In [98]:
def get_daily_basis_sentiment_count(df_query, sentiment_type='pos', freq_type='D'):
    """Count articles of one sentiment polarity per time bucket, for a line chart.

    The sentiment score is a probability between 0 and 1. A score ``>= 0.6``
    is positive, ``<= 0.4`` is negative, and a score strictly between 0.4 and
    0.6 is neutral.

    Args:
        df_query: DataFrame with a ``date`` column that ``pd.to_datetime`` can
            parse and a numeric ``sentiment`` column.
        sentiment_type: ``'pos'``, ``'neg'`` or ``'neutral'``.
        freq_type: Frequency alias handed to ``pd.Grouper`` (for example
            ``'D'`` for daily or ``'W'`` for weekly buckets).

    Returns:
        A list of ``{'x': 'YYYY-MM-DD', 'y': count}`` dicts, one per bucket,
        or ``None`` when ``sentiment_type`` is not one of the accepted values.
    """
    if sentiment_type == 'pos':
        is_match = lambda senti: senti >= 0.6
    elif sentiment_type == 'neg':
        is_match = lambda senti: senti <= 0.4
    elif sentiment_type == 'neutral':
        # Chained comparison. The previous `senti > 0.4 & senti < 0.4` applied
        # the bitwise `&` to floats (TypeError) and could never be true anyway.
        is_match = lambda senti: 0.4 < senti < 0.6
    else:
        return None

    # One row per article: its date and a 1/0 flag for the requested polarity.
    freq_df = pd.DataFrame({
        'date_index': pd.to_datetime(df_query.date),
        'frequency': [1 if is_match(senti) else 0 for senti in df_query.sentiment],
    })

    # Summing the flags per bucket gives the article count for that bucket;
    # reset_index() turns the 'date_index' index back into a regular column.
    freq_df_group = (
        freq_df.groupby(pd.Grouper(key='date_index', freq=freq_type))
        .sum()
        .reset_index()
    )

    # x/y points for the trend line. int() converts numpy integers to plain
    # Python ints so the result can be serialized by the standard json encoder.
    xy_line_data = [
        {'x': date.strftime('%Y-%m-%d'), 'y': int(freq)}
        for date, freq in zip(freq_df_group.date_index, freq_df_group.frequency)
    ]
    return xy_line_data

In [99]:
# Re-run the example query (one keyword, 'or' matching, 12 weeks, all categories)
# so that df_query is available for the line-chart cells that follow.
query_keywords = ['黃珊珊']
cond='or'
weeks=12
cate='全部'
df_query = filter_df_via_content(query_keywords, cond, cate, weeks)

In [100]:
df_query.shape

(0, 12)

In [101]:
get_daily_basis_sentiment_count(df_query, sentiment_type='pos')

[]

In [102]:
get_daily_basis_sentiment_count(df_query, sentiment_type='neg')

[]

## How does this work?

In [103]:
# This is what we used to calculate daily frequency of keyword in our previous app
pd.DataFrame({'date_index':pd.to_datetime( df_query.date ),'frequency':[1 for _ in range(len(df_query))]})

Unnamed: 0,date_index,frequency


In [104]:
# Now we need to modify the above line as follows.
# Use a lambda to determine whether an article is positive or not.
pd.DataFrame({'date_index':pd.to_datetime( df_query.date ),'frequency':[ (lambda senti: 1 if senti >= 0.6 else 0)(senti) for senti in df_query.sentiment]})

Unnamed: 0,date_index,frequency


In [105]:
# What is the following lambda function? It is a little bit hard to understand.
[ (lambda senti: 1 if senti >= 0.6 else 0)(senti) for senti in df_query.sentiment  ]

[]

In [106]:
lambda x: 1 if x >= 0.6 else 0

<function __main__.<lambda>(x)>

In [107]:
(lambda x: 1 if x >= 0.6 else 0)(0.9)  #  f(0.9)   f: lambda x: 1 if x >= 0.6  def f:

1

In [108]:
(lambda x: 1 if x >= 0.6 else 0)(0.2)

0

In [109]:
pd.to_datetime( df_query.date )

Series([], Name: date, dtype: datetime64[ns])

In [110]:
{'date_index':pd.to_datetime( df_query.date ),'frequency':[ (lambda senti: 1 if senti >= 0.7 else 0)(senti) for senti in df_query.sentiment]}

{'date_index': Series([], Name: date, dtype: datetime64[ns]), 'frequency': []}

In [111]:
pd.DataFrame({'date_index':pd.to_datetime( df_query.date ),'frequency':[ (lambda senti: 1 if senti >= 0.7 else 0)(senti) for senti in df_query.sentiment]})

Unnamed: 0,date_index,frequency


In [112]:
# A dict with the keys 'date_index' and 'frequency' ==> a dict consists of several (key, value) pairs.
{'date_index':pd.to_datetime( df_query.date ),'frequency':[ (lambda senti: 1 if senti >= 0.6 else 0)(senti) for senti in df_query.sentiment]}

{'date_index': Series([], Name: date, dtype: datetime64[ns]), 'frequency': []}

In [113]:
# Convert the above dict into DataFrame and assign it to query_freq
freq_df = pd.DataFrame({'date_index':pd.to_datetime( df_query.date ),'frequency':[ (lambda senti: 1 if senti >= 0.6 else 0)(senti) for senti in df_query.sentiment]})

In [114]:
freq_df

Unnamed: 0,date_index,frequency


In [115]:
# Group rows by the date to the daily number of articles. 加總合併同一天新聞，篇數就被計算好了
freq_df_group = freq_df.groupby(pd.Grouper(key='date_index',freq='D')).sum()

In [116]:
freq_df_group

Unnamed: 0_level_0,frequency
date_index,Unnamed: 1_level_1


In [117]:
# 'date_index'為index(索引)，將其變成欄位名稱，inplace=True表示原始檔案會被異動
freq_df_group.reset_index(inplace=True)
#freq_df_group = freq_df_group.reset_index() # 這樣也可以得到同樣結果


In [118]:
freq_df_group

Unnamed: 0,date_index,frequency


In [119]:
list(zip(freq_df_group.date_index,freq_df_group.frequency))

[]

In [120]:

# 有時間變數x,y，用於畫趨勢線圖
line_xy_data = [{'x':date.strftime('%Y-%m-%d'),'y':freq} for date, freq in zip(freq_df_group.date_index,freq_df_group.frequency)]

In [121]:
line_xy_data

[]

# Step 4: Prepare the response data 

In [122]:
def prepare_for_response(query_keywords, cond, cate, weeks):
    """Build the response payload for one sentiment query.

    Filters the news with ``filter_df_via_content``, tallies polarity counts
    with ``get_article_sentiment`` and collects positive and negative
    line-chart series with ``get_daily_basis_sentiment_count``.

    Args:
        query_keywords: Keywords forwarded to the filter.
        cond: Keyword combination mode forwarded to the filter.
        cate: News category forwarded to the filter.
        weeks: Look-back window in weeks; it also selects the chart
            granularity (daily up to 4 weeks, weekly beyond that).

    Returns:
        A dict with the keys ``'sentiCount'``, ``'data_pos'`` and ``'data_neg'``.
    """
    matched = filter_df_via_content(query_keywords, cond, cate, weeks)

    # Only the counts are part of the response; the percentages are not used here.
    counts, _percentages = get_article_sentiment(matched)

    # Short windows are charted per day, longer ones per week.
    bucket = 'D' if weeks <= 4 else 'W'

    series = {
        polarity: get_daily_basis_sentiment_count(
            matched, sentiment_type=polarity, freq_type=bucket
        )
        for polarity in ('pos', 'neg')
    }

    return {
        'sentiCount': counts,
        'data_pos': series['pos'],
        'data_neg': series['neg'],
    }

In [None]:
# End-to-end example: build the full response dict for the same query.
# weeks=12 is greater than 4, so prepare_for_response() uses weekly buckets.
query_keywords = ['黃珊珊']
cond='or'
weeks=12
cate='全部'
prepare_for_response(query_keywords, cond, cate, weeks)

{'sentiCount': {'pos': 0, 'neg': 0, 'obj': 0}, 'data_pos': [], 'data_neg': []}

: 

# Step 5: Django views.py 請看示範專案APP 重複的模組要跟別人借用
    (1)app name: app_userkey_sentiment
    (2)namespace defined in urls.py: 
        app_name="namespace_userkey_sentiment"  
        or app_name="app_userkey_sentiment"
    (3) home.html


