# LDA Analysis

## Main Research Questions
2021 年 3 月事件 (Atlanta Spa shootings) 發生前後的比較：
- 排名前十個概念 (concept) 是否有顯著不同
- 報導 Anti-Asian Racism 態度上是否不同（中立性、支持 Asian 性、強調 Anti-Asian 對社會上種族不平等的嚴重性）

## Read the data

In [1]:
# Load the preprocessed news articles from CSV and recover each article's token list
import ast
import json
import os

import pandas as pd

df_news = pd.read_csv('../results/preprocessed_news.csv')
# Each cell of 'preprocessed_articles' holds the text form of a Python list of
# tokens (e.g. "['see', 'chinatown', ...]"). Parse it as a Python literal
# instead of using strip()/split(): str.strip() removes a *set of characters*,
# so it can eat quotes or brackets that belong to the first/last token, a
# token quoted with double quotes would not be split on "', '", and an empty
# list "[]" would become [''] instead of [].
# NOTE(review): assumes every cell is a valid list literal -- confirm against
# the preprocessing step that wrote the CSV.
preprocessed_articles = [ast.literal_eval(article) for article in df_news['preprocessed_articles'].tolist()]
df_news.head()

Unnamed: 0,id,contents,source,date,preprocessed_articles
0,0,7/1/2021\n https://www-proquest-com.falcon.lib...,Boston Globe,202004,"['see', 'chinatown', 'restaurateur', 'face', '..."
1,1,7/2/2021\n https://www-proquest-com.falcon.lib...,Boston Globe,202004,"['racist', 'zoom', 'bombing', 'invades', 'clas..."
2,10,Find a copy\n Abstract\n document 1 of 1\n Ful...,Boston Globe,202103,"['asian_american', 'attack', 'hit', 'struggle'..."
3,100,6/24/2021\n https://www-proquest-com.falcon.li...,Others,202003,"['world', 'human', 'right', 'dimension', 'covi..."
4,101,6/24/2021\n https://www-proquest-com.falcon.li...,Others,202003,"['trump', 'brush', 'remark', 'claim', 'asian_a..."


## Create TF-IDF corpus

In [2]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

In [3]:
# Build the vocabulary from the tokenised articles, then prune it with
# filter_extremes (no_below=7 documents, no_above=30% of documents).
dictionary = Dictionary(documents=preprocessed_articles)
dictionary.filter_extremes(no_above=0.3, no_below=7)
# Bag-of-words representation of every article.
corpus = list(map(dictionary.doc2bow, preprocessed_articles))

# Fit a TF-IDF model on the bag-of-words corpus and apply it to that corpus.
# NOTE(review): the LDA training call visible in this notebook is given
# `corpus`, not `tfidf_corpus` -- confirm which one is intended.
tfidf = TfidfModel(corpus=corpus)
tfidf_corpus = tfidf[corpus]

## Train LDA model


用不同的主題數 (num_topics) 訓練 LDA，並選擇一致性 (coherence) 最高的模型

In [4]:
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pandas as pd

def compute_coherence_values(dictionary, corpus, texts, limit, start=5, step=1):
    """Train one LDA model per topic count and score each with u_mass coherence.

    Args:
        dictionary: gensim dictionary, used as ``id2word`` for the models and
            passed to the coherence scorer.
        corpus: corpus the LDA models are trained on.
        texts: tokenised documents handed to ``CoherenceModel``.
        limit: exclusive upper bound of the topic-count range.
        start: first topic count to try.
        step: increment between consecutive topic counts.

    Returns:
        A ``(model_list, coherence_values)`` pair of parallel lists, one entry
        per topic count in ``range(start, limit, step)``. Both lists are empty
        when that range is empty.
    """
    # Training settings shared by every candidate model; only the number of
    # topics changes between runs. The fixed random_state seeds each model.
    lda_settings = dict(
        id2word=dictionary,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
    )
    model_list, coherence_values = [], []
    for topic_count in range(start, limit, step):
        lda = LdaModel(corpus=corpus, num_topics=topic_count, **lda_settings)
        model_list.append(lda)
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='u_mass')
        coherence_values.append(scorer.get_coherence())
    return model_list, coherence_values

In [5]:
# Train and score LDA models for 10 to 14 topics (limit=15 is exclusive).
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus,
    texts=preprocessed_articles,
    start=10,
    limit=15,
    step=1,
)
# Keep the model with the highest coherence score (the first one on ties).
best_index = max(range(len(coherence_values)), key=coherence_values.__getitem__)
best_model = model_list[best_index]

輸出各主題的前 20 名關鍵字

In [6]:
# How many top keywords to keep per topic
num_keywords = 20
# (topic id, [(word, weight), ...]) pairs for every topic of the selected model
topics = best_model.show_topics(num_topics=-1, num_words=num_keywords, formatted=False)
# Map each topic id to its keyword list, dropping the weights
keywords = {
    topic_id: [word for word, _ in word_weights]
    for topic_id, word_weights in topics
}

# Write the keyword lists to disk as JSON
with open(f'../results/keywords_time.json', 'w') as json_file:
    json.dump(keywords, json_file)
    

## Select the dominant topic for each document

對每篇文章選出最主要的主題。

選擇方法：每篇文章都有不同主題的機率分配，我們於此選出機率最高的主題。

In [7]:
# Infer the topic distribution of every document with the selected model.
topics = best_model[corpus]
# Pair each article id with its inferred distribution. Walking the id column
# and the distributions together with zip() avoids a per-row .iloc lookup and
# no longer rebinds the builtin `id` as a module-level counter.
d = [
    {"id": article_id, "odd": topic}
    for article_id, topic in zip(df_news['id'].tolist(), topics)
]
df = pd.DataFrame(d)
# Persist the per-document distributions.
df.to_csv(f"../results/odd_media.csv", index=False)

In [8]:
def find_max_category(odd_list):
    """Return the category with the highest score in ``odd_list``.

    Args:
        odd_list: iterable of ``(category, score)`` pairs.

    Returns:
        The first element of the pair whose second element is largest; the
        earliest such pair wins ties. ``None`` when ``odd_list`` is empty.
    """
    # max() replaces a hand-rolled scan that started from a -1 sentinel and
    # therefore returned None whenever every score was <= -1.
    best = max(odd_list, key=lambda item: item[1], default=None)
    return None if best is None else best[0]

# Label every document with its dominant topic in a new 'class' column
dominant_class = df['odd'].apply(find_max_category)
df['class'] = dominant_class
# Number of documents per dominant topic
class_counts = df['class'].value_counts()
# Number of documents whose dominant topic is 7 (0 if there are none)
class_7_count = class_counts.get(7, 0)

In [9]:
# Parse the 'date' column with the year+month format '%Y%m'.
df_news['date'] = pd.to_datetime(df_news['date'], format='%Y%m')

# Split the articles at 2021-03-01: strictly earlier vs. on or after that date.
published_before = df_news['date'] < '2021-03-01'
published_from = df_news['date'] >= '2021-03-01'
df_news_before_202103 = df_news[published_before]
df_news_after_202103 = df_news[published_from]

In [10]:
# Attach the dominant-topic label to the articles published before the split
# date (inner join on the article id), then count documents per topic.
class_news_before_202103 = df.merge(
    df_news_before_202103,
    how='inner',
    on='id',
    suffixes=('_df1', '_df2'),
)
class_news_before_202103_counts = class_news_before_202103['class'].value_counts()

In [11]:
# Attach the dominant-topic label to the articles published on or after the
# split date (inner join on the article id), then count documents per topic.
class_news_after_202103 = df.merge(
    df_news_after_202103,
    how='inner',
    on='id',
    suffixes=('_df1', '_df2'),
)
class_news_after_202103_counts = class_news_after_202103['class'].value_counts()

In [12]:
# Distinct dominant topics that occur in the data
class_list = df['class'].unique().tolist()
# Number of articles in each period
n_before = len(df_news_before_202103)
n_after = len(df_news_after_202103)
# For every topic, in ascending topic order, build a (topic, ratio) pair where
# ratio = (documents of that topic in the period)
#         / (documents of that topic overall) / (articles in the period).
ordered_classes = sorted(class_list)
r_news_before_202103 = [
    (c, class_news_before_202103_counts.get(c, 0) / class_counts.get(c, 1) / n_before)
    for c in ordered_classes
]
r_news_after_202103 = [
    (c, class_news_after_202103_counts.get(c, 0) / class_counts.get(c, 1) / n_after)
    for c in ordered_classes
]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Unpack the (class, ratio) pairs into x positions and bar heights.
x1, y1 = zip(*r_news_before_202103)
x2, y2 = zip(*r_news_after_202103)
fig, ax = plt.subplots(figsize=(10, 5))
# Draw both periods side by side: each series is shifted by 0.1 around the class id.
ax.bar(np.array(x1) - 0.1, y1, width=0.2, color='blue', alpha=0.5, label='before_202103')
ax.bar(np.array(x2) + 0.1, y2, width=0.2, color='red', alpha=0.5, label='after_202103')
ax.set_title('Bar Chart')
ax.set_xlabel('Class')
ax.set_ylabel('Value')
# Put the ticks on the class ids the bars are drawn at. np.arange(len(x1))
# only lines up with the bars when the ids are exactly 0..n-1; a topic that is
# never dominant leaves a gap in the ids and would misplace the ticks.
ax.set_xticks(x1)
ax.legend()
plt.savefig('../results/bar_chart_time.png', bbox_inches='tight')
plt.show()


## Select the dominant documents for each topic

對於每個主題，選擇最具代表性的兩篇文章，並儲存

In [None]:
# For every document keep only its most probable (topic id, probability) pair.
# The distributions are read from best_model[corpus] directly instead of from
# the `topics` variable: this cell rebinds `topics` to the reduced list, so
# reading it back would fail on a re-run, and `topics` is also assigned the
# show_topics() output earlier in the notebook.
topics = [max(doc_topics, key=lambda x: x[1]) for doc_topics in best_model[corpus]]
# One row per document: its dominant topic and that topic's probability
topics_df = pd.DataFrame(topics, columns=['dominant topic', 'prob'])
# doc_id is the row position of the document in the corpus (the DataFrame's
# default index), which is not the same thing as the article `id` column.
topics_df['doc_id'] = topics_df.index
# Keep the 2 highest-probability documents of each topic
best_docs = topics_df.groupby('dominant topic').apply(lambda x: x.nlargest(2, 'prob')).reset_index(drop=True)
# Save the best documents
best_docs.to_csv(f'../results/best_docs_time.csv', index=False)