# Text Analysis

### Import packages

In [1]:
import pandas as pd
import nltk
from collections import defaultdict
from tqdm import tqdm
from nltk.tokenize.stanford import CoreNLPTokenizer
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
import numpy as np

### Prerequisite: a Stanford CoreNLP server running at http://localhost:9001

In [2]:
# Address of the CoreNLP server that must already be running (see the prerequisite above).
CORENLP_SERVER_URL = 'http://localhost:9001'
tokenizer = CoreNLPTokenizer(CORENLP_SERVER_URL)

### Read data

In [3]:
# Load the broadcast content/title table; the CSV is read with the GB18030 encoding.
content_df = pd.read_csv(
    '../data/1_xwlb_content_title_daily.csv',
    encoding='gb18030',
)

### Add columns that label the day of the week and whether the date of broadcast was a weekday or a weekend

In [4]:
# Parse the date strings (YYYY-MM-DD) into datetimes.
content_df['date'] = pd.to_datetime(content_df['date'], format='%Y-%m-%d')
# Weekday number as reported by pandas: Monday=0 ... Sunday=6.
content_df['day_of_week'] = content_df['date'].dt.weekday
# Saturday (5) and Sunday (6) are the only values greater than 4.
content_df['is_weekend'] = content_df['day_of_week'] > 4
content_df.head()

Unnamed: 0,content,date,title,day_of_week,is_weekend
0,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...,2016-01-01,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...,4,False
1,中国人民解放军陆军领导机构、中国人民解放军火箭军、中国人民解放军战略支援部队成立大会2015...,2016-01-01,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...,4,False
2,下午4时，成立大会开始，全场高唱国歌。仪仗礼兵护卫着鲜艳军旗，正步行进到主席台前。习近平将军...,2016-01-01,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...,4,False
3,授旗仪式后，习近平致训词。他指出：“成立陆军领导机构、火箭军、战略支援部队，是党中央和中央军...,2016-01-01,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...,4,False
4,习近平强调，陆军是党最早建立和领导的武装力量，历史悠久，敢打善战，战功卓著，为党和人民建立了...,2016-01-01,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...,4,False


### Separate the data frame into weekend_df and weekday_df

In [5]:
# Split the rows by the is_weekend flag:
#   weekend_df -> rows broadcast on a weekend
#   weekday_df -> all remaining rows
weekend_mask = content_df['is_weekend']
weekend_df = content_df.loc[weekend_mask]
weekday_df = content_df.loc[~weekend_mask]
weekend_df.head()

Unnamed: 0,content,date,title,day_of_week,is_weekend
54,【年终特稿】依法治国按下“快进键”,2016-01-02,【年终特稿】依法治国按下“快进键”,5,True
55,,2016-01-02,,5,True
56,2015年是“全面推进依法治国”的开局之年，习近平总书记提出“要全面推进依法治国，更好维护人...,2016-01-02,,5,True
57,2015年12月29号，一南一北两起大案的同日开庭，在年末再次把人们的目光凝聚到司法与公正。...,2016-01-02,,5,True
58,“让人民群众在每一起司法案件中感受到公平与正义”，这句话，在过去的这一年，前所未有地广为人知...,2016-01-02,,5,True


### Tokenize each article (all paragraphs sharing a title and date, concatenated)

In [6]:
# Use the nltk tokenizer to tokenize corpus

def tokenize_to_corpus(df):
    '''
    Build a corpus with one tokenized, space-joined string per article.

    Rows are grouped by (title, date). Within each group the non-null
    ``content`` values are concatenated, in row order, into one passage.
    Passages of 10 characters or fewer are skipped. Every remaining passage
    is tokenized with the module-level ``tokenizer`` and its tokens are
    joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``title``, ``date`` and ``content`` columns.

    Returns
    -------
    list of str
        One space-separated token string per retained article, in the order
        the groups are produced by ``groupby``.
    '''
    corpus = []
    # NOTE(review): groupby excludes rows whose title or date is missing by
    # default, so such rows never reach the corpus -- confirm this is intended.
    for _, title_df in tqdm(df.groupby(['title', 'date'])):
        # Only a missing *content* value should exclude a row; a frame-wide
        # dropna() would also discard rows with a NaN in any other column.
        passage = ''.join(title_df['content'].dropna().astype(str))
        if len(passage) > 10:
            corpus.append(' '.join(tokenizer.tokenize(passage)))
    return corpus

# Tokenize weekday articles first, then weekend articles, and pool both
# lists (weekday entries come first) into the all-days corpus.
corpus_weekday, corpus_weekend = (
    tokenize_to_corpus(subset_df) for subset_df in (weekday_df, weekend_df)
)
corpus_all = corpus_weekday + corpus_weekend

100%|█████████▉| 11506/11532 [12:50<00:01, 14.94it/s]
100%|█████████▉| 4565/4575 [05:18<00:00, 14.33it/s]


### Define and save the corpora

In [7]:
# Persist each corpus to its own pickle file so tokenization need not be repeated.
corpora_to_save = [
    ('../data/corpus_all.pickle', corpus_all),
    ('../data/corpus_weekday.pickle', corpus_weekday),
    ('../data/corpus_weekend.pickle', corpus_weekend),
]
for pickle_path, corpus_to_save in corpora_to_save:
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(corpus_to_save, pickle_file)

### Read from saved pickle file

In [8]:
# Reload the three corpora from the pickle files written by the previous cell.
# pickle.load must only be used on trusted files such as these.
loaded_corpora = []
for pickle_path in (
    '../data/corpus_all.pickle',
    '../data/corpus_weekday.pickle',
    '../data/corpus_weekend.pickle',
):
    with open(pickle_path, 'rb') as pickle_file:
        loaded_corpora.append(pickle.load(pickle_file))
corpus_all, corpus_weekday, corpus_weekend = loaded_corpora

### Write a function to do the following tasks
1. Vectorize the corpus into a word-count matrix;
2. Decompose that matrix with NMF (which yields a document-topic matrix) and collect the top words of each topic (thanks to Fiona's lecture notes)

In [9]:
def model_topics(corpus, num_topics=10, num_top_words=20):
    '''
    Fit an NMF topic model on raw word counts and list each topic's top words.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize, e.g. the space-joined token strings produced
        by ``tokenize_to_corpus``.
    num_topics : int, default 10
        Number of NMF components (topics) to extract.
    num_top_words : int, default 20
        Number of highest-weighted words to report per topic.

    Returns
    -------
    clf : sklearn.decomposition.NMF
        The fitted NMF model.
    topic_words : list of list of str
        One list per topic, with words ordered from highest to lowest weight.
    '''
    # Map the corpus to a document-by-word frequency matrix.
    # NOTE: with CountVectorizer's default token_pattern only tokens of two or
    # more word characters are counted, so single-character words are ignored.
    vectorizer = CountVectorizer()
    word_counts = vectorizer.fit_transform(corpus)

    # Decompose the frequency matrix; random_state is fixed for reproducibility.
    clf = NMF(n_components=num_topics, random_state=1)
    clf.fit(word_counts)

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocab = vectorizer.get_feature_names_out()
    else:
        vocab = vectorizer.get_feature_names()

    # For every topic, take the indices of the largest weights (descending)
    # and translate them back into vocabulary words.
    topic_words = []
    for topic in clf.components_:
        top_idx = np.argsort(topic)[::-1][:num_top_words]
        topic_words.append([str(vocab[i]) for i in top_idx])

    return clf, topic_words

# Weekend and weekday corpora: 5 topics with 10 words each.
weekend_clf, weekend_topic_words = model_topics(
    corpus_weekend, num_topics=5, num_top_words=10
)
weekday_clf, weekday_topic_words = model_topics(
    corpus_weekday, num_topics=5, num_top_words=10
)
# Pooled corpus: the function defaults (10 topics, 20 words each).
all_clf, all_topic_words = model_topics(corpus_all)

### Create the data frames for weekend and weekday topics and output them as csv

In [10]:
# Transpose so that each column is a topic and each row a word rank,
# then export each table as a GB18030-encoded CSV without the index.
weekend_topics_df = pd.DataFrame(weekend_topic_words).T
weekend_topics_df.to_csv(
    '../results/result3_xwlb_daily_weekend_topics.csv',
    encoding='gb18030',
    index=False,
)

weekday_topics_df = pd.DataFrame(weekday_topic_words).T
weekday_topics_df.to_csv(
    '../results/result4_xwlb_daily_weekday_topics.csv',
    encoding='gb18030',
    index=False,
)

In [11]:
# Display the weekday topic table (one column per topic, one row per word rank).
weekday_topics_df

Unnamed: 0,0,1,2,3,4
0,中国,合作,改革,中央,会议
1,人民,发展,发展,工作,人大
2,习近平,关系,经济,政治,代表
3,发展,习近平,创新,习近平,常委会
4,我们,共同,推进,领导,关于
5,国家,中方,企业,坚持,审议
6,世界,加强,建设,干部,草案
7,伟大,领域,会议,重要,报告
8,社会,双方,工作,全面,工作
9,社会主义,国家,深化,问题,委员会


Let's look at the words and see if they form any reasonable topics.

In [12]:
# Hand-written English glosses of the weekday topic words; each key is an
# interpretive label chosen for one topic.
weekday_topic_glosses = {
    '#WeAreTheBest': [
        'China', 'people', 'Xi Jinping', 'development', 'we',
        'country', 'world', 'great', 'society', 'socialism',
    ],
    'intl_cooperation': [
        'cooperation', 'development', 'relations', 'Xi Jinping', 'together',
        'the Chinese side', 'strengthen', 'field', 'bilateral', 'country',
    ],
    'reform': [
        'reform', 'development', 'economics', 'innovation', 'push forward',
        'corporation', 'construction', 'conference', 'work', 'deepen',
    ],
    'party_discipline': [
        'central', 'work', 'politics', 'Xi Jinping', 'leadership',
        'persist', 'cadre', 'important', 'comprehensive', 'issue',
    ],
    'Natl_Peoples_Congress': [
        'conference', 'Peoples Congress', 'representative', 'standing committee', 'about',
        'deliberation', 'draft', 'report', 'work', 'committee',
    ],
}
weekday_topics_eng = pd.DataFrame(weekday_topic_glosses)
weekday_topics_eng.to_csv('../results/result5_xwlb_daily_weekday_topics_eng.csv', index=False)
weekday_topics_eng

Unnamed: 0,#WeAreTheBest,Natl_Peoples_Congress,intl_cooperation,party_discipline,reform
0,China,conference,cooperation,central,reform
1,people,Peoples Congress,development,work,development
2,Xi Jinping,representative,relations,politics,economics
3,development,standing committee,Xi Jinping,Xi Jinping,innovation
4,we,about,together,leadership,push forward
5,country,deliberation,the Chinese side,persist,corporation
6,world,draft,strengthen,cadre,construction
7,great,report,field,important,conference
8,society,work,bilateral,comprehensive,work
9,socialism,committee,country,issue,deepen


Comment: The words hint at five generic topics. As we will see later, they differ little from the topics found in the data for all days.

In [13]:
# Display the weekend topic table (one column per topic, one row per word rank).
weekend_topics_df

Unnamed: 0,0,1,2,3,4
0,中央,合作,发展,中国,贫困
1,工作,发展,经济,世界,脱贫
2,习近平,关系,金融,国家,扶贫
3,宗教,习近平,改革,国际,地区
4,政治,中方,创新,人民,组织
5,全面,国家,建设,发展,攻坚
6,会议,国际,社会,我们,群众
7,问题,共同,健康,经济,深度
8,精神,组织,推进,全球,叙利亚
9,人民,双方,服务,习近平,精准


Let's look at the words and see if they form any reasonable topics.

In [15]:
# Hand-written English glosses of the weekend topic words; each key is an
# interpretive label chosen for one topic.
weekend_topic_glosses = {
    'generic': [
        'central', 'work', 'Xi Jinping', 'religion', 'politics',
        'comprehensive', 'conference', 'issue', 'spirit', 'people',
    ],
    'intl_cooperation': [
        'cooperation', 'development', 'relations', 'Xi Jinping', 'the Chinese side',
        'country', 'international', 'together', 'organization', 'bilateral',
    ],
    'Policy area': [
        'development', 'economics', 'finance', 'reform', 'innovation',
        'construction', 'society', 'health', 'push forward', 'service',
    ],
    '#WeAreTheBest': [
        'China', 'world', 'country', 'international', 'people',
        'development', 'we', 'economics', 'global', 'Xi Jinping',
    ],
    'poverty + Syria': [
        'poverty', 'overcome poverty', 'poverty relief', 'region', 'organization',
        'conquer', 'people', 'deeply', 'Syria', 'accurate',
    ],
}
weekend_topics_eng = pd.DataFrame(weekend_topic_glosses)
weekend_topics_eng.to_csv('../results/result6_xwlb_daily_weekend_topics_eng.csv', index=False)
weekend_topics_eng

Unnamed: 0,#WeAreTheBest,Policy area,generic,intl_cooperation,poverty + Syria
0,China,development,central,cooperation,poverty
1,world,economics,work,development,overcome poverty
2,country,finance,Xi Jinping,relations,poverty relief
3,international,reform,religion,Xi Jinping,region
4,people,innovation,politics,the Chinese side,organization
5,development,construction,comprehensive,country,conquer
6,we,society,conference,international,people
7,economics,health,issue,together,deeply
8,global,push forward,spirit,organization,Syria
9,Xi Jinping,service,people,bilateral,accurate


Comment: The words seem to hint at four topics, plus one with a less clear pattern. Note that #WeAreTheBest is present in both the weekend and the weekday data. One major hint for this categorization comes from the inclusion of grand and vague words like "world", "global", and "country", together with "we." This combination changes the tone of the passage, as if the broadcaster were speaking directly to the viewers.
International cooperation also seems to occur in both datasets. 

In [16]:
# Interpretive labels for the all-days topics, listed in the same order as
# the topics in all_topic_words.
index = [
    'XJP', 'intl_cooperation', 'reform', 'party_discipline', 'Natl_Peoples_Congress',
    '#WeAreTheBest', 'intl_conflict', 'domestic_economics', 'leadership', 'intl_economics',
]
# Label each topic row, then transpose so the labels become column headers.
topic_df = pd.DataFrame(all_topic_words, index=index).T
topic_df.to_csv(
    '../results/result7_xwlb_daily_all_topics.csv',
    encoding='gb18030',
    index=False,
)
topic_df

Unnamed: 0,XJP,intl_cooperation,reform,party_discipline,Natl_Peoples_Congress,#WeAreTheBest,intl_conflict,domestic_economics,leadership,intl_economics
0,习近平,合作,改革,政治,会议,中国,美国,发展,中央,国家
1,人民,关系,深化,工作,人大,世界,组织,经济,中共,金砖
2,发展,发展,会议,中央,审议,人民,贫困,创新,政治局,安全
3,社会,习近平,推进,监督,草案,国际,叙利亚,企业,委员,国际
4,建设,中方,制度,领导,关于,我们,地区,建设,主席,合作
5,工作,双方,全面,干部,常委会,共产党,扶贫,社会,同志,习近平
6,我们,领域,体制,党内,代表,伟大,脱贫,金融,全国,领导人
7,总书记,共同,落实,坚持,报告,特色,政府,李克强,政协,世界
8,社会主义,加强,工作,全面,工作,全球,问题,推动,书记,全球
9,坚持,推动,机制,问题,委员会,实现,俄罗斯,增长,十九,共同


Finally, let's look at the words and see if they form any reasonable topics in the big dataset.

In [17]:
# Hand-written English glosses of the all-days topic words (20 per topic);
# each key is the interpretive label chosen for one topic.
all_topic_glosses = {
    'XJP': [
        'Xi Jinping', 'people', 'development', 'society', 'construction',
        'work', 'we', 'General Secretary', 'socialism', 'persist',
        'ethnic', 'great', 'important', 'spirit', 'characteristics',
        'thoughts', 'emphasize', 'point out', 'comrade', 'era',
    ],
    'intl_cooperation': [
        'cooperation', 'relations', 'development', 'Xi Jinping', 'the Chinese side',
        'bilateral', 'field', 'together', 'strengthen', 'push forward',
        'strategy', 'Chairman', 'communication', 'partner', 'meeting',
        'friendly', 'deepen', 'express', 'international', 'support',
    ],
    'reform': [
        'reform', 'deepen', 'conference', 'push forward', 'system',
        'comprehensive', 'institution', 'implement', 'work', 'mechanism',
        'central', 'perfect (v.)', 'construction', 'management', 'strengthen',
        'testing', 'service', 'leadership', 'emphasize', 'comment',
    ],
    'party_discipline': [
        'politics', 'work', 'central', 'supervise', 'leadership',
        'cadre', 'within party', 'persist', 'comprehensive', 'issue',
        'party management', 'inspection', 'plenary session', 'organization', 'strictly',
        'discipline', 'life', 'strengthen', 'party member', 'important',
    ],
    'Natl_Peoples_Congress': [
        'conference', 'Peoples Congress', 'deliberate', 'draft', 'about',
        'standing committee', 'representative', 'report', 'work', 'committee',
        'twelve', 'people', 'decision', 'law', 'Zhang Dejiang',
        'national', 'convention', 'committee chair', 'situation', 'pass',
    ],
    '#WeAreTheBest': [
        'China', 'world', 'people', 'international', 'we',
        'communist party', 'great', 'characteristics', 'global', 'realize',
        'socialism', 'era', 'history', 'express', 'represent',
        'today', 'convention', 'China (Greater China area)', 'country', 'reform',
    ],
    'intl_conflict': [
        'USA', 'organization', 'poverty', 'Syria', 'area',
        'poverty relief', 'overcome poverty', 'government', 'issue', 'Russia',
        'express', 'extreme', 'at present', 'conduct', 'today',
        'already', 'local', 'people', 'this year', 'president',
    ],
    'domestic_economics': [
        'development', 'economics', 'innovation', 'corporation', 'construction',
        'society', 'finance', 'Li Keqiang', 'push on', 'grow',
        'promote', 'policy', 'industry', 'push forward', 'technology',
        'open up', 'market', 'supply', 'technology', 'world',
    ],
    'leadership': [
        'central', 'Chinese Communist party', 'Politburo', 'committee members', 'chairman',
        'comrade', 'national', 'Chinese Peoples Political Consultative Conference', 'secretary', 'nineteen',
        'standing committee', 'leadership', 'Xi Jinping', 'spirit', 'military commission',
        'General Secretary', 'committee', 'communist party', 'people', 'representative',
    ],
    'intl_economics': [
        'country', 'BRIC', 'security', 'international', 'cooperation',
        'Xi Jinping', 'leaders', 'world', 'global', 'together',
        'manage', 'meeting', 'developing', 'scientific technology', 'internet',
        'Hong Kong', 'innovation', 'attend', 'technology', 'important',
    ],
}
all_topics_eng = pd.DataFrame(all_topic_glosses)
all_topics_eng.to_csv('../results/result8_xwlb_daily_all_topics_eng.csv', index=False)
all_topics_eng

Unnamed: 0,#WeAreTheBest,Natl_Peoples_Congress,XJP,domestic_economics,intl_conflict,intl_cooperation,intl_economics,leadership,party_discipline,reform
0,China,conference,Xi Jinping,development,USA,cooperation,country,central,politics,reform
1,world,Peoples Congress,people,economics,organization,realtions,BRIC,Chinese Communist party,work,deepen
2,people,deliberate,development,innovation,poverty,development,security,Politburo,central,conference
3,international,draft,society,corporation,Syria,Xi Jinping,international,committee members,supervise,push forward
4,we,about,construction,construction,area,the Chinese side,cooperation,chairman,leadership,system
5,communist party,standing committee,work,society,poverty relief,bilateral,Xi Jinping,comrade,cadre,comprehensive
6,great,representative,we,finance,overcome poverty,field,leaders,national,within party,institution
7,characterstics,report,General Secretary,Li Keqiang,government,together,world,Chinese Peoples Political Consultative Conference,persist,implement
8,global,work,socialism,push on,issue,strengthen,global,secretary,comprehensive,work
9,realize,committee,persist,grow,Russia,push forward,together,nineteen,issue,mechanism


Comment: The ten topics seem pretty distinct. Four of the ten topics include Xi Jinping in their list of words, and one of the topics seems to be entirely about Xi himself. 