In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
import fasttext
import string
from pycountry import languages
import re
from gensim.parsing.preprocessing import remove_stopwords



In [3]:
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashbu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# bringing in the model path for fasttext
PRETRAINED_MODEL_PATH = 'lid.176.bin'
model = fasttext.load_model(PRETRAINED_MODEL_PATH)




In [5]:
# function to help figure out the language
def lang_class(x):
    x = x .replace("\n", " ")
    lang = model.predict(x)
    
    return lang

In [6]:
# function to help retrieve the language name using the language code
def lang_from_code(x):
    x = x[0]
    x = str(x)
  
    match = re.match(r"__label__([a-zA-Z]+)", x)
    match = match.group(1)

    if len(match) == 2:
        
        try:
            lang = languages.get(alpha_2=match).name

        except:
            lang = match
    elif len(match) == 3:
        try:
            lang = languages.get(alpha_3=match).name
        
        except:   
            lang = match
        
    return(lang)

In [7]:
# global vars -> but not too terrible because otherwise it slows down executation by orders of magnitude
stopwords = nltk.corpus.stopwords.words("english")
import string
punct = string.punctuation
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# pre-proc function to lowercase, rm punct, #s, stopwords, split & lemmatize
def pre_proc(x):
    # lowercasing the text -> slight possibility of an issue with names...but should be okayish
    x = x.lower()
   
    # removing punctuation
    x = x.translate(str.maketrans('', '', punct))
    
    # removing double-spaces by changing them to single spaces
    x = x.replace("  ", " ")
    
    # removing numbers 
    x = ''.join(filter(lambda string: not string.isdigit(), x))

    # removing the stopwords
    x = remove_stopwords(x)
        
    # splitting the press releases
    x = x.split()
    
    # lemmatizing the words in the releases ->
    x = [wordnet_lemmatizer.lemmatize(word) for word in x]
    
    # joining together the elements in the tokenized list
    x = " ".join(x)  
        
    # making sure I don't forget to return my result ;) 
    return x

In [8]:
# reading in the data
data = pd.read_json("PRC-UK_Embassy_press_releases.ndjson",lines=True)

### Looking at the more complicated stuffs

#### Fasttext language classification on body

In [9]:
%%time
# trying to detecting the languages present here
data['bo_lang'] = data['body'].apply(lang_class)

Wall time: 131 ms


In [10]:
%%time
# splitting the tuples found in the lang_class column
data[["bo_lang","conf"]] = pd.DataFrame(data['bo_lang'].tolist(), index=data.index)


Wall time: 2 ms


In [11]:
%%time
# applying func to pull lang code
data["bo_lang"] = data["bo_lang"].apply(lang_from_code)

Wall time: 33 ms


In [12]:
data["bo_lang"].value_counts()

English    373
Name: bo_lang, dtype: int64

In [13]:
# need to convert the confidence from single deep lists into actual values
data["conf"]

0      [0.9390600919723511]
1      [0.9533370733261108]
2       [0.952850878238678]
3      [0.9550470113754272]
4      [0.9619375467300415]
               ...         
368    [0.9459292888641357]
369    [0.9434953331947327]
370    [0.9434953331947327]
371    [0.9681368470191956]
372    [0.9681368470191956]
Name: conf, Length: 373, dtype: object

### Data Cleaning

In [14]:
%%time
# cleaning function -> returns un tokenized text but cleaned w/stopwords removed
data["cleaned"] = data["body"].apply(pre_proc)

Wall time: 1.98 s


### Single Term Frequency

#### Term Frequency overall

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# vectorizing the posts
# initializing the vectorizer
# setting min doc freq to 1 release b/c of the small size of the corpus (not setting max df b/c of a fear of losing things)
vectorizer = CountVectorizer(min_df=1)

# applying the vectorizer
tf = vectorizer.fit_transform(data["cleaned"])

# pulling the terms
tf_feature_names = vectorizer.get_feature_names()

# pulling the term counts
term_counts = np.asarray(tf.sum(axis=0))[0]

# making a dictionary w/terms and term counts
term_freqs = dict(zip(tf_feature_names, term_counts))

In [16]:
# looks at the top 50 terms to be found here overall
# takes advantage of the return type of sorted (a list of tuples) and takes a slice before making a dict from that
dict((sorted(term_freqs.items(), key=lambda item: item[1],reverse=True))[0:50])

{'china': 2349,
 'chinese': 1080,
 'country': 1062,
 'hong': 995,
 'kong': 966,
 'people': 910,
 'xinjiang': 758,
 'uk': 753,
 'law': 723,
 'international': 629,
 'right': 591,
 'development': 497,
 'affair': 489,
 'security': 436,
 'government': 432,
 'national': 414,
 'global': 400,
 'world': 384,
 'ethnic': 372,
 'cooperation': 356,
 'embassy': 343,
 'report': 319,
 'time': 310,
 'year': 282,
 'foreign': 281,
 'group': 268,
 'state': 268,
 'question': 248,
 'measure': 240,
 'policy': 237,
 'internal': 234,
 'public': 233,
 'region': 233,
 'joint': 229,
 'covid': 226,
 'relevant': 224,
 'spokesperson': 223,
 'fact': 215,
 'economic': 214,
 'freedom': 211,
 'including': 210,
 'issue': 205,
 'effort': 204,
 'comment': 201,
 'community': 198,
 'th': 195,
 'democracy': 193,
 'letter': 191,
 'human': 189,
 'million': 188}

Looking at the terms and their frequency. We don't see anything too surprising here to a student of global affairs. Quite a bit about China and Chinese, Hong Kong, the UK. Standard terms such as international, right, law, global, government, and security. Question isn't too surprising as it is a common enough term that pops up in the corpus (many of the releases follow a Q&A format to a degree). While "th" clearly refers to dates (after I removed the #'s).

#### TF by Ambassador

In [27]:
data["ambassador"].unique()

array(['Ambassador Zheng Zeguang', 'Ambassador Liu Xiaoming'],
      dtype=object)

In [24]:
# first step is to run it per ambassador and then run the top terms for each of those :(
data[data["ambassador"]=="Ambassador Zheng Zeguang"]

Unnamed: 0,storage_url,title,date,body,ambassador,bo_lang,conf,cleaned
0,https://web.archive.org/web/20211130031323/htt...,Embassy Spokesperson's Remarks on Chinese oper...,2020-08-24 23:45:00,"Question: According to British media's report,...",Ambassador Zheng Zeguang,English,[0.9390600919723511],question according british medias report ecuad...
1,https://web.archive.org/web/20211130052058/htt...,Embassy Spokesperson's Remarks on issues relat...,2020-08-25 21:03:00,"Question: In an open letter, MPs will be urged...",Ambassador Zheng Zeguang,English,[0.9533370733261108],question open letter mp urged campaign uk push...
2,https://web.archive.org/web/20211130025452/htt...,Embassy Spokesperson's Remarks on issues relat...,2020-08-25 21:05:00,Question: A letter signed by religious leaders...,Ambassador Zheng Zeguang,English,[0.952850878238678],question letter signed religious leader differ...
3,https://web.archive.org/web/20211130042447/htt...,Embassy Spokesperson's Remarks on The Times' R...,2020-08-29 07:08:00,"Question: On 28 August, The Times carried a re...",Ambassador Zheng Zeguang,English,[0.9550470113754272],question august time carried report saying num...
4,https://web.archive.org/web/20211130054122/htt...,Embassy Spokesperson's Remarks on the Claim by...,2020-09-01 23:40:00,"Question: On August 30th, Tom Tugendhat, Chair...",Ambassador Zheng Zeguang,English,[0.9619375467300415],question august th tom tugendhat chairman fore...
...,...,...,...,...,...,...,...,...
310,https://web.archive.org/web/20211009215307/htt...,Embassy Spokesperson's Remarks on Some Media R...,2021-10-08 00:00:00,Question: Some media report cites a study that...,Ambassador Zheng Zeguang,English,[0.9542698264122009],question medium report cite study alleges chin...
311,https://web.archive.org/web/20211013205341/htt...,Chinese Foreign Ministry Spokesperson's Remark...,2021-10-12 00:00:00,"Question: The ""witness cost"" of the so-called ...",Ambassador Zheng Zeguang,English,[0.9676141738891602],question witness cost socalled uyghur tribunal...
312,https://web.archive.org/web/20211014135329/htt...,Embassy Spokesperson's Comment on the China-re...,2021-10-14 00:00:00,"Question: A few days ago, UK Secretary of the ...",Ambassador Zheng Zeguang,English,[0.9386454224586487],question day ago uk secretary state foreign co...
313,https://web.archive.org/web/20211020221311/htt...,Chinese Embassy Spokesperson's Remarks on Ecol...,2021-10-19 00:00:00,Question: The Qinghai-Tibet Plateau is known a...,Ambassador Zheng Zeguang,English,[0.9391968846321106],question qinghaitibet plateau known pole world...


In [None]:

# applying the vectorizer
tf = vectorizer.fit_transform(data["cleaned"])

# pulling the terms
tf_feature_names = vectorizer.get_feature_names()

# pulling the term counts
term_counts = np.asarray(tf.sum(axis=0))[0]

# making a dictionary w/terms and term counts
term_freqs = dict(zip(tf_feature_names, term_counts))

### Bi-gram & Tri-gram TF

In [None]:
# first step is to resplit into unigram - tri trigrams


### Sentiment Analysis

In [41]:
# pretty quick and easy

#### Textblob

#### BERT

#### Vader & NLTK

#### Visualize by Ambassador's Term

### Named Entity Recognition