In [1]:
!pip install gensim spacy



In [16]:
import pandas as pd
import numpy
from gensim import corpora
from gensim.models import Phrases
from gensim.utils import simple_preprocess
from gensim.models import LdaMulticore
import os

In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
df = pd.read_csv("usnewspaper_china.csv", header=None,)

In [5]:
header = ['documents', 'id', 'date', 'title', 'href', 'date_2', 'author', 'tags', 'base_url'] # will workshop the names if they will be helpful for demo

In [6]:
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,"In the new Sino-Russian defense relationship, ...",19169618,2023-04-04,China Is Eating Russia’s Lunch in the Defense...,https://www.defenseone.com/ideas/2023/04/chi...,2023-04-04,"{Thomas Corbett,Research Analyst,Bluepath Labs}","{china,war,russias,chinas,lunch,market,industr...",https://www.defenseone.com/,,
1,Cookie Preferences Cookie List\n\nCookie List\...,22151305,2023-09-05,Geopolitical tension with China would hit U.S...,https://www.defenseone.com/policy/2023/08/ge...,2023-09-05,"{Patrick Tucker,Science,Technology Editor,Defe...","{tension,study,geopolitical,sale,selection,sho...",https://www.defenseone.com/,,
2,"In this image made from video, Chinese Foreign...",18118957,2023-02-13,China says more than 10 US balloons flew in i...,https://www.journalgazette.net/news/world/ch...,2023-02-13,{Associated Press},"{airspace,wang,ministry,office,china,image,spe...",https://www.journalgazette.net/,,
3,Associated Press\n\nBEIJING — Heavy flooding h...,20894060,2023-07-04,"Flooding displaces 10,000 across China",https://www.mercurynews.com/2023/07/03/flood...,2023-07-04,{The Associated Press},"{temperature,china,heavy,rainfall,capital,floo...",http://www.mercurynews.com/,,
4,"El presidente estadounidense, Joe Biden, afirm...",21023107,2023-07-10,Joe Biden cree que se pueden establecer relac...,https://laopinion.com/2023/07/09/joe-biden-c...,2023-07-10,{La Opinión},"{y,joe,relaciones,los,que,china,en,cree,pueden...",https://laopinion.com/,,


In [7]:
df.isnull().sum()

0         0
1         0
2         0
3         0
4         0
5         0
6      3956
7         0
8         0
9     10000
10    10000
dtype: int64

In [8]:
df = df.drop(columns=[9, 10])

In [9]:
df[0]

0       In the new Sino-Russian defense relationship, ...
1       Cookie Preferences Cookie List\n\nCookie List\...
2       In this image made from video, Chinese Foreign...
3       Associated Press\n\nBEIJING — Heavy flooding h...
4       El presidente estadounidense, Joe Biden, afirm...
                              ...                        
9995    \n\n\n\n\n\n\n\n\n\n\n\n\n\nBEIJING, March 20 ...
9996    It was a cold and overcast morning in November...
9997    China`s expansion of its nuclear arsenal is th...
9998    NEW You can now listen to Fox News articles!\n...
9999    Hong Kong CNN —\n\nKiki Yang brought an empty ...
Name: 0, Length: 10000, dtype: object

In [10]:
documents = df[0].dropna().tolist()

In [11]:
def preprocess(docs):
    tokenized_docs = [simple_preprocess(doc) for doc in docs]
    bigram = Phrases(tokenized_docs, min_count=5, threshold=100)
    trigram = Phrases(bigram[tokenized_docs], threshold=100)
    return [trigram[bigram[doc]] for doc in tokenized_docs]

processed_docs = preprocess(documents)

In [12]:
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [17]:
lda = LdaMulticore(corpus, num_topics=20, id2word=dictionary, passes=5, workers=min(10, os.cpu_count() - 1))
for topic in lda.print_topics():
    print(topic)

(0, '0.068*"the" + 0.035*"to" + 0.026*"of" + 0.024*"and" + 0.018*"in" + 0.014*"it" + 0.014*"that" + 0.014*"on" + 0.011*"for" + 0.011*"china"')
(1, '0.073*"the" + 0.035*"and" + 0.032*"to" + 0.031*"in" + 0.025*"of" + 0.014*"china" + 0.011*"that" + 0.010*"on" + 0.009*"with" + 0.009*"said"')
(2, '0.018*"to" + 0.017*"the" + 0.013*"and" + 0.013*"in" + 0.011*"de" + 0.011*"of" + 0.007*"la" + 0.007*"que" + 0.006*"is" + 0.006*"china"')
(3, '0.066*"the" + 0.029*"to" + 0.029*"and" + 0.029*"of" + 0.022*"in" + 0.022*"china" + 0.014*"taiwan" + 0.013*"that" + 0.010*"on" + 0.010*"is"')
(4, '0.081*"de" + 0.032*"la" + 0.030*"en" + 0.027*"el" + 0.026*"que" + 0.015*"china" + 0.009*"un" + 0.009*"los" + 0.008*"para" + 0.007*"se"')
(5, '0.036*"die" + 0.025*"der" + 0.022*"in" + 0.021*"und" + 0.015*"china" + 0.011*"das" + 0.010*"von" + 0.010*"zu" + 0.008*"ist" + 0.007*"für"')
(6, '0.025*"the" + 0.017*"of" + 0.017*"to" + 0.012*"in" + 0.009*"new" + 0.008*"and" + 0.006*"have" + 0.006*"is" + 0.005*"that" + 0.005*"t

In [None]:
!pip install -U pip setuptools wheel
!pip install -U 'spacy[apple]'

In [None]:
import spacy
import networkx as nx

nlp = spacy.load("en_core_web_sm")

doc = nlp(' '.join(documents[:5])) 

graph = nx.DiGraph()

for token in doc:
    graph.add_node(token.text)

    if token.dep_ != "ROOT":
        graph.add_edge(token.head.text, token.text, label=token.dep_)

