In [None]:
"""
Explore textnets using its tutorial example (newspaper headlines about the moon landing).
**Run in the `text_env` environment on m1.**

"""

In [2]:
from textnets import Corpus, Textnet
from textnets import examples

In [12]:
import spacy
from spacy.lang.en.examples import sentences 
import en_core_web_sm
# Load the `en_core_web_sm` spaCy model from its installed package.
# NOTE(review): neither `nlp` nor `sentences` is referenced by any other cell
# visible in this notebook — presumably a sanity check that spaCy and the
# model import cleanly; confirm before removing.
nlp = en_core_web_sm.load()

In [13]:
import pandas as pd

In [14]:
# Build a textnets Corpus from the bundled moon-landing headlines example.
corpus = Corpus(examples.moon_landing)

In [15]:
# Inspect which container type the example data ships as.
type(examples.moon_landing)

pandas.core.series.Series

In [16]:
# Display the raw example: index entries are newspaper names, values are headlines.
examples.moon_landing

The Guardian                         3:56 am: Man Steps On to the Moon
New York Times       Men Walk on Moon -- Astronauts Land on Plain, ...
Boston Globe                                         Man Walks on Moon
Houston Chronicle    Armstrong and Aldrich "Take One Small Step for...
Washington Post       The Eagle Has Landed -- Two Men Walk on the Moon
Chicago Tribune      Giant Leap for Mankind -- Armstrong Takes 1st ...
Los Angeles Times    Walk on Moon -- That's One Small Step for Man,...
Name: headlines, dtype: object

In [17]:
# Display the Corpus repr (document count and the first few document labels).
corpus

Corpus(7 docs: The Guardian, New York Times, Boston Globe…)

In [18]:
# Same headlines as a single-column DataFrame, for a tabular rendering.
examples.moon_landing.to_frame()

Unnamed: 0,headlines
The Guardian,3:56 am: Man Steps On to the Moon
New York Times,"Men Walk on Moon -- Astronauts Land on Plain, ..."
Boston Globe,Man Walks on Moon
Houston Chronicle,"Armstrong and Aldrich ""Take One Small Step for..."
Washington Post,The Eagle Has Landed -- Two Men Walk on the Moon
Chicago Tribune,Giant Leap for Mankind -- Armstrong Takes 1st ...
Los Angeles Times,"Walk on Moon -- That's One Small Step for Man,..."


In [19]:
# Create textnet
# NOTE(review): the recorded run of this cell failed with spaCy
# KeyError [E002] "Can't find factory for 'tok2vec'" (see the output below).
# Presumably the installed spaCy version and the installed en_core_web_sm
# model are incompatible — verify the environment before re-running; the
# tutorial cells that follow all use `tn` and cannot run until this succeeds.
tn = Textnet(corpus.tokenized(), min_docs=1)



KeyError: "[E002] Can't find factory for 'tok2vec'. This usually happens when spaCy calls `nlp.create_pipe` with a component name that's not built in - for example, when constructing the pipeline from a model's meta.json. If you're using a custom component, you can write to `Language.factories['tok2vec']` or remove it from the model meta and add it via `nlp.add_pipe` instead."

In [None]:
# Plot the document–term network with both node kinds labelled; clusters are
# shown to identify document–term groups that appear to form part of the same
# theme in the texts.
tn.plot(
    show_clusters=True,
    label_doc_nodes=True,
    label_term_nodes=True,
)

In [None]:
# Project the textnet onto its document nodes to get a network of newspapers,
# then draw it with node labels.
papers = tn.project(node_type="doc")
papers.plot(label_nodes=True)

In [None]:
# Project onto the term nodes instead to get a network of words, drawn with
# node labels and clusters shown.
words = tn.project(node_type="term")
words.plot(show_clusters=True, label_nodes=True)

In [None]:
# Betweenness centrality of the newspapers.
# Expected reading (not reproduced in this notebook's recorded outputs): the
# Los Angeles Times acts as a cultural bridge, linking the headline themes of
# the East Coast newspapers to the others.

papers.top_betweenness()

In [None]:
# Betweenness centrality of the terms.
# Expected reading (not reproduced in this notebook's recorded outputs): the
# Times uses the word “walk” in its headline, linking the “One Small Step”
# cluster to the “Man on Moon” cluster.

words.top_betweenness()

In [None]:
# Plot the word network with nodes scaled by betweenness and clusters
# coloured, labelling only the terms whose betweenness is above the median.
# The median is the same for every node, so it is computed once here instead
# of being re-evaluated inside the per-node label filter.
median_betweenness = words.betweenness.median()
words.plot(label_nodes=True,
           scale_nodes_by='betweenness',
           color_clusters=True,
           node_label_filter=lambda n: n.betweenness() > median_betweenness)

## Try with our transcripts

In [None]:
# Load two pickled frames: speech_df (its `transcript` and `speaker` columns
# are used below) and topic_df.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling —
# only load dump files produced by this project.
speech_df = pd.read_pickle('../dump/speech_clean_lemma')
topic_df = pd.read_pickle('../dump/topic_df')
# Unused alternative input (the document-term matrix and its transpose), kept for reference:
# data = pd.read_pickle('../dump/data_dtm_lemma.pkl')
# tdm = data.transpose()

In [None]:
# Inspect the loaded speeches.
speech_df

In [None]:
# One-column frame holding only the transcripts, with the row labels replaced
# by the corresponding `speaker` values.
df = speech_df[['transcript']].copy()
df.index = speech_df['speaker'].tolist()

In [None]:
# Check the speaker-indexed transcript frame.
df

In [None]:
# The transcript column on its own.
df.transcript

In [None]:
# Tutorial Series again, for side-by-side comparison with our own data
# (presumably to mirror its label -> text layout when building a Corpus).
examples.moon_landing

In [None]:
# Inspect the loaded topic table.
topic_df

In [None]:
# Join the values of each row of topic_df (presumably the top words of one
# topic — confirm against the dump) into a single space-separated string and
# store it in a 'words' column.
# Any existing 'words' column is excluded from the join, so re-running this
# cell does not fold the previous result back into the new string.
topic_df['words'] = (
    topic_df
    .drop(columns='words', errors='ignore')
    .apply(lambda row: ' '.join(row), axis=1)
)

In [None]:
# Keep only the joined strings in an independent frame, so relabelling its
# rows cannot touch topic_df.
topic_word = topic_df[['words']].copy()
# Label the rows '#01', '#02', ... — generated from the actual number of rows
# rather than a fixed four-item list, so a different topic count still works.
topic_word.index = ['#{:02d}'.format(i) for i in range(1, len(topic_word) + 1)]
topic_word

In [None]:
# Build a Corpus whose documents are the joined topic strings
# (this rebinds `corpus`, replacing the tutorial corpus created earlier).
corpus = Corpus(topic_word['words'])

In [None]:
# Create textnet from the topic corpus (rebinds `tn`); min_docs=1 is passed
# exactly as in the tutorial cell earlier in the notebook.
tn = Textnet(corpus.tokenized(), min_docs=1)

In [None]:
# Display the textnet's repr.
tn

In [None]:
# Document–term plot for the topic corpus: clusters are shown to identify
# document–term groups that appear to form part of the same theme in the
# texts, with cluster colouring switched off.
tn.plot(
    color_clusters=False,
    show_clusters=True,
    label_doc_nodes=True,
    label_term_nodes=True,
)