In [None]:
import pandas as pd
import spacy
import networkx as nx                        # a really useful network analysis library
import matplotlib.pyplot as plt
# from networkx.algorithms import community   # not used, yet... 
import datetime                              # access to %%time, for timing individual notebook cells
import os
import spacy_transformers

In this section we parse the document into a CSV file. *Unfortunately, only `.docx` files work; `.doc` files can only be handled in Python 2.*


This notebook carries out a thematic analysis; the coding section of the essay follows below.

In [None]:
# Name of the spaCy pipeline to load. 'en_core_web_lg' is the more detailed model
# (higher-dimension word vectors; normally about 13s to load). Use 'en_core_web_md'
# instead for a smaller model, e.g. for testing.
SPACY_MODEL = 'en_core_web_lg'
nlp = spacy.load(SPACY_MODEL)

In [None]:
# Default figure size (width, height in inches): makes the output plots large enough to be useful.
plt.rcParams.update({'figure.figsize': [5, 20]})

In [None]:
import pandas as pd

# Load the panel-discussion responses; later cells read the 'Q1' and 'Panelist' columns.
CSV_PATH = 'panel_discussion.csv'
data = pd.read_csv(CSV_PATH)

In [None]:
# Parse one free-text column with spaCy and store the results as new DataFrame columns:
#   parsed_doc     - the spaCy Doc for each row (None if the parse failed)
#   comment_tokens - token texts
#   comment_lemma  - token lemmas
#   pos_pos        - coarse part-of-speech tags
tokens = []
lemma = []
pos = []
parsed_doc = []

# Only `col_to_parse` is run through the pipeline in this cell; the other names are
# presumably swapped in by hand to parse the remaining questions - TODO confirm.
col_to_parse = 'Q1'
col2_to_parse = 'Q2'
col3_to_parse = 'Q3'
col4_to_parse = 'Q4'
col5_to_parse = 'AddQ'
col6_to_parse = 'LastQ'


for doc in nlp.pipe(data[col_to_parse].astype('unicode').values, batch_size=1,
                        n_process=1):
    if doc.has_annotation("DEP"):
        parsed_doc.append(doc)
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails.
        # parsed_doc must be padded too: otherwise it ends up shorter than the
        # DataFrame and the column assignment below raises a length-mismatch ValueError.
        parsed_doc.append(None)
        tokens.append(None)
        lemma.append(None)
        pos.append(None)
data['parsed_doc'] = parsed_doc
data['comment_tokens'] = tokens
data['comment_lemma'] = lemma
data['pos_pos'] = pos


In [None]:
# Sanity checks: show the Python type stored in the 'parsed_doc' column,
# then list the distinct panelist labels (displayed as the cell output).
first_doc = data['parsed_doc'][0]
print(type(first_doc))
data['Panelist'].unique()



The next cell shows how I obtained the parsed values for each section (i.e. Q1, Q2, etc.). spaCy's NLP pipeline only processes one column at a time, so writing CSVs and copying the contents was quicker.

So it's only necessary for multiple values; however, the data type then happens not to be a spaCy object. For multiple columns, the dataframe returns a string object instead. This is due to the NLP pipeline appending the 'str' from the CSV and then 'broadcasting' onto vectors readable by Similarity, a spaCy function which leverages NumPy.

In [None]:
# Disabled export step. When uncommented, it turns off column-width truncation and
# prints the first 50 rows of the parsed columns into 'file7.csv'.
#pd.set_option('max_colwidth', None)
#blah = data[['parsed_doc', 'comment_tokens', 'comment_lemma', 'pos_pos']].head(50)
#with open('file7.csv', mode='w') as file_object:
#            print(blah, file=file_object)

In [None]:
# spaCy's built-in English stop-word collection: report its size, then list every entry.
stop_words = spacy.lang.en.stop_words.STOP_WORDS
stop_word_count = len(stop_words)
print('Number of stopwords: %d' % stop_word_count)
print([word for word in stop_words])

spaCy has a built-in similarity function, which measures how closely related two or more objects, such as sentiments, are to a 'target object' (in this case the rank in k-space).

In [None]:
# Plain-text dump of the parsed responses column.
print(data['parsed_doc'])

In [None]:
# Similarity score between the first two parsed responses.
doc_a, doc_b = data['parsed_doc'][0], data['parsed_doc'][1]
print(doc_a.similarity(doc_b))
# Other pairs tried previously (disabled):
#print(data['parsed_doc'][0].similarity(data['parsed_doc'][10]))
#print(data['parsed_doc'][1].similarity(data['parsed_doc'][10]))

In [None]:
# `tot` is the full table (same object as `data`, not a copy); the *_data frames
# hold the rows of each individual panelist.
tot = data
X_data, Y_data, Z_data, T_data, R_data = (
    data[data['Panelist'] == label] for label in 'XYZTR'
)

In [None]:
# Build a complete, undirected similarity graph: one node per parsed response and one
# edge per unordered pair of responses, weighted by their spaCy similarity.
# Takes about 1s for 500 nodes - the number of pairs grows quadratically, so this
# won't scale linearly!
raw_G = nx.Graph() # undirected

# Rows without a parsed document (None/NaN) cannot be compared, so leave them out
# instead of failing on `.similarity`.
docs = list(tot['parsed_doc'].dropna())

# Visit each unordered pair exactly once (every document against the ones after it),
# rather than scanning the full n x n product and skipping the mirrored half.
for idx, doc_a in enumerate(docs):
    for doc_b in docs[idx + 1:]:
        raw_G.add_edge(doc_a, doc_b, weight=doc_a.similarity(doc_b))

print(raw_G.number_of_nodes(), "nodes, and", raw_G.number_of_edges(), "edges created.")

In [None]:
# Collect every edge whose similarity weight falls below the cutoff.
# Walking the adjacency mapping reports each undirected edge twice - once from each
# endpoint - which is why the count printed below is halved.
edges_to_kill = []
min_wt = 0.96    # this is our cutoff value for a minimum edge-weight

for node, neighbours in raw_G.adj.items():
    for neighbour, edge_attrs in neighbours.items():
        # Keep the weight in its own local name: calling it `data` would overwrite
        # the `data` DataFrame that earlier cells rely on.
        weight = edge_attrs['weight']
        if weight < min_wt:
            # remove edges below a certain weight
            edges_to_kill.append((node, neighbour))

print("\n", len(edges_to_kill) / 2, "edges to kill (of", raw_G.number_of_edges(), "), before de-duplicating")

In [None]:
# Drop all collected weak edges in one call. Each edge was collected from both
# endpoints, so half the entries are already gone when reached as (v, u);
# remove_edges_from skips edges that are no longer in the graph.
raw_G.remove_edges_from(edges_to_kill)

In [None]:
# strong_G is a second name for the pruned graph (the same object, not a copy).
strong_G = raw_G
print(raw_G.number_of_edges())

In this section we visualize the graphed nodes. 

In [None]:
# Quick first look at the pruned graph using networkx's default layout.
nx.draw(strong_G, edge_color='aqua', node_size=20)

In [None]:
from math import sqrt

# Lay the pruned graph out with the Fruchterman-Reingold (spring) algorithm and draw it.
# NOTE: this rebinds `pos`, the name the parsing cell used for its part-of-speech lists.
count = strong_G.number_of_nodes()
# default for this is 1/sqrt(n), but this will 'blow out' the layout for better visibility.
# max(count, 1) keeps an empty graph from raising ZeroDivisionError.
equilibrium = 10 / sqrt(max(count, 1))
# The layout starts from random positions; a fixed seed makes the figure reproducible
# on every Restart & Run All.
pos = nx.fruchterman_reingold_layout(strong_G, k=equilibrium, iterations=200, seed=42)
nx.draw(strong_G, pos=pos, node_size=10, edge_color='tan')

In [None]:


# Draw the pruned graph again, this time with node labels.
plt.rcParams['figure.figsize'] = [16, 9]  # a better aspect ratio for labelled nodes

nx.draw(strong_G, pos, font_size=3, node_size=50, edge_color='tan', with_labels=False)

# Offset the labels 0.03 layout units below their nodes so the text does not cover
# the markers. The offsets go into a new dict: editing `pos` in place would move the
# labels further on every re-run of this cell and change the layout for later draws.
label_pos = {node: (x, y - 0.03) for node, (x, y) in pos.items()}
nx.draw_networkx_labels(strong_G, label_pos, font_size=12, font_color='k')

plt.show()



<h4><i>In this analysis, at the edge weight value of 0.989, depicting the most similar values, we can see two clusters: 'NaN', which we will ignore, and a response from X in question one, thereby emphasizing the significance of sentiments with respect to this qualitative data as a whole.  "Y mentioned a war technology. How in principle this word could attract many people is because it makes it more real, more concrete. On the other side, Quantum Technology in general is already a sort of science-fiction topic. In that sense, it brings two points.</i> <h3><b>One is the word technology in the case of quantum, may not be so helpful for the general public.</b><b> Because general public is attracted by the aura of mystery behind it."</b> </h3><i>This is however not representative of the whole, just for sentences of the highest similarity. For a highly connected cluster, we will lower the minimum weight to include more nodes. </i></h4>
<h4>It's clear to see a giant cluster at a weight of &lt; 0.985, and at that value the data vectors are similar and legible enough to make sense of what the central node is. 
From this we can see the sentence, "I think it's extremely important to bring people from outside of the physics department in Quantum Technology" - from X in question 2. Any weight value lower than the current weight still yields this sentence as its strongest connected nodal cluster.</h4>

In [None]:
import explacy

# Show the dependency parse of the highlighted Q1 sentences.
# The stray "</b><b>" HTML markup that had been pasted into the text was removed,
# so that only the panelist's actual words are parsed.
explacy.print_parse_info(nlp," One is the word technology in the case of quantum, may not be so helpful for the general public. Because general public is attracted by the aura of mystery behind it.")

In [None]:
import explacy

# Show the dependency parse of the sentence identified as the central node.
central_sentence = " I think it's extremely important to bring people from outside of the physics department in Quantum Technology."
explacy.print_parse_info(nlp, central_sentence)

"I think it's extremely important to bring people from outside of the physics department in Quantum Technology." This quote contains the highest edge weight and although the fully connected graph contains 32 'similar' points, the above quote contains 12 nodes. It then stands to reason that Panelist X's 2nd response represents the central theme as a whole, and this single sentence emphasizes that point. 