In [None]:
import spacy
import pandas as pd
from subprocess import PIPE
import networkx as nx
import uuid
import os
import time
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the spaCy English model once; the same pipeline object is reused below.
SPACY_MODEL_NAME = 'en_core_web_md'
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Location of the article export. The original absolute path stays the default
# so existing runs are unaffected; set CRYPTO_ARTICLES_CSV to run the notebook
# on another machine without editing this cell.
filename = os.environ.get(
    'CRYPTO_ARTICLES_CSV',
    '/Users/kellycoulter/Desktop/PhD_Code_2021/df6.csv',
)

df6 = pd.read_csv(filename)

# Tag every row with the search query that produced this export.
df6.insert(0, 'Query', 'Cryptocurrency')

# One random UUID per distinct title, so rows sharing a title share an id.
# The mapping is built first and the whole column assigned at once: writing
# UUID objects through .loc into an integer placeholder column is an
# incompatible-dtype assignment that newer pandas versions reject.
# NOTE(review): rows with a missing Title get NaN here -- confirm that is acceptable.
title_to_uuid = {title: uuid.uuid4() for title in df6['Title'].dropna().unique()}
df6['UUID'] = df6['Title'].map(title_to_uuid)

df6.head()

In [None]:
# Snapshot of the loaded frame, taken before df6 is filtered and given extra columns.
df6_copy = df6.copy()

In [None]:
# Summary of the loaded frame (columns, dtypes, non-null counts).
df6.info()

In [None]:
# Non-null count of every column, grouped by news source.
df6.groupby('Source').count()

In [None]:
# Plot a histogram for article distribution

# groupby(...).count() returns one count per *column*, so handing it to
# plt.hist draws a separate, redundant series for every column. size() gives
# a single number of articles per source, which is what this plot describes.
publ_count = df6.groupby('Source').size()
plt.hist(publ_count)
plt.yscale('log')

# x is the number of articles an outlet published; y is how many outlets fall
# into each bin.
plt.xlabel('Number of Articles')
plt.ylabel('Number of News Publication Outlets')
plt.title('Article Distribution across Publications')

In [None]:
# Number of articles per outlet, most frequent outlet first.
pub_cnt = df6.loc[:, 'Source']
pubs_cnts = pub_cnt.value_counts()
pubs_cnts


In [None]:

# Turn the per-outlet counts into a two-column frame.
# The index and the values are named explicitly instead of renaming the
# columns produced by reset_index(): those default names differ between pandas
# versions ('index'/'Source' before 2.0, 'Source'/'count' from 2.0 on), and a
# rename that matches nothing is silently ignored.
df_publications = (
    pubs_cnts
    .rename_axis('Publication')
    .reset_index(name='Number of Publications')
)
df_publications

In [None]:
# Flag outlets that contributed at least five articles.
has_five_or_more = df_publications['Number of Publications'] >= 5
df_publications['Higher_or_equal_5'] = has_five_or_more
df_publications.head(20)

In [None]:
# Outlets that did not reach the five-article threshold.
below_threshold = ~df_publications['Higher_or_equal_5']
df_pubs_filtered = df_publications[below_threshold]
df_pubs_filtered

In [None]:
# Mark the articles whose outlet has fewer than five articles.
# isin() must receive the outlet names: when given the whole DataFrame, pandas
# compares against its column labels, so no article was ever matched.
pubs_less5_filter = df6['Source'].isin(df_pubs_filtered['Publication'])

In [None]:
# Drop articles from low-volume outlets. The explicit copy makes df6 an
# independent frame, so columns added to it later do not raise
# SettingWithCopyWarning.
df6 = df6[~pubs_less5_filter].copy()

In [None]:
# Summary of the frame after the outlet filter has been applied.
df6.info()

In [None]:
# Run the spaCy pipeline over every article body; %time reports the elapsed time.
# NOTE(review): nlp.pipe is fed df6['Body'] as-is -- confirm the column has no missing values.
%time processed_docs = list(nlp.pipe(df6['Body']))
# Store each processed document next to its source row.
df6['Text_nlp'] = processed_docs

In [None]:
# Write the processed-document column to its own CSV, then preview the frame.
df_processed_doc = df6[['Text_nlp']]
df_processed_doc.to_csv('df_processed_doc.csv')

df6.head()

In [None]:
def entity_extractor(nlp_doc, entity_type):
    """Return the distinct entity texts of one label found in a processed document.

    Parameters
    ----------
    nlp_doc : object exposing ``.ents``
        Every item of ``.ents`` must provide ``.text`` and ``.label_``.
    entity_type : str
        Label to keep; compared for equality with ``ent.label_``.

    Returns
    -------
    list of str
        Each matching entity text exactly once, in order of first appearance.
    """
    # dict.fromkeys de-duplicates while keeping insertion order. list(set(...))
    # returns the same items, but in an order that can change between kernel
    # restarts, which makes downstream output non-reproducible.
    matching_texts = (ent.text for ent in nlp_doc.ents if ent.label_ == entity_type)
    return list(dict.fromkeys(matching_texts))

In [None]:
# Spot-check the extractor on the first remaining article.
# Positional access is used because the row labelled 0 may have been removed
# by the outlet filter. Both labels are returned in one mapping because a cell
# only displays its final expression.
processed_row = df6['Text_nlp'].iloc[0]
{
    label: entity_extractor(processed_row, entity_type=label)
    for label in ('PERSON', 'ORG')
}

In [None]:
# Output column -> entity label extracted into it (insertion order is the
# order in which the columns are added to df6).
entity_columns = {
    'entity_person': 'PERSON',
    'entity_org': 'ORG',
    'entity_NORP': 'NORP',
    'entity_MONEY': 'MONEY',
    'entity_LAW': 'LAW',
    'entity_PRODUCT': 'PRODUCT',
    'entity_EVENT': 'EVENT',
}

for column_name, label in entity_columns.items():
    df6[column_name] = df6['Text_nlp'].apply(entity_extractor, entity_type=label)

In [None]:
# Independent copy of the rows that belong to the 'Cryptocurrency' query.
is_crypto_query = df6['Query'].eq('Cryptocurrency')
crypto_data = df6.loc[is_crypto_query].copy()

In [None]:
# One row per (article, person) pair.
exploded_df = crypto_data.explode('entity_person')
exploded_df

# Most frequently mentioned people overall.
person_counts = exploded_df['entity_person'].value_counts()
person_counts.head(10)

crypto_data.explode('entity_person')['entity_person'].value_counts()

# The same counts, computed per query.
exploded_data = crypto_data.explode('entity_person')
groupby_query = exploded_data.groupby('Query')
groupby_query

persons_per_query = groupby_query['entity_person'].value_counts()
persons_per_query

# Only this final expression is displayed by the cell.
persons_per_query.groupby('Query').head(10)

In [None]:
def top_entity(data, entity_col, groupby_col, top_n=5):
    """Return the ``top_n`` most frequent entities within each group.

    ``entity_col`` holds list-like values; it is exploded to one entity per
    row, the entities are counted per ``groupby_col`` value, and the first
    ``top_n`` counts of every group are returned as a Series indexed by
    (group, entity).
    """
    counts_per_group = (
        data.explode(entity_col)
        .groupby(groupby_col)[entity_col]
        .value_counts()
    )
    # value_counts already orders each group by descending frequency, so the
    # head of every group is its most frequent entities.
    return counts_per_group.groupby(groupby_col).head(top_n)

# Show the five most frequent entities of each type.
# Every result is printed explicitly: a cell only echoes a trailing
# expression, and this cell ends with an assignment, so bare calls here would
# display nothing.
for entity_col in (
    'entity_person',
    'entity_org',
    'entity_NORP',
    'entity_LAW',
    'entity_PRODUCT',
    'entity_EVENT',
):
    print(top_entity(data=crypto_data, entity_col=entity_col, groupby_col='Query', top_n=5))
    print()

# One row per (article, person) pair; the edge list is built from this frame.
exploded = crypto_data.explode('entity_person')

In [None]:
# Article -> person edges for the graph.
# explode() yields NaN for an article whose person list is empty; rows with a
# missing endpoint are dropped so that NaN never becomes a graph node.
edge_list = (
    exploded[['UUID', 'entity_person']]
    .dropna(subset=['UUID', 'entity_person'])
    .rename(columns={'UUID': 'source', 'entity_person': 'target'})
)
edge_list

In [None]:
# (variant, canonical) pairs: every edge_list target equal to the first item is
# replaced by the second item in the cell that loops over this list.
# Several variants may map to the same canonical name
# (e.g. 'Storonsky' and '"Storonsky').
my_list = [
           ('Trump', 'Donald Trump'), 
           ('Nakamoto', 'Satoshi Nakamoto'), 
           ('Cotten','Gerald Cotton'), 
           ('Gerry Cotton','Gerald Cotton'),
           ('Kneer','Ryan Kneer'),
           ('Zou','Tong Zou'),
           ('Duhaime','Christine Duhaime'), 
           ('Wood','Michael Wood'),
           ('Ernst','Ernst Young'),
           ('Robertson','Jennifer Robertson'),
           ('Joleeeeene','Jolene'),
           ('Finney','Hal Finney'),
           ('Satoshi','Satoshi Nakamoto'), 
           ('Andresen','Gavin Andresen'),
           ('Jobs"', 'Steve Jobs'),
           ('Jobs','Steve Jobs'),
           ('Bitcoins','Bitcoin'),
           ('Tyler','Tyler Winklevoss'),
           ('Dimon','Jamie Dimon'),
           ("Philip Green's",'Philip Green'),
           ('Shrem','Charlie Shrem'),
           ('Gates','Bill Gates'),
           ('Patryn','Michael Patryn'),
           ('Buffet','Warren Buffet'),
           ('Nuttall', 'John Nuttall'),
           ('Yermack','David Yermack'),
           ('Sluymer','Robert Sluymer'),
           ('Nick','Nick Szabo'),
           ('Ozcan','Pinar Ozcan'),
           ('Yatsenko','Vlad Yatsenko'),
           ('"Storonsky','Nikolay Storonsky'),
           ('Wright','Craig Wright'),
           ('Craig Steven Wright', 'Craig Wright'),
           ('Clegg','Nicholas Clegg'),
           ('Sheryl','Sheryl Sandberg'),
           ('Zuckerberg', 'Mark Zuckerberg'),
           ('Young', 'Ernst Young'),
           ('Marcus', 'David Marcus'),
           ('Xi', 'Xi Jinping'),
           ('Livingston', 'Ted Livingston'),
           ('Buterin', 'Vitalik Buterin'),
           ('Buffett','Warren Buffet'),
           ('Piette', 'Marc Piette'),
           ('Storonsky','Nikolay Storonsky'),
           ('Thiel', 'Peter Thiel'),
           ('Hayes', 'Arthur Hayes'),
           ('Kushner', 'Jared Kushner'),
           ('Sandberg','Sheryl Sandberg'),
           ('Aven','Petr Aven')
           ]


In [None]:
# Sanity check: list the position and value of every my_list entry that is
# not a 2-item pair (an empty result means the list is well-formed).
problems = [
    (position, entry)
    for position, entry in enumerate(my_list)
    if len(entry) != 2
]

problems

In [None]:
# Collapse name variants onto their canonical form in a single pass.
# The result is assigned back to the column: calling replace(inplace=True) on
# the Series returned by edge_list["target"] is chained assignment, which does
# not update edge_list when pandas Copy-on-Write is enabled.
# No canonical name in my_list is itself a variant key, so one dict-based
# replace gives the same result as applying the pairs one after another.
edge_list["target"] = edge_list["target"].replace(dict(my_list))

In [None]:
# Spot-check the normalisation by looking at the 'Donald Trump' edges.
is_trump = edge_list['target'].eq('Donald Trump')
check_edge_list = edge_list[is_trump]
check_edge_list.head()



In [None]:
# Target values to remove from edge_list before the graph is built; the list
# is consumed by an isin() filter on edge_list['target'].
# NOTE(review): these look like extracted strings that are not people relevant
# to the topic (place names, clubs, phone fragments) -- confirm with the author.
# 'Maude St' is listed twice; that is harmless for isin().
to_drop_non_relevant_target = [
                                'Goolwa',
                                'Friends of Mitten',
                                'Pt Elliot',
                                'Cit',
                                'Chapman Rd',
                                'Air Youth',
                                'Martha Close',
                                'Mosquito Hill',
                                'Zwergeland',
                                'Maude St',
                                'Coorang dragons',
                                'Free-WiFi',
                                'Mindfulness',
                                'Bon Appetit',
                                '9am-10pm',
                                'Glenda',
                                'Judy',
                                'Footloose',
                                'Goolwa Hotel',
                                'Mens Shed',
                                'Mt Compass',
                                'Chiton Retirement Village',
                                'Myponga playcentre',
                                'Heysen Hikers',
                                "St Augustine's",
                                "Goolwa RSL's",
                                'Myponga Walkers',
                                'Goolwa Croquet Club',
                                'Goolwa Oval',
                                'Goolwa Lions',
                                'Goolwa Nth',
                                'Al-Anon',
                                'Corio Tce',
                                'Middleton',
                                'Riverport Singers',
                                'Goolwa RSL',
                                'Porter',
                                'cnr Wilman',
                                'Maude St',
                                'Victor Mahjong',
                                'Pt Elliot Showgrounds',
                                'Victor Harbor',
                                'Goolwa United Strollers',
                                'Impact Youth',
                                'Phil 0451',
                                'Canasta',
                                'Bacchus Rd',
                                'Coles',
                                'Ph Sharon',
                                'Qigong and Mindfulness Meditation',
                                'Goolwa North',
                                'Noah',
                                'Goolwa Library',
                                'Sabine',
                                'Ratalang',
                                'Zumba Class',
                                'Michelle',
                                'Goolwa Cittaslow',
                                'Carrickalinga House',
                                'Victor Floral Art',
                                'Victor Harbor Coastcare', 
                                'Gail', 
                                'Elliot Patchworkers', 
                                'Rugmaking',
                                'La Serenissima',
                                'Shoreditch',
                                'Edinburgh',
                                'Dunham',
                                'Fringe',
                                'embroils',
                                'Hart',
                                'Hendrick',
                                'Glaswegian',
                                'Suspiria',
                                'Android',
                                'Chen',
                                'Loftwall',
                                'Hagen',
                                'Nigel 0407',
                                'Ping-Pong',
                                '8555',
                                'Fhior',
                                'Coorong Dragons'
                                

                              ]

In [None]:
# Remove every edge whose target is on the non-relevant list.
non_relevant_filter = edge_list['target'].isin(to_drop_non_relevant_target)
edge_list = edge_list.loc[~non_relevant_filter]

In [None]:
# Article-person graph built from the 'source' and 'target' columns.
G = nx.from_pandas_edgelist(edge_list)

# The projection below is only meaningful on a bipartite graph, so fail loudly
# here instead of computing a check whose result is never shown.
assert nx.algorithms.bipartite.is_bipartite(G), "article-person graph is not bipartite"

# Project onto the person side: two people are linked when they share at least
# one article, and the edge weight counts the shared articles. Each person is
# passed once and missing values are excluded, rather than handing over the
# raw column with one entry per edge.
person_nodes = edge_list["target"].dropna().unique().tolist()
newG = nx.algorithms.bipartite.weighted_projected_graph(G, person_nodes)

nx.write_gexf(newG, 'new_crypto_article_person.gexf')

In [None]:
# Keep only the projected edges whose weight is not 1.
filtered_edge_list = [
    (first, second)
    for first, second, weight in newG.edges(data="weight")
    if weight != 1
]

filteredG = newG.edge_subgraph(filtered_edge_list)

# Edge counts after and before filtering (the cell displays only the latter).
filteredG.number_of_edges()

newG.number_of_edges()

In [None]:
# Export the filtered person network in GEXF format.
nx.write_gexf(filteredG, 'updated_filteredG_crypto_article_person.gexf')

In [None]:
# nx.info() was removed in NetworkX 3.0; build the summary from graph methods
# that are available in every version.
print(
    f"Graph with {filteredG.number_of_nodes()} nodes "
    f"and {filteredG.number_of_edges()} edges"
)

In [None]:
# Number of edges in the filtered network.
filteredG.size()

In [None]:
# Sum of the 'weight' attribute over all edges of the filtered network.
filteredG.size(weight="weight")

In [None]:
# Weight of one specific edge, as a spot check.
# NOTE(review): this raises KeyError if either node or the edge between them is absent from filteredG.
filteredG['Donald Trump']['Bitcoin']["weight"]

In [None]:

# Specify figure size
plt.figure(figsize=(30,30))
plt.axis('equal')

# Draw the Crypto network
# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every run of the notebook.
node_pos = nx.spring_layout(filteredG, seed=42)
nx.draw(filteredG,node_pos,with_labels=True)

# Draw edge weights
labels = nx.get_edge_attributes(filteredG,'weight')
nx.draw_networkx_edge_labels(filteredG,node_pos,edge_labels=labels)
plt.axis('off')
plt.show()

In [None]:
# Weighted degree of every node, listed from highest to lowest.
filteredG_weights = filteredG.degree(weight='weight')
sorted(filteredG_weights, key=lambda node_and_weight: node_and_weight[1], reverse=True)

In [None]:
# Histogram of the weighted node degrees, 8 bins, logarithmic y axis.
weights_values = [weighted_degree for _node, weighted_degree in filteredG_weights]
plt.hist(weights_values, bins=8)
plt.yscale('log')

plt.xlabel('Weight')
plt.ylabel('Number of nodes')
plt.title('Cryptocurrency Network')

In [None]:
# Average local clustering coefficient over all nodes in the graph,
# computed with the 'weight' edge attribute.
nx.average_clustering(filteredG, weight="weight")

In [None]:
# Global clustering coefficient (transitivity): three times the number of
# triangles divided by the number of connected triads in the network.
nx.transitivity(filteredG)

In [None]:
 # String-valued variant of the ">= 5 articles" flag, computed for the whole column at once.
df_publications['equal_or_higher_than_5?'] = np.where(
    df_publications['Number of Publications'] >= 5, 'True', 'False'
)

# Shown as the cell's last expression so the frame gets the rich table display.
df_publications