In [16]:
import random
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

In [2]:
df_authors = pd.read_csv('/mnt/hdd01/Cleantech Network Analysis/df_authors.csv', dtype={'author_id': str, 'oaid': str, 'appln_id': str})
df_authors = df_authors[['appln_id', 'oaid', 'author_id']]
df_authors['author_id'] = df_authors['author_id'].replace("https://openalex.org/", "", regex=True)
df_authors['oaid'] = df_authors['oaid'].apply(lambda x: 'W' + str(x))

df_rel_pcs = pd.read_csv('/mnt/hdd01/Cleantech Network Analysis/df_rel_pcs_patstat.csv', dtype={'appln_id': str, 'oaid': str})
df_rel_pcs['oaid'] = df_rel_pcs['oaid'].apply(lambda x: 'W' + str(x))
df_rel_pcs['appln_id'] = df_rel_pcs['appln_id'].apply(lambda x: 'P' + str(x))

df_oaid_cleantech_dtm = pd.read_csv('/mnt/hdd01/Cleantech Network Analysis/df_oaid_cleantech_dtm.csv')
df_oaid_cleantech_dtm['oaid'] = df_oaid_cleantech_dtm['oaid'].apply(lambda x: 'W' + str(x))

df_patstat_cleantech_dtm = pd.read_csv('/mnt/hdd01/Cleantech Network Analysis/df_patstat_cleantech_dtm.csv')
df_patstat_cleantech_dtm['appln_id'] = df_patstat_cleantech_dtm['appln_id'].apply(lambda x: 'P' + str(x))

df_cleantech_keyword = pd.read_json("/home/thiesen/Documents/Cleantech_Concepts/cleantech_keywords_similarity_015_co_occurrence_025_claim_fulltext.json")

  df_authors = pd.read_csv('/mnt/hdd01/Cleantech Network Analysis/df_authors.csv', dtype={'author_id': str, 'oaid': str, 'appln_id': str})


In [3]:
chunk_size = 50000  # adjust this value based on your system's memory
chunks = [x for x in range(0, df_patstat_cleantech_dtm.shape[0], chunk_size)]

df_patstat_cleantech_edge_index = pd.DataFrame()

for i in tqdm(range(len(chunks) - 1)):
    chunk_start = chunks[i]
    chunk_end = chunks[i + 1]
    temp_df = df_patstat_cleantech_dtm.iloc[chunk_start:chunk_end]
    temp_df_long = temp_df.melt(id_vars='appln_id', var_name='keyword_yake_lemma', value_name='occurrence')
    temp_df_edges = temp_df_long[temp_df_long['occurrence'] == 1]
    temp_df_edges = temp_df_edges.drop(columns='occurrence')
    df_patstat_cleantech_edge_index = pd.concat([df_patstat_cleantech_edge_index, temp_df_edges])

# handle the last chunk
temp_df = df_patstat_cleantech_dtm.iloc[chunks[-1]:]
temp_df_long = temp_df.melt(id_vars='appln_id', var_name='keyword_yake_lemma', value_name='occurrence')
temp_df_edges = temp_df_long[temp_df_long['occurrence'] == 1]
temp_df_edges = temp_df_edges.drop(columns='occurrence')
df_patstat_cleantech_edge_index = pd.concat([df_patstat_cleantech_edge_index, temp_df_edges])

100%|██████████| 13/13 [01:24<00:00,  6.53s/it]


In [14]:
chunk_size = 50000  # adjust this value based on your system's memory
chunks = [x for x in range(0, df_oaid_cleantech_dtm.shape[0], chunk_size)]

df_oaid_cleantech_edge_index = pd.DataFrame()

for i in tqdm(range(len(chunks) - 1)):
    chunk_start = chunks[i]
    chunk_end = chunks[i + 1]
    temp_df = df_oaid_cleantech_dtm.iloc[chunk_start:chunk_end]
    temp_df_long = temp_df.melt(id_vars='oaid', var_name='keyword_yake_lemma', value_name='occurrence')
    temp_df_edges = temp_df_long[temp_df_long['occurrence'] == 1]
    temp_df_edges = temp_df_edges.drop(columns='occurrence')
    df_oaid_cleantech_edge_index = pd.concat([df_oaid_cleantech_edge_index, temp_df_edges])

# handle the last chunk
temp_df = df_oaid_cleantech_dtm.iloc[chunks[-1]:]
temp_df_long = temp_df.melt(id_vars='oaid', var_name='keyword_yake_lemma', value_name='occurrence')
temp_df_edges = temp_df_long[temp_df_long['occurrence'] == 1]
temp_df_edges = temp_df_edges.drop(columns='occurrence')
df_oaid_cleantech_edge_index = pd.concat([df_oaid_cleantech_edge_index, temp_df_edges])

100%|██████████| 12/12 [01:21<00:00,  6.82s/it]


# Build Graph Model

In [12]:
G = nx.Graph()

In [13]:
for node in df_authors['author_id']:
    G.add_node(node, node_type='author')

for node in df_patstat_cleantech_dtm['appln_id']:
    G.add_node(node, node_type='patent')

for node in df_oaid_cleantech_dtm['oaid']:
    G.add_node(node, node_type='paper')

for node in df_cleantech_keyword['keyword_yake_lemma']:
    G.add_node(node, node_type='keyword')

In [15]:
G.add_edges_from(df_rel_pcs[['appln_id', 'oaid']].values)
G.add_edges_from(df_authors[['appln_id', 'author_id']].values)
G.add_edges_from(df_authors[['oaid', 'author_id']].values)
G.add_edges_from(df_oaid_cleantech_edge_index[['oaid', 'keyword_yake_lemma']].values)
G.add_edges_from(df_patstat_cleantech_edge_index[['appln_id', 'keyword_yake_lemma']].values)

# Visualize Graph

In [17]:
sample_size = int(0.1 * len(G.nodes()))  # Adjust sample size as needed, here 10%
sampled_nodes = random.sample(G.nodes(), sample_size)
subgraph = G.subgraph(sampled_nodes)

since Python 3.9 and will be removed in a subsequent version.
  sampled_nodes = random.sample(G.nodes(), sample_size)


In [None]:
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(subgraph, scale=2)  # Layout for the subgraph
nx.draw(subgraph, pos, node_size=50, with_labels=False, font_weight='bold', node_color='skyblue')
plt.title('Sampled Subgraph')
plt.show()

# Analyze Centrality Measures

In [19]:
degree_centrality = nx.degree_centrality(G)

In [33]:
# Get the list of keys to delete
keys_to_delete = df_authors['author_id'].tolist() + df_oaid_cleantech_dtm['oaid'].tolist() + df_patstat_cleantech_dtm['appln_id'].tolist()

# Delete the keys from the degree_centrality dictionary
for key in keys_to_delete:
    if key in degree_centrality:
        del degree_centrality[key]

In [36]:
# Print the top 10 nodes with highest degree centrality
print("Top 10 nodes with highest degree centrality:")
for node in sorted(degree_centrality, key=degree_centrality.get, reverse=True)[:100]:
    print(f"Node: {node}, Degree Centrality: {degree_centrality[node]}")

Top 10 nodes with highest degree centrality:
Node: wit, Degree Centrality: 0.07498770271995357
Node: ting, Degree Centrality: 0.0702554828005819
Node: tha, Degree Centrality: 0.06995333351051293
Node: ich, Degree Centrality: 0.06403059746264307
Node: era, Degree Centrality: 0.060253622414902605
Node: ons, Degree Centrality: 0.05992881737417236
Node: ally, Degree Centrality: 0.05334492552575502
Node: ist, Degree Centrality: 0.0499805465525572
Node: low, Degree Centrality: 0.048390940662410585
Node: lea, Degree Centrality: 0.04541715554436323
Node: method, Degree Centrality: 0.045203015131863006
Node: rate, Degree Centrality: 0.04388680315594607
Node: present, Degree Centrality: 0.04204166653959114
Node: high, Degree Centrality: 0.04051479965127572
Node: tween, Degree Centrality: 0.04040326364802026
Node: part, Degree Centrality: 0.03532859334365332
Node: tal, Degree Centrality: 0.034864368298853754
Node: pres, Degree Centrality: 0.03399081483585691
Node: ratio, Degree Centrality: 0.0327

In [26]:
betweenness_centrality = nx.betweenness_centrality(G, k=10)

In [None]:
eigenvector_centrality = nx.eigenvector_centrality(G)

In [29]:
betweenness_centrality['wind energy']

3.976164601921421e-08