Query SDG: https://elsevier.digitalcommonsdata.com/datasets/y2zyy9vwzy/1

In [None]:
import pandas as pd

# Path of the raw CSV export to load
csv_file_path = '/content/SDG_9_ufrn.csv'

# Separate output path: writing back to csv_file_path would overwrite the raw
# export, and the confirmation message below already names the "_clean" file.
clean_csv_file_path = '/content/SDG_9_ufrn_clean.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path, sep=',', engine='python')

# Show the first rows of the DataFrame to check the result
print(df.head())

# Save a copy of the loaded data to a new CSV file (optional)
df.to_csv(clean_csv_file_path, index=False)
print("Dados salvos no arquivo 'SDG_9_ufrn_clean.csv'.")

In [None]:
# Summarize the DataFrame: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429 entries, 0 to 428
Data columns (total 46 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Authors                        429 non-null    object 
 1   Author full names              429 non-null    object 
 2   Author(s) ID                   429 non-null    object 
 3   Title                          429 non-null    object 
 4   Year                           429 non-null    int64  
 5   Source title                   429 non-null    object 
 6   Volume                         332 non-null    object 
 7   Issue                          217 non-null    object 
 8   Art. No.                       148 non-null    object 
 9   Page start                     281 non-null    object 
 10  Page end                       277 non-null    object 
 11  Page count                     281 non-null    float64
 12  Cited by                       429 non-null    int

In [None]:
# Libraries used to build the co-authorship graph
import pandas as pd
import networkx as nx

# Start from an undirected graph with no nodes or edges
G = nx.Graph()

# Every DataFrame row contributes its authors as nodes and each pair of them as an edge
for _, record in df.iterrows():
    # Both columns are split on '; ' into parallel lists
    ids = record['Author(s) ID'].split('; ')
    full_names = record['Author full names'].split('; ')

    # Register each author ID as a node, storing the matching full name on it
    for node_id, full_name in zip(ids, full_names):
        G.add_node(node_id, name=full_name)

    # Link each author to every author listed after it in the same row
    for position, first_id in enumerate(ids):
        for second_id in ids[position + 1:]:
            G.add_edge(first_id, second_id)

# Report the size of the resulting graph
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 1624
Number of edges: 5774


In [None]:
# Display every node together with its attribute dict (the 'name' set when the graph was built).
# NOTE(review): this lists all nodes of the graph, so the output can be long.
G.nodes(data=True)

In [None]:
import plotly.express as px

# Density of the co-authorship graph
density = nx.density(G)
print(f"Density: {density}")

# Mean neighbor count: add up the neighbors of every node, then divide by the node count
neighbor_total = 0
for node in G:
    neighbor_total += len(list(nx.neighbors(G, node)))
average_neighbors = neighbor_total / len(G)
print(f"Average number of neighbors: {average_neighbors}")

# Map each node to its degree
degree_distribution = {node: degree for node, degree in nx.degree(G)}

# Histogram of the degree values
degree_values = list(degree_distribution.values())
fig = px.histogram(degree_values, nbins=100)
fig.update_layout(
    title="Degree Distribution of the Co-authorship Graph",
    xaxis_title="Degree",
    yaxis_title="Frequency",
)
fig.show()


Density: 0.0043812923218876434
Average number of neighbors: 7.110837438423645


In [None]:
# Select the node whose degree is the largest in the graph
highest_degree_node = max(G, key=G.degree)
top_degree = G.degree(highest_degree_node)

# Report the selected node
print(f"Node with the highest degree: {highest_degree_node}")

# Report that node's degree
print(f"Number of neighbors: {top_degree}")


Node with the highest degree: 6507655813
Number of neighbors: 93


In [None]:
# Look up the attributes stored on node '6507655813', the ID printed by the
# previous cell as the node with the highest degree.
G.nodes['6507655813']

{'name': 'Martínez-Huitle, Carlos A. (6507655813)'}