# Network analysis based on topic extraction
In this part we are performing a network analysis.

## Imports

In [3]:
#basic packages
import numpy as np
import pandas as pd 
import datetime
import math
import ast #for transforming dataframes
from time import time
from tqdm import tqdm
from bertopic import BERTopic

#Network analysis 
import seaborn as sns
from networkx.algorithms import bipartite
from networkx.drawing.layout import bipartite_layout
import networkx as nx


## Network Analysis
We are going to explore co-authorship network in authors that wrote about AI. 
Our goal is to get the eigenvector centrality to establish our well connected author in order to set up the recommender system. 

In [24]:
#Read the docs that had the topic modelling done with the BERTopic
docs = pd.read_csv('/Users/yasminesarraj/Documents/GitHub/M3-Assignment-Deep-Learning/Assignment_4/data/full_texts.csv')

In [25]:
docs

Unnamed: 0.1,Unnamed: 0,text,summary
0,0,Attention Is All You Need\nAshish Vaswani\nG...,"This paper proposes the Transformer, a model ..."
1,1,On the Beneﬁts of Biophysical Synapses\nJulia...,This paper examines the use of biophysical sy...
2,2,MOREA: a GPU-accelerated Evolutionary Algorit...,"This paper presents MOREA, a GPU-accelerated ..."
3,3,What Performance Indicators to Use for Self-A...,This paper investigates the use of self-adapt...
4,4,Using Affine Combinations of BBOB Problems fo...,This paper examines the use of affine combina...
5,5,arXiv:2303.04347v1 [cs.NE] 8 Mar 2023Publis...,This paper presents a method for converting a...
6,6,Evolutionary Reinforcement Learning: A Survey...,This article provides an overview of Evolutio...
7,7,RADAM: T EXTURE RECOGNITION THROUGH RANDOMIZE...,This paper proposes a new method called Rando...
8,8,A Computer Vision Enabled damage detection mo...,This paper presents a novel deep learning-bas...
9,9,Patch of Invisibility:\nNaturalistic Black-Bo...,This paper proposes a gradient-free method to...


In [26]:
docs.columns

Index(['Unnamed: 0', 'text', 'summary'], dtype='object')

In [27]:
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  47 non-null     int64 
 1   text        47 non-null     object
 2   summary     47 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.2+ KB


In [8]:
def clean_up(data):
    """
    Compact function to clean up the data

    """
    for i in tqdm(range(len(docs)), desc = 'Clean up'):
        df = data.copy()
        
        #Delete missings
        df_clean = df.dropna(subset=['Author Keywords'])
                
        #Format the dtypes of year column + extract the year
        df_clean = df_clean.rename(columns={"Year": "date"}) #rename year column
        df_clean["date"] = pd.to_datetime(df_clean["date"]) #format to datetime
    return df_clean

In [9]:
#Execute function
data_clean = clean_up(docs)

data_clean.head()

Clean up: 100%|██████████| 2000/2000 [00:03<00:00, 620.33it/s]


Unnamed: 0,Authors,Author(s) ID,Title,date,Source title,Volume,Issue,Art. No.,Page start,Page end,...,ISBN,CODEN,PubMed ID,Language of Original Document,Abbreviated Source Title,Document Type,Publication Stage,Open Access,Source,EID
6,"Krishnan P., Jain K., Aldweesh A., Prabu P., B...",57194605863;57215744374;57200513576;5720079815...,OpenStackDP: a scalable network security frame...,1970-01-01 00:00:00.000002023,Journal of Cloud Computing,12,1,26,,,...,,,,English,J. Cloud Comput.,Article,Final,"All Open Access, Gold",Scopus,2-s2.0-85149016620
10,"Cai J., Xu Z., Sun X., Guo X., Fu X.",57715240900;58115802100;57211301540;5811503830...,Validity and reliability of the Chinese versio...,1970-01-01 00:00:00.000002023,Psicologia: Reflexao e Critica,36,1,5,,,...,,,,English,Psicol. Reflexao Crit.,Article,Final,"All Open Access, Gold, Green",Scopus,2-s2.0-85148711886
11,"Tlili A., Shehata B., Adarkwah M.A., Bozkurt A...",57188567626;57782639700;57219025710;5656618160...,What if the devil is my guardian angel: ChatGP...,1970-01-01 00:00:00.000002023,Smart Learning Environments,10,1,15,,,...,,,,English,Smart Learn. Environ.,Article,Final,"All Open Access, Gold",Scopus,2-s2.0-85148704172
12,"Cheikh Youssef S., Haram K., Noël J., Patel V....",57891446400;58113891400;57212704902;8564080000...,Evolution of the digital operating room: the p...,1970-01-01 00:00:00.000002023,Langenbeck's Archives of Surgery,408,1,95,,,...,,LASUF,36807211.0,English,Langenbeck's Arch. Surg.,Review,Final,,Scopus,2-s2.0-85148679306
19,"Wang J., Dou J., Han J., Li G., Tao J.",57192107513;58108655300;58108630800;5595179220...,A population-based study to assess two convolu...,1970-01-01 00:00:00.000002023,BMC Oral Health,23,1,109,,,...,,,36803132.0,English,BMC Oral Health,Article,Final,"All Open Access, Gold",Scopus,2-s2.0-85148394539


We start by creating different edges that we will use as both edges and or as nodes attributes
- Author_paper: each author and the paper that they have wrote this will be the base of our bipartite graph 

It's important to note that we are using ID's instead of names as they are more Institution ID's, this might be due to the face that some institution have the same name.

In [10]:
# Multiple columns string conversion
data_clean['Abstract'] = data_clean['Abstract'].astype(str)
data_clean['Title'] = data_clean['Title'].astype(str)
print(data_clean.dtypes)

Authors                                  object
Author(s) ID                             object
Title                                    object
date                             datetime64[ns]
Source title                             object
Volume                                   object
Issue                                    object
Art. No.                                 object
Page start                               object
Page end                                 object
Page count                              float64
Cited by                                float64
DOI                                      object
Link                                     object
Affiliations                             object
Authors with affiliations                object
Abstract                                 object
Author Keywords                          object
Index Keywords                           object
Correspondence Address                   object
Editors                                 

In [11]:
author_paper= []

for i in data_clean.iterrows(): #iterate through each row
  targets_x = [i.strip() for i in i[1]['Author(s) ID'].split(';')]
  edges_x = [(i[1]['Title'], j) for j in targets_x] #connect to the paper
  
    #append each of the found edges to the list
  author_paper.extend(edges_x)

In [12]:
paper_abstract= []

for i in data_clean.iterrows(): #iterate through each row
  targets_y = [i.strip() for i in i[1]['Title']]
  edges_y = [[i[1]['Abstract'], j] for j in targets_x] #connect to the paper
  
    #append each of the found edges to the list
  paper_abstract.extend(edges_y)

In [13]:
data_clean['Abstract'] = data_clean['Abstract'].astype(str)
test = data_clean['Abstract']

In [28]:
import requests

API_URL = "https://api-inference.huggingface.co/models/fabiochiu/t5-base-tag-generation"
headers = {"Authorization": "Bearer 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"}


In [29]:
def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()


In [38]:
output = query({
	"inputs": docs.iloc[4,1],
})

In [39]:
output

[{'generated_text': 'Optimization, Software, Digital, Software Development, Programming, Coding, Code, Software Engineering'}]

In [42]:
model = BERTopic(language="english")
topics, probs = model.fit_transform(docs['text'])

In [43]:
model.get_topic_freq().head(5)

Unnamed: 0,Topic,Count
0,-1,20
1,0,14
2,1,13


In [44]:
model.get_topic(0)[:10]

[('the', 0.10321597836088299),
 ('of', 0.08382433717510913),
 ('and', 0.07348330546638471),
 ('to', 0.06685250935462915),
 ('in', 0.05935176629978951),
 ('is', 0.04007197364810682),
 ('neural', 0.03580892956103275),
 ('for', 0.034663546338450264),
 ('as', 0.02916828654800408),
 ('we', 0.029046300893989054)]

In [45]:
model.visualize_topics()


ValueError: zero-size array to reduction operation maximum which has no identity

In [40]:
paper_abstract= []

for i in data_clean.iterrows(): #iterate through each row
  targets_y = [i.strip() for i in i[1]['Title']]
  edges_y = [[i[1]['Abstract'], j] for j in targets_x] #connect to the paper
  
    #append each of the found edges to the list
  paper_abstract.extend(edges_y)

# **Create Topics**
We select the "english" as the main language for our documents. 