# Network Analysis - High-energy physics theory citation network

In [47]:
import pandas as pd
import networkx as nx

## Load dataset

In [31]:
import requests

url = 'https://raw.githubusercontent.com/imads20/BDS23/main/M2_Final_Assignment/Network_Analysis/physics_theory_citation_network.txt'

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Read the content of the response
    content = response.text

    # Process the content (e.g., split into lines)
    lines = content.split('\n')

    #df = pd.DataFrame(columns=['Node1', 'Node2'])
    # Now you can work with the lines of data
    for line in lines:
        # Process each line as needed
        print(line)
        #temp = pd.DataFrame(line, columns=['Node1', 'Node2'])
        #df = pd.concat([df, temp])
else:
    print(f"Failed to retrieve data from the URL (Status Code: {response.status_code})")

# Directed graph (each unordered pair of nodes is saved once): Cit-HepTh.txt 
# Paper citation network of Arxiv High Energy Physics Theory category
# Nodes: 27770 Edges: 352807
# FromNodeId	ToNodeId
1001	9304045
1001	9308122
1001	9309097
1001	9311042
1001	9401139
1001	9404151
1001	9407087
1001	9408099
1001	9501030
1001	9503124
1001	9504090
1001	9504145
1001	9505025
1001	9505054
1001	9505105
1001	9505162
1001	9506048
1001	9506112
1001	9506144
1001	9507050
1001	9507158
1001	9508094
1001	9508155
1001	9510142
1001	9510225
1001	9510234
1001	9511030
1001	9511171
1001	9601108
1001	9602022
1001	9602114
1001	9603003
1001	9603150
1001	9603161
1001	9603167
1001	9605184
1001	9605222
1001	9606017
1001	9606040
1001	9607163
1001	9607207
1001	9608086
1001	9609070
1001	9609071
1001	9609239
1001	9611137
1001	9612108
1001	9701162
1001	9702094
1001	9702155
1001	9702198
1001	9703082
1001	9703166
1001	9704097
1001	9705030
1001	9705044
1001	9705104
1001	9705220
1001	9706005
1001	9707014
1001	9707042
1001	970

In [54]:
# Turn the lines into a df
edges = pd.DataFrame(lines[4:])
edges.head()

Unnamed: 0,0
0,1001\t9304045\r
1,1001\t9308122\r
2,1001\t9309097\r
3,1001\t9311042\r
4,1001\t9401139\r


In [55]:
# Split each row by '\t' and remove '\r'
edges[['FromNode', 'ToNode']] = edges[0].str.split('\t', expand=True)
edges['ToNode'] = edges['ToNode'].str.replace('\r', '')
edges = edges.drop(0, axis=1)
edges.head(5)

Unnamed: 0,FromNode,ToNode
0,1001,9304045
1,1001,9308122
2,1001,9309097
3,1001,9311042
4,1001,9401139


In [59]:
# Collect all unique nodes from the edgelist
nodes = set(edges.FromNode) | set(edges.ToNode)
len(nodes)

27772

In [64]:
# Check for missing values
edges.isnull().sum()

FromNode    0
ToNode      1
dtype: int64

In [65]:
# Drop missing values
edges = edges.dropna()

In [84]:
temp = pd.DataFrame(edges['ToNode'].value_counts()).reset_index()

In [85]:
edges = pd.merge(edges, temp, on='ToNode', how='left')

In [86]:
edges

Unnamed: 0,FromNode,ToNode,Count,count
0,1001,9304045,,16
1,1001,9308122,,115
2,1001,9309097,,140
3,1001,9311042,,55
4,1001,9401139,,421
...,...,...,...,...
352802,9912286,9805150,,21
352803,9912286,9806074,,155
352804,9912286,9808140,,41
352805,9912286,9810068,,35


In [87]:
temp = edges[edges['count'] > 100]
temp

Unnamed: 0,FromNode,ToNode,Count,count
1,1001,9308122,,115
2,1001,9309097,,140
4,1001,9401139,,421
6,1001,9407087,,1299
7,1001,9408099,,1006
...,...,...,...,...
352792,9912260,9510017,,1155
352793,9912260,9610043,,1199
352794,9912260,9611050,,701
352795,9912260,9704080,,301


In [73]:
list = edges['ToNode'].value_counts() > 100

In [74]:
list

ToNode
9711200     True
9802150     True
9802109     True
9407087     True
9610043     True
           ...  
9610200    False
3293       False
5195       False
9512067    False
9508161    False
Name: count, Length: 23180, dtype: bool

In [91]:
temp.sort_values('ToNode')

Unnamed: 0,FromNode,ToNode,Count,count
140057,10108,10060,,110
182029,105012,10060,,110
266072,207180,10060,,110
247349,204163,10060,,110
310790,303233,10060,,110
...,...,...,...,...
157342,12210,9912249,,201
167583,102112,9912249,,201
127122,8101,9912249,,201
189199,106010,9912249,,201
