# Membuat model jaringan (graph)
pada tahapan ini dilakukan pembuatan jaringan dengan menggunakan attribut `username` dan `mention`.
setiap baris data akan memiliki `username` namun pada `mention` bisa terdapat lebih dari 1 username.
example:
```json
{
    "username": "kurniawan",
    "mention": ["niwa", "ronaldi", "purnama"]
}
```
dari contoh data tersebut maka terdapat 3 `edges` yang dapat dihasilkan
```json
{
    "edges": [{
        "source": "kurniawan",
        "target": "niwa"
    },{
        "source": "kurniawan",
        "target": "ronaldi"
    },{
        "source": "kurniawan",
        "target": "purnama
    }]
}
```

In [1]:
# import library yang digunakan
import pandas as pd
import networkx as nx
import numpy as np
import json

# untuk memperbaiki array yang dianggap sebagai str
from ast import literal_eval

#### Membaca data dari proses sebelumnya
pada tahapan ini hanya mengambil attribut `username` dan `mention`

In [2]:
data = pd.read_csv("../out/01-preprocessing.tsv", sep="\t")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5428 entries, 0 to 5427
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   raw_content         5428 non-null   object
 1   username            5428 non-null   object
 2   tweet_published_at  5428 non-null   object
 3   type                5428 non-null   object
 4   cleaned_content     5428 non-null   object
 5   mention             5428 non-null   object
dtypes: object(6)
memory usage: 254.6+ KB


## Membuat Graph

In [3]:
G = nx.DiGraph()

In [4]:
for x in data[['username', 'mention']][:1000].iloc:
    source = x.username
    for target in literal_eval(x.mention):
        if G.has_edge(source, target):
            G[source][target]['weight'] += 1
        else:
            G.add_edge(source, target, weight=1)

In [5]:
# karena hasil dari edges disini merupakan username 
# perlu dirubah menjadi id (numerik) alasanya supaya dapat divisualisasikan menggunakan d3js
G.edges

OutEdgeView([('LSuroyaa', 'woootamelon'), ('COMRADE_SOLAR', 'jn_jiu'), ('COMRADE_SOLAR', 'mistamosby'), ('jn_jiu', 'COMRADE_SOLAR'), ('jn_jiu', 'mistamosby'), ('bekstreettt', 'coldplay'), ('coldplay', 'rrzzaa__'), ('coldplay', 'IDWantsColdplay'), ('coldplay', 'thepauldugdale'), ('SeleccionINA', 'coldplay'), ('finyjr', 'IDWantsColdplay'), ('finyjr', 'coldplay'), ('IDWantsColdplay', 'coldplay'), ('IDWantsColdplay', 'CGV_ID'), ('IDWantsColdplay', 'cinepolisID'), ('IDWantsColdplay', 'alienradiofm'), ('Mommyshark1717', '_titaamelia'), ('buddykuofficial', 'jakartakonser'), ('leisbedumb', 'hrdbacot'), ('nciNINGIE', 'COLDEpikmi'), ('nciNINGIE', 'THVpikmi'), ('syarif_acil', 'bankbsi_id'), ('denanfalah', 'windyaqma'), ('pemudalincah', 'pinchipowww'), ('paparazilicous', 'IDWantsColdplay'), ('paparazilicous', 'coldplay'), ('arzenaers', 'ronyjrk'), ('arzenaers', 'Frimawan'), ('anggriawwan', 'astydewi'), ('anggriawwan', 'roshiro_'), ('rrex_xona', 'buahnanaaas'), ('msyanandaa', 'gisellmaf'), ('Dhikal

In [6]:
edgelist = nx.to_pandas_edgelist(G).sort_values(by="weight", ascending=False, ignore_index=True)
# temporary list
sources = list(set(edgelist.source))
targets = list(set(edgelist.target))
nodes = list(set(sources + targets))
# merubah nama to id sebagai index utama
nodelist = pd.DataFrame(nodes).reset_index()
nodelist.columns = ["id", "name"]
nodelist["id"] = np.arange(1, len(nodelist) + 1)

# mengambil id berdasarkan nama atau sebaliknya
node_dict = {k: v for k, v in zip(nodelist.name, nodelist.id)}
name_dict = {k: v for k, v in zip(nodelist.id, nodelist.name)}

# temporary variable
nodes = node_dict.keys()
node_ids = node_dict.values()
sources = [node_dict.get(s) for s in edgelist.source]
targets = [node_dict.get(s) for s in edgelist.target]
weights = [w for w in edgelist.weight]

# temporary list
edge_tuples = []
for s, t, w in zip(sources, targets, weights):
    edge_tuples.append((s, t, w))

In [7]:
G = nx.Graph()
G.add_nodes_from(list(node_ids))
G.add_weighted_edges_from(edge_tuples)
nx.set_node_attributes(G, name="username", values=name_dict)

## Social Network Analysis

In [8]:
degree_c = nx.degree_centrality(G)
betweenness_c = nx.betweenness_centrality(G)
closeness_c = nx.closeness_centrality(G)
eigenvector_c = nx.eigenvector_centrality_numpy(G)

In [9]:
nx.set_node_attributes(G, name="degree_c", values=degree_c)
nx.set_node_attributes(G, name="betweenness_c", values=betweenness_c)
nx.set_node_attributes(G, name="closeness_c", values=closeness_c)
nx.set_node_attributes(G, name="eigenvector_c", values=eigenvector_c)

### Mengkategorikan nodes
mengelompokan nodes berdasarkan nilai centrality dengan cara membulatkan nilai centrality untuk mendapatkan nilai yang setara

In [10]:
df_degree_c = pd.DataFrame(degree_c.items(), columns=['username', 'values'])
df_degree_c.apply(lambda x: round(x, 2)).groupby(by='values').count()

Unnamed: 0_level_0,username
values,Unnamed: 1_level_1
0.0,1550
0.01,16
0.02,2
0.03,3
0.09,1


In [11]:
df_betweenness_c = pd.DataFrame(betweenness_c.items(), columns=['username', 'values'])
df_betweenness_c.apply(lambda x: round(x, 6)).groupby(by='values').count()

Unnamed: 0_level_0,username
values,Unnamed: 1_level_1
0.000000,1308
0.000001,58
0.000002,23
0.000003,1
0.000004,2
...,...
0.025980,1
0.026979,1
0.027039,1
0.028520,1


In [12]:
df_closeness_c = pd.DataFrame(closeness_c.items(), columns=['username', 'values'])
df_closeness_c.apply(lambda x: round(x, 2)).groupby(by='values').count()

Unnamed: 0_level_0,username
values,Unnamed: 1_level_1
0.0,1008
0.01,62
0.02,54
0.03,35
0.04,63
0.05,112
0.06,89
0.07,142
0.08,6
0.1,1


In [13]:
df_eigenvector_c = pd.DataFrame(eigenvector_c.items(), columns=['username', 'values'])
df_eigenvector_c.apply(lambda x: round(x, 2)).groupby(by='values').count()

Unnamed: 0_level_0,username
values,Unnamed: 1_level_1
0.0,1397
0.01,17
0.02,5
0.04,2
0.05,96
0.06,14
0.07,23
0.08,15
0.1,1
0.28,1


setelah mendapatkan kategori nodes berdasarkan tiap nilai centrality selanjutnya memasukannya ke dalam attribut nodes

In [14]:
for i, (_, dfg) in enumerate(df_degree_c.apply(lambda x: round(x, 2)).groupby(by='values')):
    result = {username: i+1 for username in dfg.username}
    nx.set_node_attributes(G, name="degree_c_category", values=result)

In [15]:
for i, (_, dfg) in enumerate(df_betweenness_c.apply(lambda x: round(x, 6)).groupby(by='values')):
    result = {username: i+1 for username in dfg.username}
    nx.set_node_attributes(G, name="betweenness_c_category", values=result)

In [16]:
for i, (_, dfg) in enumerate(df_closeness_c.apply(lambda x: round(x, 2)).groupby(by='values')):
    result = {username: i+1 for username in dfg.username}
    nx.set_node_attributes(G, name="closeness_c_category", values=result)

In [17]:
for i, (_, dfg) in enumerate(df_eigenvector_c.apply(lambda x: round(x, 2)).groupby(by='values')):
    result = {username: i+1 for username in dfg.username}
    nx.set_node_attributes(G, name="eigenvector_c_category", values=result)

mengexport graph ke dalam bentuk `.gexf` dan `.json`

In [18]:
nx.write_gexf(G, "../out/02-network.gexf")

In [19]:
with open("../out/02-network.json", "w") as f:
    json.dump(nx.node_link_data(G), f)