1. Load Dataset

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from konlpy.tag import Okt

In [78]:
# Blog posts: the first CSV column becomes the index and only the "Content"
# column is kept, so `blog` is a Series of raw post texts.
blog = (
    pd.read_csv("naver_blog.csv", index_col=0)
    .loc[:, "Content"]
)

# Stopwords: the "Stopwords" column as a plain array, used for membership
# checks during preprocessing.
stopwords = (
    pd.read_csv("stopwords.csv")
    .loc[:, "Stopwords"]
    .values
)

2. Preprocess strings

In [79]:
okt = Okt()

# Set membership is O(1); testing against the raw stopword array scanned the
# whole array once per token.
stopword_set = set(stopwords)
keep_tags = {"Noun", "Adjective"}


def _clean_post(text):
    """Reduce one post to its informative words.

    The text is POS-tagged with Okt and only tokens that are tagged Noun or
    Adjective, are not stopwords, and are longer than one character are kept.

    Returns the kept words joined by single spaces (no trailing space).
    """
    kept_words = [
        word
        for word, tag in okt.pos(text)
        if tag in keep_tags and word not in stopword_set and len(word) > 1
    ]
    return " ".join(kept_words)


# Series.apply walks the values positionally, so this works for any index;
# `blog[i]` only worked when the CSV index happened to be exactly 0..n-1.
blog = blog.apply(_clean_post)

3. Extract keywords

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# TF-IDF matrix over the cleaned posts, limited to the 1000 most frequent terms.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(blog)

# Three-topic LDA; random_state pins the result so reruns give the same topics.
lda_model = LatentDirichletAllocation(n_components=3, learning_method='online', random_state=777, max_iter=1)
lda_top = lda_model.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old name only on versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

n_top_terms = 5  # how many of the highest-weighted terms to take per topic
topics = []
for topic in lda_model.components_:
    # argsort is ascending, so this slice walks the last n_top_terms positions
    # backwards: the highest-weighted terms first.
    for i in topic.argsort()[:-n_top_terms - 1:-1]:
        topics.append(str(terms[i]))

# De-duplicate while keeping first-seen order (pd.unique on a plain list is
# deprecated in recent pandas).
topics = list(dict.fromkeys(topics))

4. Find related words and define link dataframe

In [82]:
def find_weight(df, max_links=120, start=2):
    """Find the lowest weight threshold that leaves a manageable number of rows.

    Starting at ``start``, the threshold is raised by 1 until at most
    ``max_links`` rows of ``df`` have a ``"weight"`` strictly greater than it.

    Parameters
    ----------
    df : DataFrame with a numeric ``"weight"`` column.
    max_links : maximum number of rows allowed above the threshold
        (default 120, the value that used to be hard-coded).
    start : first threshold to try (default 2, as before).

    Returns
    -------
    int
        The threshold; callers keep the rows with ``weight > threshold``.
    """
    weights = df["weight"]  # hoisted: the column does not change inside the loop
    indi = start
    while (weights > indi).sum() > max_links:
        indi += 1
    return indi

In [83]:
WINDOW = 3         # tokens on each side of a topic word that count as "related"
MAX_WEIGHT = 1000  # pairs seen this many times or more are dropped

# Tokenise every post. The isinstance guard keeps the cell re-runnable: on a
# second run `blog` already holds token lists, which have no .split().
blog = blog.apply(lambda x: x.split() if isinstance(x, str) else x)

topic_set = set(topics)
# Relative positions -WINDOW..WINDOW without 0 (a word is not its own neighbour).
offsets = [off for off in range(-WINDOW, WINDOW + 1) if off != 0]

# Collect [neighbour, topic word] pairs. Iterating the Series values is
# positional, so it does not depend on the index labels being 0..n-1.
big_txt = []
for tokens in blog:
    centres = [pos_ for pos_, tok in enumerate(tokens) if tok in topic_set]
    # Offset-major order (all centres for -3, then -2, ...) matches the order
    # the pairs were generated in before; row order decides which duplicate
    # survives drop_duplicates and therefore the order of `links`.
    for off in offsets:
        for centre in centres:
            neighbour = centre + off
            # Skip neighbours that fall off either end of the post.
            if 0 <= neighbour < len(tokens):
                big_txt.append([tokens[neighbour], tokens[centre]])

df = pd.DataFrame(big_txt, columns=["from", "to"])
# "xx" is the pair key: two rows are the same link iff both words match.
df["xx"] = df["from"] + " " + df["to"]

# weight = number of times each pair occurred.
counter = df.groupby("xx").size().reset_index(name="weight")
links = pd.merge(df, counter, on="xx").drop_duplicates()

# Keep only pairs above the adaptive threshold and below the upper cut-off.
indi = find_weight(links)
links = links[(links["weight"] > indi) & (links["weight"] < MAX_WEIGHT)]
links = links.reset_index(drop=True)

5. Define node dataframe

In [84]:
# Every word that appears at either end of a link, in first-seen order
# (row by row, "from" before "to").
node_labels = pd.unique(links[["from", "to"]].values.flatten())

# Default size for words that are never a link target.
min_weight = links["weight"].min()

# Size of a target word = weight of its strongest incoming link.
# NOTE(review): the previous code looked the weight up with
# links.iloc[<node position>, 3], i.e. it used a position in `nodes` as a row
# number in `links`. That picked an unrelated row and raised IndexError when
# the node position exceeded the number of links. Taking the maximum incoming
# weight is the assumed intent - confirm.
strongest_in = links.groupby("to")["weight"].max()

nodes = pd.DataFrame({"id": node_labels, "label": node_labels})
nodes["value"] = (
    nodes["label"]
    .map(strongest_in)        # NaN for words that only occur as "from"
    .fillna(min_weight)
    .astype(links["weight"].dtype)
)

6. Color

In [85]:
# Palette of candidate colours, read from the "good_color" column.
color = pd.read_csv("color.csv")["good_color"].values

# Spread the nodes evenly over the palette: len(nodes) equally spaced
# positions from the first to the last palette entry, truncated to integers.
palette_positions = np.linspace(start=0, stop=len(color) - 1, num=len(nodes))
color_idx = [int(position) for position in palette_positions]

nodes["color"] = color[color_idx]

7. Using networkx, plot a graph

In [None]:
import networkx as nx
plt.figure(figsize=(25,25))

# 1. Create the (undirected) networkx graph object.
G = nx.Graph()

# 2. One graph node per row of the node dataframe; size and colour are stored
#    as node attributes so they can be read back in the graph's own order.
for index, row in nodes.iterrows():
    G.add_node(row['label'], nodesize=row['value'], node_color=row["color"])

# 3. One weighted edge per row of the link dataframe.
G.add_weighted_edges_from(
    (row['from'], row['to'], row['weight']) for index, row in links.iterrows()
)

# 4. Layout and drawing parameters.
# The seed makes the spring layout reproducible across runs; 777 matches the
# random_state used for the LDA model.
pos = nx.spring_layout(G, k=1, iterations=50, seed=777)
sizes = [G.nodes[node]['nodesize']*250 for node in G]
# Read colours from the graph (not the dataframe) so they are guaranteed to be
# in the same order as the nodes being drawn.
colors = [G.nodes[node]['node_color'] for node in G]
nx.draw(G, pos=pos, node_size=sizes, node_color=colors)

nx.draw_networkx_labels(G, pos=pos, font_family="AppleGothic", font_size=25)

# Handle to the current axes; not used further in this cell.
ax = plt.gca()
plt.show()