In [37]:
import pandas as pd
from model.comment import Comment
from model.user import User
from datetime import datetime
from tqdm import tqdm
from copy import deepcopy
import networkx as nx
from karateclub import SCD
import json

In [2]:
df = pd.read_csv('../data/comments.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,by,id,kids,parent,text,time,type,deleted,dead
0,0,['Jonnax'],[24563821],"[24564014, 24563951, 24565071, 24563883, 24564...",[24563698],['There&#x27;s a quote from the CEO saying tha...,['2020-09-23 08:58:07'],['comment'],{},{}
1,1,['pkphilip'],[24563987],"[24564606, 24564531, 24564249]",[24563698],['It is interesting to note that Brendan Eich&...,['2020-09-23 09:20:14'],['comment'],{},{}
2,2,['rattray'],[24565829],[24565871],[24563698],['This is a pretty unfair article.<p>The autho...,['2020-09-23 13:45:24'],['comment'],{},{}
3,3,['bambax'],[24564488],"[24564530, 24565153, 24564755, 24565320]",[24563698],['&gt; <i>Mobile browsing numbers are bleak: F...,['2020-09-23 10:30:43'],['comment'],{},{}
4,4,['shp0ngle'],[24563843],"[24564784, 24563894, 24563980, 24563875, 24564...",[24563698],['I&#x27;m two fences on Firefox doing &quot;t...,['2020-09-23 09:01:00'],['comment'],{},{}


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77728 entries, 0 to 77727
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  77728 non-null  int64 
 1   by          77728 non-null  object
 2   id          77728 non-null  object
 3   kids        77728 non-null  object
 4   parent      77728 non-null  object
 5   text        77728 non-null  object
 6   time        77728 non-null  object
 7   type        77728 non-null  object
 8   deleted     77728 non-null  object
 9   dead        77728 non-null  object
dtypes: int64(1), object(9)
memory usage: 5.9+ MB


In [5]:
len(df['by'].unique())

25912

In [6]:
len(df['id'].unique())

77651

In [7]:
def row_to_comment(row):

    kids = []
    if row['kids'] != '{}':
        kids = list(map(int, row['kids'][1:-1].split(', ')))[0],

    return Comment(
        by_id=row['by'][2:-2],
        id=int(row['id'][1:-1]),
        parent_id=int(row['parent'][1:-1]),
        kids_ids=kids,
        text=row['text'][2:-2],
        time=datetime.strptime(row['time'][2:-2], '%Y-%m-%d %H:%M:%S'),
        type=row['type'][2:-2],
        by=None,
        parent=None,
        kids=None
    )

In [8]:
def row_to_user(row):

    return User(
        name=row['by'][2:-2],
        comments_ids=[],
        comments=[]
    )

In [9]:
users = {}
comments = {}

for row_id in tqdm(range(len(df))):
    row = df.iloc[row_id]

    try:

        #add comment
        next_comment = row_to_comment(row)
        if next_comment.id in comments:
            # print(row)
            pass
        else:
            comments[next_comment.id] = next_comment

        #add user and comment to user
        next_user = row_to_user(row)
        if next_user.name not in users:
            users[next_user.name] = next_user
        users[next_user.name].comments_ids.append(next_comment.id)
        users[next_user.name].comments.append(next_comment)

    except Exception as e:
        print(e)
        print(row)

100%|██████████| 77728/77728 [00:13<00:00, 5724.77it/s]


In [10]:
len(users)

25912

In [11]:
len(comments)

77651

In [12]:
def create_empty_comment(id):
    return Comment(
        id=id,
        by_id=None,
        parent_id=None,
        kids_ids=None,
        text=None,
        time=None,
        type=None,
        by=None,
        parent=None,
        kids=None
    )

In [13]:
# fill by, parent, kids fields in comments
# if comment not found in comments create that object
current_comments_list = deepcopy(list(comments.values()))

for comment in tqdm(current_comments_list):

    comments[comment.id].by = users[comment.by_id]

    if comment.parent_id not in comments:
        # comments[comment.parent_id] = create_empty_comment(comment.parent_id)
        comments[comment.parent_id] = Comment(
            id=comment.parent_id,
            by_id=None,
            parent_id=None,
            kids_ids=[comment.id],
            text=None,
            time=None,
            type=None,
            by=None,
            parent=None,
            kids=[comment]
        )
    elif comment.id not in comments[comment.parent_id].kids_ids:
        comments[comment.parent_id].kids_ids.append(comment.id)
        comments[comment.parent_id].kids.append(comment)
    comments[comment.id].parent = comments[comment.parent_id]

    comments[comment.id].kids = []
    for subcomment_id in comment.kids_ids:
        if subcomment_id not in comments:
            # comments[subcomment_id] = create_empty_comment(subcomment_id)
            comments[subcomment_id] = Comment(
                id=subcomment_id,
                by_id=None,
                parent_id=comment.id,
                kids_ids=None,
                text=None,
                time=None,
                type=None,
                by=None,
                parent=comment,
                kids=None
            )
        comments[comment.id].kids.append(comments[subcomment_id])

100%|██████████| 77651/77651 [00:00<00:00, 170876.55it/s]


In [14]:
len(comments)

121804

In [15]:
len(set(map(lambda x: x.id, list(comments.values())))) == len(comments)

True

## Waga - zliczanie wystąpień

In [16]:
roots = list(filter(lambda x: x.parent==None, list(comments.values())))

In [17]:
len(roots)

4683

In [18]:
groups = []

for root in roots:

    group = []
    if root.by_id is not None:
        group.append(root.by_id)
    kids = root.kids

    while len(kids) > 0:
        kid = kids.pop()
        # group.append(kid)
        if kid.by_id is not None and kid.by_id not in group:
            group.append(kid.by_id)
        if kid.kids is not None:
            kids.append(*kid.kids)

    groups.append(group)

In [19]:
len(groups)

4683

In [20]:
min(list(map(len, groups)))

1

In [21]:
max(list(map(len, groups)))

620

In [22]:
G = nx.Graph()

In [23]:
for group in groups:
    group = list(set(group))
    while len(group) > 0:
        user_id = group.pop()
        for other_user_id in group:
            if G.has_edge(user_id, other_user_id):
                G[user_id][other_user_id]['w'] += 1
            else:
                G.add_edge(user_id, other_user_id, w=1)

In [24]:
len(G.nodes)

25882

In [25]:
len(G.edges)

1714178

In [26]:
nx.density(G)

0.0051180794937399025

In [27]:
components = list(nx.connected_components(G))
print(len(components))

15


In [28]:
list(map(len, components))

[25849, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 3, 2]

In [29]:
G = G.subgraph(components[0])

In [30]:
G = nx.relabel.convert_node_labels_to_integers(G, label_attribute='user_id')

In [31]:
detector = SCD(iterations=6)
detector.fit(G)
members = detector.get_memberships()

In [35]:
max(list(members.values()))

3640

In [38]:
with open('members.json', 'w') as outfile:
    json.dump(members, outfile)