In [178]:
import pandas as pd
from model.comment import Comment
from model.user import User
from datetime import datetime
from tqdm import tqdm
from copy import deepcopy

In [179]:
df = pd.read_csv('../data/comments.csv')

In [180]:
df.head()

Unnamed: 0.1,Unnamed: 0,by,id,kids,parent,text,time,type
0,0,['Jonnax'],[24563821],"[24564014, 24563951, 24565071, 24563883, 24564...",[24563698],['There&#x27;s a quote from the CEO saying tha...,['2020-09-23 08:58:07'],['comment']
1,1,['pkphilip'],[24563987],"[24564606, 24564531, 24564249]",[24563698],['It is interesting to note that Brendan Eich&...,['2020-09-23 09:20:14'],['comment']
2,2,['rattray'],[24565829],[24565871],[24563698],['This is a pretty unfair article.<p>The autho...,['2020-09-23 13:45:24'],['comment']
3,3,['bambax'],[24564488],"[24564530, 24565153, 24564755, 24565320]",[24563698],['&gt; <i>Mobile browsing numbers are bleak: F...,['2020-09-23 10:30:43'],['comment']
4,4,['shp0ngle'],[24563843],"[24564784, 24563894, 24563980, 24563875, 24564...",[24563698],['I&#x27;m two fences on Firefox doing &quot;t...,['2020-09-23 09:01:00'],['comment']


In [181]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12012 entries, 0 to 12011
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  12012 non-null  int64 
 1   by          12012 non-null  object
 2   id          12012 non-null  object
 3   kids        12012 non-null  object
 4   parent      12012 non-null  object
 5   text        12012 non-null  object
 6   time        12012 non-null  object
 7   type        12012 non-null  object
dtypes: int64(1), object(7)
memory usage: 750.9+ KB


In [182]:
len(df['by'].unique())

7140

In [183]:
len(df['id'].unique())

12001

In [184]:
def row_to_comment(row):

    kids = []
    if row['kids'] != '{}':
        kids = list(map(int, row['kids'][1:-1].split(', ')))[0],

    return Comment(
        by_id=row['by'][2:-2],
        id=int(row['id'][1:-1]),
        parent_id=int(row['parent'][1:-1]),
        kids_ids=kids,
        text=row['text'][2:-2],
        time=datetime.strptime(row['time'][2:-2], '%Y-%m-%d %H:%M:%S'),
        type=row['type'][2:-2],
        by=None,
        parent=None,
        kids=None
    )

In [185]:
def row_to_user(row):

    return User(
        name=row['by'][2:-2],
        comments_ids=[],
        comments=[]
    )

In [186]:
users = {}
comments = {}

for row_id in tqdm(range(len(df))):
    row = df.iloc[row_id]

    try:

        #add comment
        next_comment = row_to_comment(row)
        if next_comment.id in comments:
            # print(row)
            pass
        else:
            comments[next_comment.id] = next_comment

        #add user and comment to user
        next_user = row_to_user(row)
        if next_user.name not in users:
            users[next_user.name] = next_user
        users[next_user.name].comments_ids.append(next_comment.id)
        users[next_user.name].comments.append(next_comment)

    except Exception as e:
        print(e)
        print(row)

100%|██████████| 12012/12012 [00:02<00:00, 4743.79it/s]


In [187]:
len(users)

7140

In [188]:
len(comments)

12001

In [189]:
def create_empty_comment(id):
    return Comment(
        id=id,
        by_id=None,
        parent_id=None,
        kids_ids=None,
        text=None,
        time=None,
        type=None,
        by=None,
        parent=None,
        kids=None
    )

In [190]:
# fill by, parent, kids fields in comments
# if comment not found in comments create that object
current_comments_list = deepcopy(list(comments.values()))

for comment in tqdm(current_comments_list):

    comment.by = users[comment.by_id]

    if comment.parent_id not in comments:
        comments[comment.parent_id] = create_empty_comment(comment.parent_id)
    comment.parent = comments[comment.parent_id]

    comment.kids = []
    for subcomment_id in comment.kids_ids:
        if subcomment_id not in comments:
            comments[subcomment_id] = create_empty_comment(subcomment_id)
        comment.kids.append(comments[subcomment_id])

100%|██████████| 12001/12001 [00:00<00:00, 418391.48it/s]


In [191]:
len(comments)

18987

In [194]:
len(set(map(lambda x: x.id, list(comments.values())))) == len(comments)

True