In [None]:
import pandas as pd
import time
import seaborn as sns
import datetime as dt

In [None]:
# Colab-only step: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/Spiderum/Data

/content/drive/MyDrive/Spiderum/Data


# Clean useractionhistories df

In [None]:
def clean_useractionhistories(path):
    """Load the raw user-action log and return a cleaned copy.

    Cleaning steps:
      * rows with ``user_id == -1`` (non-registered users) are removed;
      * the ``_id``, ``__v`` and ``status`` columns are dropped;
      * ``created_at`` is truncated to its first 19 characters
        (``YYYY-MM-DD?HH:MM:SS``, any single separator character) and
        parsed into a datetime column.

    Parameters
    ----------
    path : str or path-like
        Location of the raw CSV file.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Missing ``created_at`` values become ``NaT``
        instead of raising.

    Raises
    ------
    KeyError
        If ``user_id``, ``created_at`` or one of the dropped columns is absent.
    ValueError
        If a non-missing ``created_at`` value cannot be parsed.
    """
    raw = pd.read_csv(path)

    # Chain the filter and the drop so that nothing is mutated on a slice of
    # ``raw`` (the in-place variant triggers SettingWithCopyWarning).
    registered = raw.loc[raw['user_id'] != -1].drop(columns=['_id', '__v', 'status'])

    # Keep only the "date + time" prefix, discarding fractional seconds and
    # any timezone suffix.
    stamp = registered['created_at'].str[:19]

    # Rebuild the string with a fixed separator: the character at position 10
    # is ignored, exactly as a purely positional slice-based parser would.
    normalised = stamp.str[:10] + ' ' + stamp.str[11:19]
    created_at = pd.to_datetime(normalised, format='%Y-%m-%d %H:%M:%S')

    return registered.assign(created_at=created_at)



In [None]:
# Run the cleaning step on the raw export; the returned frame is only displayed, not assigned.
# NOTE(review): later cells read './useraction_cleaned.csv', but the step that writes that
# file is not visible in this notebook — confirm where it is produced.
clean_useractionhistories('./useractionhistories.csv')

# Clean posts df

In [None]:
def clean_posts(path):
    """Load the posts export and keep only the columns used downstream.

    Parameters
    ----------
    path : str or path-like
        Location of the posts CSV file.

    Returns
    -------
    pandas.DataFrame
        A frame with exactly these columns, in this order:
        ``views_count``, ``comment_count``, ``title``, ``body``, ``slug``,
        ``created_at``.

    Raises
    ------
    KeyError
        If any of the required columns is absent from the file.
    """
    keep_cols = ['views_count', 'comment_count', 'title', 'body', 'slug', 'created_at']

    # Only the whitelisted columns are parsed, so the ``child_comments`` columns
    # (and everything else) are never loaded and need no explicit drop.
    # A callable ``usecols`` does not raise when a wanted column is absent;
    # the selection below raises KeyError for that case.
    posts = pd.read_csv(path, usecols=lambda name: name in keep_cols)

    # ``usecols`` keeps the file's column order, so select to enforce ours.
    return posts[keep_cols]

In [None]:
# Preview the trimmed posts table; the result is displayed, not stored.
clean_posts('./posts.csv')

# Create rating df

In [None]:
def create_rating_df(path_useraction):
    """Derive one explicit rating per (user, post) from the vote events.

    Only ``upvote`` / ``downvote`` / ``unvote`` actions on objects of type
    ``post`` are considered. For each (user_id, object_id, object,
    redirect_url) group the chronologically last action decides the rating:
    ``upvote`` -> 1, ``downvote`` -> -1, ``unvote`` -> 0. Groups whose final
    rating is 0 are removed.

    Parameters
    ----------
    path_useraction : str or path-like
        Location of the cleaned user-action CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``object_id``, ``object``, ``redirect_url``,
        ``created_at`` (latest timestamp in the group) and ``rating``.
    """
    actions = pd.read_csv(path_useraction)

    # Build a single boolean mask; indexing twice (``df[m1][m2]``) forces
    # pandas to reindex the second mask and emits a UserWarning.
    vote_actions = ['upvote', 'downvote', 'unvote']
    is_post_vote = actions['action'].isin(vote_actions) & (actions['object'] == 'post')

    votes = (
        actions.loc[is_post_vote]
        # Stable sort: the "last" action per group must be the latest in
        # time, not merely the last row in the file. Ties keep file order.
        .sort_values('created_at', kind='mergesort')
        .groupby(['user_id', 'object_id', 'object', 'redirect_url'])
        .agg({'action': 'last', 'created_at': 'max'})
        .reset_index()
    )

    # Only the final action matters, so a direct lookup replaces the replay loop.
    votes['rating'] = votes['action'].map({'upvote': 1, 'downvote': -1, 'unvote': 0})

    # A rating of 0 means the vote was withdrawn; such pairs carry no signal.
    return votes[votes['rating'] != 0].drop(columns=['action'])

In [None]:
# Preview the explicit ratings; the result is displayed, not stored.
create_rating_df('./useraction_cleaned.csv')

# Create view df

In [None]:
def create_view_df(path_useraction):
    """Return one row per (user, object, url) view with its latest timestamp.

    Reads the user-action CSV at ``path_useraction``, keeps the rows whose
    ``action`` is ``'view'`` and collapses repeated views of the same
    (user_id, object_id, object, redirect_url) into a single row holding the
    maximum ``created_at`` of the group.
    """
    events = pd.read_csv(path_useraction)
    is_view = events['action'] == 'view'
    key_cols = ['user_id', 'object_id', 'object', 'redirect_url']

    # Repeated views collapse to the most recent one.
    latest_seen = events[is_view].groupby(key_cols).agg(
        {'created_at': lambda stamps: max(list(stamps))}
    )
    return latest_seen.reset_index()

In [None]:
# Build the de-duplicated view events; view_df is reused below to derive the URL -> object id lookup.
view_df = create_view_df('./useraction_cleaned.csv')
view_df.head()

Unnamed: 0,user_id,object_id,object,redirect_url,created_at
0,8,36,post,https://spiderum.com/bai-dang/Mau-nam-nhan-ly-...,2019-08-09T05:26:19
1,8,45,post,https://spiderum.com/bai-dang/Teamwork-va-nhun...,2018-06-12T04:24:15
2,8,195,post,https://spiderum.com/bai-dang/Vat-chat-quyet-d...,2019-08-09T05:31:16
3,8,319,post,https://spiderum.com/bai-dang/Nhung-bi-an-xoay...,2017-05-09T10:34:37
4,8,378,post,https://spiderum.com/bai-dang/Long-tot-va-dat-...,2018-08-01T07:29:19


In [None]:
# Lookup table redirect_url -> object_id, built from the distinct pairs seen in view events.
url_id_pairs = view_df[['object_id', 'redirect_url']].drop_duplicates()
mapping_url_postid = url_id_pairs.set_index('redirect_url')['object_id'].to_dict()

# Create comment df
(to show that the user read and interacted with the comment section)

In [None]:
def create_comment_df(path_useraction, url_to_post_id=None):
    """Build implicit positive ratings from comment-related activity.

    A row is comment-related when its ``action`` is ``'comment'`` or its
    ``object`` is ``'comment'``. Comment URLs are reduced to the URL of the
    post (everything before ``'?comment'``), then translated to the post id
    through ``url_to_post_id``. Rows whose URL has no mapping are dropped,
    and each (user_id, object_id) pair is kept once, with its latest
    ``created_at``.

    Parameters
    ----------
    path_useraction : str or path-like
        Location of the cleaned user-action CSV file.
    url_to_post_id : dict, optional
        Mapping from post URL to post id. When omitted, the notebook-level
        ``mapping_url_postid`` is used, so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``object_id`` (post id), ``object``,
        ``redirect_url`` (post URL), ``created_at`` and ``rating``
        (always 1).
    """
    if url_to_post_id is None:
        # Backward-compatible fallback to the lookup built in an earlier cell.
        url_to_post_id = mapping_url_postid

    actions = pd.read_csv(path_useraction)

    # Keep comment-related events and collapse repeats of the same target.
    is_comment_event = (actions['action'] == 'comment') | (actions['object'] == 'comment')
    comment_df = (
        actions.loc[is_comment_event]
        .groupby(['user_id', 'object_id', 'object', 'redirect_url'])
        .agg({'created_at': 'max'})
        .reset_index()
    )

    # Turn a comment permalink into the URL of the post it belongs to.
    # ``str.partition`` is a literal match, so '?' is not treated as a regex.
    comment_df['redirect_url'] = comment_df['redirect_url'].map(
        lambda url: url.partition('?comment')[0]
    )

    # Translate the post URL into the post id; 0 marks "no mapping found".
    comment_df['object_id'] = comment_df['redirect_url'].map(
        lambda url: url_to_post_id.get(url, 0)
    )

    comment_df = (
        comment_df[comment_df['object_id'] != 0]
        # Several comment rows can now point at the same post: keep the most
        # recent interaction per (user, post) rather than an arbitrary one.
        .sort_values('created_at', kind='mergesort')
        .drop_duplicates(subset=['user_id', 'object_id'], keep='last')
        .sort_index()
        # Any comment activity counts as a positive signal.
        .assign(rating=1)
    )

    return comment_df

In [None]:
# Requires the cell that defines mapping_url_postid to have been run first.
create_comment_df('./useraction_cleaned.csv')