In [1]:
import os
import sqlite3
import pandas as pd
import re
import datetime
import numpy as np
import time
from dateutil.relativedelta import relativedelta

In [2]:
# Project root: the parent of the current working directory. Every data path
# in this notebook is built by string concatenation onto this value.
directory = os.path.abspath(os.pardir)

# Tools

In [4]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from bs4 import BeautifulSoup
# English stop words, held in a set for membership tests in the helpers below.
stops = set(stopwords.words("english"))
# Character class matching any single character of string.punctuation.
regex = re.compile('[%s]' % re.escape(string.punctuation))
# Smoke test of the pipeline on a sample string: strip punctuation, tokenise,
# then drop stop words.
words = [
    word
    for word in word_tokenize(regex.sub('', 'text sdsd, and sds'))
    if word not in stops
]

In [60]:
def count_length(body):
    """Return the number of tokens in ``body`` that are not stop words.

    Runs of newlines are collapsed to one space and non-breaking spaces
    (``\\xa0``) are deleted. Punctuation matched by the module-level ``regex``
    is then removed, the text is tokenised with ``word_tokenize``, and tokens
    found in the module-level ``stops`` set are not counted. The stop-word
    comparison is done on the tokens exactly as they appear (no case folding).
    """
    flattened = re.sub('\n+', ' ', body).replace('\xa0', '')
    tokens = word_tokenize(regex.sub('', flattened))
    return sum(1 for token in tokens if token not in stops)

def image_count(body):
    """Count ``<figure>`` elements in ``body`` that do not wrap a video player.

    ``body`` is parsed as HTML with the ``lxml`` parser. A figure is excluded
    from the count when it contains a ``<div class="video-player">``.
    """
    figures = BeautifulSoup(body, 'lxml').find_all('figure')
    video_figures = sum(
        1 for figure in figures
        if figure.find('div', class_='video-player') is not None
    )
    return len(figures) - video_figures

def video_count(body):
    """Count embedded videos in the HTML string ``body``.

    The result is the number of ``<div class="video-player">`` elements plus
    the number of ``<iframe>`` elements found by the ``lxml`` parser.
    """
    soup = BeautifulSoup(body, 'lxml')
    players = soup.find_all('div', class_='video-player')
    iframes = soup.find_all('iframe')
    return len(players) + len(iframes)

# Run this on the combined comment table, before it is split into creator and
# backer frames: the function needs both kinds of rows next to each other.
def reply_time_last(cmt):
    """Time between each creator comment and the backer comment just below it.

    The frame is walked in row order and each row is compared with the row
    immediately after it, so ``cmt`` must already be sorted by ``pid`` and
    ``cmt_datetime`` in descending order (the row after a comment is then the
    comment posted just before it in the same project).

    Parameters
    ----------
    cmt : pandas.DataFrame
        Needs the columns ``pid``, ``cmt_id``, ``cmt_datetime`` (values that
        support subtraction) and ``cmt_collaborator`` (0 marks a backer
        comment; any other value is treated as a creator comment).

    Returns
    -------
    pandas.DataFrame
        Columns ``pid``, ``cmt_id``, ``cmt_datetime``, ``cmt_rpt_l``. One row
        per creator comment whose following row is a backer comment of the
        same ``pid``; ``cmt_rpt_l`` is the creator timestamp minus that backer
        timestamp. The last row of ``cmt`` never produces output. An empty
        frame with these columns is returned when nothing qualifies.
    """
    columns = ['pid', 'cmt_id', 'cmt_datetime', 'cmt_rpt_l']

    # Pull the columns out once; per-row ``iloc`` access is very slow and
    # ``DataFrame.append`` (used previously) no longer exists in pandas 2.x.
    pids = cmt['pid'].tolist()
    cmt_ids = cmt['cmt_id'].tolist()
    stamps = cmt['cmt_datetime'].tolist()
    is_backer = (cmt['cmt_collaborator'] == 0).tolist()

    records = []
    for i in range(len(pids) - 1):
        if is_backer[i]:
            continue
        if is_backer[i + 1] and pids[i + 1] == pids[i]:
            records.append({
                'pid': pids[i],
                'cmt_id': cmt_ids[i],
                'cmt_datetime': stamps[i],
                'cmt_rpt_l': stamps[i] - stamps[i + 1],
            })
    return pd.DataFrame(records, columns=columns)

# Variant of the reply-time measure: instead of the backer comment directly
# below the creator comment, use the last row of the whole run of backer
# comments that follows it.
def reply_time_first(cmt):
    """Time between each creator comment and the end of the backer run below it.

    For every creator comment, the rows after it are scanned for as long as
    they are backer comments (``cmt_collaborator == 0``) of the same ``pid``.
    With ``cmt`` sorted by ``pid`` and ``cmt_datetime`` in descending order,
    the last row of that run is the earliest backer comment of the run.

    Parameters
    ----------
    cmt : pandas.DataFrame
        Needs the columns ``pid``, ``cmt_id``, ``cmt_datetime`` (values that
        support subtraction) and ``cmt_collaborator``.

    Returns
    -------
    pandas.DataFrame
        Columns ``pid``, ``cmt_id``, ``cmt_datetime``, ``cmt_rpt_f``. One row
        per creator comment followed by at least one such backer row;
        ``cmt_rpt_f`` is the creator timestamp minus the timestamp of the last
        row in the run. Empty (with these columns) when nothing qualifies.
    """
    columns = ['pid', 'cmt_id', 'cmt_datetime', 'cmt_rpt_f']

    # Extract columns once instead of calling ``iloc`` per row, and collect
    # plain records instead of using ``DataFrame.append`` (removed in pandas 2).
    pids = cmt['pid'].tolist()
    cmt_ids = cmt['cmt_id'].tolist()
    stamps = cmt['cmt_datetime'].tolist()
    is_backer = (cmt['cmt_collaborator'] == 0).tolist()
    num = len(pids)

    records = []
    for i in range(num - 1):
        if is_backer[i]:
            continue
        # Advance j over the consecutive backer rows of the same project.
        j = i
        while j + 1 < num and is_backer[j + 1] and pids[j + 1] == pids[i]:
            j += 1
        if j != i:
            records.append({
                'pid': pids[i],
                'cmt_id': cmt_ids[i],
                'cmt_datetime': stamps[i],
                'cmt_rpt_f': stamps[i] - stamps[j],
            })
    return pd.DataFrame(records, columns=columns)

def wait_rep(cmt):
    """Count the backer comments that directly follow each creator comment.

    For every creator comment, the rows after it are scanned for as long as
    they are backer comments (``cmt_collaborator == 0``) of the same ``pid``;
    the length of that run is reported. The scan is purely positional, so the
    caller decides the ordering of ``cmt``.

    Parameters
    ----------
    cmt : pandas.DataFrame
        Needs the columns ``pid``, ``cmt_id``, ``cmt_datetime`` and
        ``cmt_collaborator``.

    Returns
    -------
    pandas.DataFrame
        Columns ``pid``, ``cmt_id``, ``cmt_datetime``, ``cmt_wt``. One row per
        creator comment followed by at least one such backer row; ``cmt_wt``
        is the number of rows in the run. Empty (with these columns) when
        nothing qualifies.
    """
    columns = ['pid', 'cmt_id', 'cmt_datetime', 'cmt_wt']

    # Extract columns once instead of calling ``iloc`` per row, and collect
    # plain records instead of using ``DataFrame.append`` (removed in pandas 2).
    pids = cmt['pid'].tolist()
    cmt_ids = cmt['cmt_id'].tolist()
    stamps = cmt['cmt_datetime'].tolist()
    is_backer = (cmt['cmt_collaborator'] == 0).tolist()
    num = len(pids)

    records = []
    for i in range(num - 1):
        if is_backer[i]:
            continue
        # Advance j over the consecutive backer rows of the same project.
        j = i
        while j + 1 < num and is_backer[j + 1] and pids[j + 1] == pids[i]:
            j += 1
        if j != i:
            records.append({
                'pid': pids[i],
                'cmt_id': cmt_ids[i],
                'cmt_datetime': stamps[i],
                'cmt_wt': j - i,
            })
    return pd.DataFrame(records, columns=columns)


# list backers' names by projects
def name_list(df):
    """Collect, per project, the distinct first names of its backers.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``pid``, ``cmt_collaborator`` and ``first_name``.
        Only rows with ``cmt_collaborator == 0`` contribute names.

    Returns
    -------
    pandas.DataFrame
        Columns ``pid`` and ``name_lst``, one row per distinct ``pid`` of
        ``df`` in order of first appearance. ``name_lst`` is a list of the
        distinct ``first_name`` values (in no particular order) and is empty
        for a project with no backer rows. Every row carries the index label
        0, which is what the previous row-by-row construction produced;
        ``check_rep`` relies on that when it looks a name list up by label 0.
    """
    pid_lst = df.pid.drop_duplicates().tolist()

    # Use the frame that was passed in. The earlier version filtered the
    # notebook global ``cmt_all_n`` here and ignored ``df`` for this step.
    backers = df[df.cmt_collaborator == 0]
    names_by_pid = {
        pid: list(set(names))
        for pid, names in backers.groupby('pid', sort=False).first_name
    }

    # Built in one go; ``DataFrame.append`` is gone in pandas 2.x.
    return pd.DataFrame(
        {'pid': pid_lst,
         'name_lst': [names_by_pid.get(pid, []) for pid in pid_lst]},
        index=[0] * len(pid_lst),
    )


def check_rep(cmt, name_table=None):
    """Flag comments that address a backer by ``@`` mention or by first name.

    Each comment is lower-cased and split into sentences with
    ``sent_tokenize``. A comment is flagged ``True`` when

    * any sentence starts with ``@``, or
    * any first name listed for the comment's own project occurs as a
      substring within the first 40 characters of any sentence.

    Names are looked up by each row's own ``pid``. The previous version
    switched name lists by counting rows per project, which is only correct
    when every project's rows are contiguous in ``cmt``.

    Parameters
    ----------
    cmt : pandas.DataFrame
        Needs the columns ``pid`` and ``cmt_content`` (strings).
    name_table : pandas.DataFrame, optional
        Frame with the columns ``pid`` and ``name_lst`` (a list of lower-case
        names per project), as returned by ``name_list``. Defaults to the
        notebook-level ``pid_name``. A ``pid`` missing from the table is
        treated as having no names.

    Returns
    -------
    pandas.Series
        Booleans named ``cmt_content`` and indexed like ``cmt``, so the result
        can be assigned straight back onto ``cmt`` as a new column.
    """
    if name_table is None:
        name_table = pid_name
    names_by_pid = dict(zip(name_table.pid, name_table.name_lst))

    def _addresses_backer(content, pid):
        # Names in the table are lower-case, so the text must be lowered too.
        sentences = sent_tokenize(content.lower())
        if any(sentence[:1] == '@' for sentence in sentences):
            return True
        names = names_by_pid.get(pid, [])
        # Only the opening 40 characters of each sentence are searched.
        return any(name in sentence[:40]
                   for sentence in sentences for name in names)

    flags = [_addresses_backer(content, pid)
             for content, pid in zip(cmt.cmt_content, cmt.pid)]
    return pd.Series(flags, index=cmt.index, name='cmt_content', dtype=object)

In [27]:
def save_db(db_name, df_name, *path):
    """Write a DataFrame to its own SQLite file, replacing any existing table.

    The file is ``<directory><sub-path><db_name>.db`` and the table inside it
    is also called ``db_name``. The connection is committed and closed before
    returning; previously it was left open.

    Parameters
    ----------
    db_name : str
        Used both as the file stem and as the table name.
    df_name : pandas.DataFrame
        Data to store; the index is not written.
    *path : str
        When exactly one value is given it is used as the sub-path (it should
        start and end with ``/``). With any other number of values the default
        ``'/dataPrep/source/'`` is used.
    """
    sub_path = path[0] if len(path) == 1 else '/dataPrep/source/'
    conn_save = sqlite3.connect(directory + sub_path + db_name + '.db')
    try:
        df_name.to_sql(name=db_name, con=conn_save, if_exists='replace', index=False)
        conn_save.commit()
    finally:
        conn_save.close()

# Data

In [5]:
# Load the four source tables. Each lives in its own SQLite file under
# <directory>/dataPrep/source/, in a table named after the file stem. Every
# connection is closed as soon as its table has been read into memory.
conn_crt_exp = sqlite3.connect(directory + '/dataPrep/source/constructs/' + 'crt_exp.db')
crt_exp = pd.read_sql_query("SELECT * FROM crt_exp", conn_crt_exp)
conn_crt_exp.close()

conn_dates = sqlite3.connect(directory + '/dataPrep/source/overall/' + 'dates.db')
dates = pd.read_sql_query("SELECT * FROM dates", conn_dates)
conn_dates.close()

conn_cmt_all = sqlite3.connect(directory + '/dataPrep/source/ds_ind_26229/' + 'cmt_all.db')
cmt_all = pd.read_sql_query("SELECT * FROM cmt_all", conn_cmt_all)
conn_cmt_all.close()

conn_backer_names = sqlite3.connect(directory + '/dataPrep/source/body_name/' + 'backer_names.db')
backer_names = pd.read_sql_query('SELECT * FROM backer_names', conn_backer_names)
conn_backer_names.close()

In [15]:
# Distinct project ids present in crt_exp; used below to filter the other tables.
ta_pid = crt_exp['pid'].drop_duplicates().tolist()

In [50]:
# Drop exact duplicate rows from the backer-name table. Note that the result is
# bound to `backer_name` (singular); `backer_names` itself is left untouched.
backer_name = backer_names.drop_duplicates()

In [17]:
# Keep only rows belonging to the target projects, then keep a single row per
# comment id (the last occurrence wins).
dates = dates.loc[dates['pid'].isin(ta_pid)].copy()
cmt_all = (
    cmt_all.loc[cmt_all['pid'].isin(ta_pid)]
    .copy()
    .drop_duplicates('cmt_id', keep='last')
)

In [21]:
# Distinct commenter profile ids in the filtered comments (a one-column frame).
backer_id = cmt_all[['cmt_profile_id']].drop_duplicates()

In [99]:
# Parse the window-boundary columns of `dates` from "%Y-%m-%d %H:%M:%S" strings
# into datetimes. pd.to_datetime leaves already-parsed columns unchanged, so
# this cell can be re-run safely (the earlier strptime version raised on a
# second run).
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
for date_col in ['proj_start_date', 'first_date',
                 'after_d_3', 'after_d_6', 'after_d_9', 'after_d_1']:
    dates[date_col] = pd.to_datetime(dates[date_col], format=DATE_FORMAT)

In [54]:
# users not found
# Number of distinct commenter profile ids that have no matching `cid` in
# backer_names.
len(backer_id[~backer_id.cmt_profile_id.isin(backer_names.cid)])

18

# Creator's comments

In [61]:
# Parse comment timestamps. pd.to_datetime is a no-op on an already-parsed
# column, so re-running this cell does not fail.
cmt_all['cmt_datetime'] = pd.to_datetime(cmt_all['cmt_datetime'], format="%Y-%m-%d %H:%M:%S")

# Split by author role: cmt_collaborator == 1 are creator comments, == 0 are
# backer comments.
cmt_all_crt = cmt_all[cmt_all.cmt_collaborator==1].copy()
cmt_all_bac = cmt_all[cmt_all.cmt_collaborator==0].copy()


# count comment length
cmt_all_crt['cmt_len'] = cmt_all_crt.cmt_content.apply(count_length)


# check whether using email
# True when the text contains word characters on both sides of an "@".
# The pattern is a raw string so that "\w" is not treated as a string escape.
cmt_all_crt['cmt_email'] = cmt_all_crt.cmt_content.apply(
    lambda content: re.search(r"\w+@\w+", content) is not None)
# save_db('cmt_all_crt', cmt_all_crt)

In [62]:
# reply_time
# The reply-time helpers compare each row with the rows that follow it, so the
# frame has to be ordered by project and then newest comment first before they
# are called.
cmt_all = cmt_all.sort_values(['pid','cmt_datetime'], ascending=False).copy() # Important!!!
rpt_l = reply_time_last(cmt_all)

In [63]:
# Same input as rpt_l (cmt_all must still be in the descending order set
# above), measured against the end of the backer run instead of its first row.
rpt_f = reply_time_first(cmt_all)

In [64]:
# Preview the first row of the reply-time table.
rpt_l.head(1)

Unnamed: 0,cmt_datetime,cmt_id,cmt_rpt_l,pid
0,2015-09-17 19:11:47,comment-11112693,01:14:17,99825633


In [26]:
# backers were waiting
# waiting_bac = wait_rep(cmt_all)

In [67]:
# check whether using names

# find first name excepting name is just a letter or stop words
# Tokens ignored when picking a first name: English stop words plus honorifics.
stops_titles = set(stopwords.words("english")) | set(['miss', 'mr', 'ms', 'mrs'])

# Working copy whose `name` column is renamed to `first_name` and lower-cased.
# str() is applied to every value, so missing values become strings here too.
first_name = backer_name.copy().rename(index=str, columns={'name': 'first_name'})
first_name.first_name = first_name.first_name.apply(lambda name: str(name).lower())

# Join back on `cid` so each row carries both the original `name` and the
# derived `first_name`.
backer_name = backer_name.merge(first_name, on=['cid'])
# Strip punctuation. NOTE(review): `name == None` looks unreachable because the
# column was passed through str() above - confirm.
backer_name.first_name = backer_name.first_name.apply(lambda name: name if name == None else regex.sub('', name))
# Tokenise, then drop stop words and honorifics.
backer_name.first_name = backer_name.first_name.apply(lambda name: word_tokenize(name))
backer_name.first_name = backer_name.first_name.apply(lambda name: [word for word in name if word not in stops_titles])
# Pick one token: None when nothing is left; the first token when it is longer
# than two characters; otherwise the longest token when that is longer than one
# character; otherwise all tokens joined with spaces.
backer_name.first_name = backer_name.first_name.apply(lambda lst: None if len(lst) == 0 else 
    (lst[0] if len(lst[0]) > 2 else (max(lst, key=len) if len(max(lst, key=len)) > 1 else ' '.join(lst))))
# Discard results of two characters or fewer.
backer_name.first_name = backer_name.first_name.apply(
    lambda name: name if name == None else (None if len(name) <= 2 else name))
# Turn None into the string 'None' so it can be filtered with a plain comparison.
backer_name.first_name = backer_name.first_name.apply(lambda name: str(name))

# Remove rows without a usable first name and rows whose name is 'non-exist',
# then rename the key so it matches the comment table.
backer_name = backer_name[backer_name.first_name!='None']
backer_name = backer_name[backer_name.name != 'non-exist'].copy()
backer_name = backer_name.rename(index=str, columns={'cid':'cmt_profile_id'})

# prepared for two definitions: name_list, check_rep.
# Inner merge: comments whose author has no usable first name are dropped.
cmt_all_n = cmt_all.merge(backer_name, on=['cmt_profile_id']).copy()
pid_name = name_list(cmt_all_n) # return the list of names by pid

In [84]:
# Preview the first row of the per-project name table.
pid_name.head(1)

Unnamed: 0,name_lst,pid
0,"[janahan, stephen, stuart, allan, rob, nick, a...",998633411


In [81]:
# Flag each creator comment with the result of check_rep; the returned Series
# is assigned onto a copy of the creator-comment frame as column `cmt_rbn`.
cmt_rbn = cmt_all_crt.copy()
cmt_rbn['cmt_rbn'] = check_rep(cmt_rbn).copy()

In [85]:
# Attach the per-comment measures to the full comment table, one outer merge on
# cmt_id per source frame.
cmt = cmt_all
for extra in (
    cmt_all_crt[['cmt_id', 'cmt_len', 'cmt_email']],
    rpt_l[['cmt_id', 'cmt_rpt_l']],
    rpt_f[['cmt_id', 'cmt_rpt_f']],
    # waiting_bac[['cmt_id', 'cmt_wt']],
    cmt_rbn[['cmt_id', 'cmt_rbn']],
):
    cmt = cmt.merge(extra, on=['cmt_id'], how='outer').copy()

# Comments without a measure get 0 / False; the flags are then stored as 0/1.
cmt['cmt_len'] = cmt['cmt_len'].fillna(0)
for flag_col in ('cmt_email', 'cmt_rbn'):
    cmt[flag_col] = cmt[flag_col].fillna(False)
for flag_col in ('cmt_email', 'cmt_rbn'):
    cmt[flag_col] = cmt[flag_col].apply(lambda bl: int(bl))

# Express the reply times as a number of days.
for delay_col in ('cmt_rpt_l', 'cmt_rpt_f'):
    cmt[delay_col] = cmt[delay_col].apply(lambda td: (td / np.timedelta64(1, 'D')))
#time: time.total_seconds())#sec: datetime.timedelta(seconds=sec))

In [92]:
# Preview the first row of the merged comment table.
cmt.head(1)

Unnamed: 0,cmt_collaborator,cmt_content,cmt_creator,cmt_datetime,cmt_id,cmt_profile_id,cmt_superbaker,pid,proj_creator_id,cmt_len,cmt_email,cmt_rpt_l,cmt_rpt_f,cmt_rbn
0,0.0,And the bairn revealed that a parcel of plectr...,0,2016-01-27 02:01:18,comment-12320544,1391895283,0.0,998633411,1944039953,0.0,0,,,0


In [86]:
# Split the merged comment table by author role (1 = creator, 0 = backer).
cmt_crt = cmt[cmt.cmt_collaborator == 1].copy()
cmt_bac = cmt[cmt.cmt_collaborator == 0].copy()

# cmt_crt_count, cmt_bac_count
# Per-project comment counts (all / creator / backer), outer-merged onto dates.
cmt_pid = dates.merge(cmt.groupby('pid').cmt_id.count().reset_index().rename(
    index=str, columns={'cmt_id': 'cmt_count'}), on=['pid'], how='outer')
cmt_pid = cmt_pid.merge(cmt_crt.groupby('pid').cmt_id.count().reset_index().rename(
    index=str, columns={'cmt_id': 'cmt_crt_count'}), on=['pid'], how='outer')
cmt_pid = cmt_pid.merge(cmt_bac.groupby('pid').cmt_id.count().reset_index().rename(
    index=str, columns={'cmt_id': 'cmt_bac_count'}), on=['pid'], how='outer')

# Per-project sums over the creator comments. numeric_only=True is spelled out
# because newer pandas no longer drops non-numeric columns silently: without it
# the datetime and text columns make sum() fail or concatenate strings. The
# numeric columns, and their order, stay as before, which matters because a
# later cell slices cmt_stat by column position.
cmt_stat = cmt_crt.groupby('pid').sum(numeric_only=True).reset_index().copy() # sum all
cmt_stat = cmt_pid.merge(cmt_stat, on=['pid'], how='outer').copy()

# cmt_crt_rate: share of a project's comments written by the creator
cmt_stat['cmt_crt_rate'] = cmt_stat.cmt_crt_count / cmt_stat.cmt_count
# cmt_avg_len: summed length divided by the number of creator comments
cmt_stat['cmt_avg_len'] =  cmt_stat.cmt_len / cmt_stat.cmt_crt_count
# cmt_avg_rpt_f
cmt_stat['cmt_avg_rpt_f'] =  cmt_stat.cmt_rpt_f / cmt_stat.cmt_crt_count
# cmt_avg_rpt_l
cmt_stat['cmt_avg_rpt_l'] =  cmt_stat.cmt_rpt_l / cmt_stat.cmt_crt_count
# cmt_avg_wt
# cmt_stat['cmt_avg_wt'] =  cmt_stat.cmt_wt / cmt_stat.cmt_crt_count
# cmt_email_rate
cmt_stat['cmt_email_rate'] =  cmt_stat.cmt_email / cmt_stat.cmt_crt_count
# cmt_name_rate
cmt_stat['cmt_rbn_rate'] =  cmt_stat.cmt_rbn / cmt_stat.cmt_crt_count

In [93]:
# Preview the first row of the per-project statistics.
cmt_stat.head(1)

Unnamed: 0,pid,proj_start_date,first_date,after_d_3,after_d_6,after_d_9,after_d_1,interval_0,interval_3,interval_6,...,cmt_email,cmt_rpt_l,cmt_rpt_f,cmt_rbn,cmt_crt_rate,cmt_avg_len,cmt_avg_rpt_f,cmt_avg_rpt_l,cmt_email_rate,cmt_rbn_rate
0,1369691611,2015-06-10,2015-10-01 00:00:00,2016-01-01,2016-04-01,2016-07-01,2016-10-01,113.0,92.0,183.0,...,0.0,0.097373,454.124213,0.0,0.076923,26.0,454.124213,0.097373,0.0,0.0


In [94]:
def inter_cmt(name, str_lb, str_ub):
    """Per-project comment statistics restricted to one date window.

    Reads the notebook-level frames ``cmt`` (merged comment table), ``dates``
    (one row per project with the window-boundary columns) and the list
    ``ta_pid``. A comment is in the window when
    ``dates[str_lb] <= cmt_datetime < dates[str_ub]`` for its project.

    Parameters
    ----------
    name : str
        Suffix appended to every generated column name.
    str_lb, str_ub : str
        Names of the ``dates`` columns holding the inclusive lower and the
        exclusive upper bound of the window.

    Returns
    -------
    pandas.DataFrame
        The ``dates`` rows for ``ta_pid`` followed, in this order, by
        ``cmt_count_``, ``cmt_crt_count_``, ``cmt_bac_count_``, the five sums
        (``cmt_len_``, ``cmt_rpt_l_``, ``cmt_rpt_f_``, ``cmt_email_``,
        ``cmt_rbn_``) and the six ratios (``cmt_crt_rate_``, ``cmt_avg_len_``,
        ``cmt_avg_rpt_f_``, ``cmt_avg_rpt_l_``, ``cmt_email_rate_``,
        ``cmt_rbn_rate_``), each suffixed with ``name``. Projects with no
        comment in the window have NaN in the generated columns. The index
        labels are strings, as before.
    """
    sum_cols = ['cmt_len', 'cmt_rpt_l', 'cmt_rpt_f', 'cmt_email', 'cmt_rbn']  # , 'cmt_wt'

    def _count_by_pid(rows, column):
        # Number of comments per project in ``rows``, labelled ``column``.
        return rows.groupby('pid').cmt_datetime.count().reset_index().rename(
            columns={'cmt_datetime': column})

    temp = cmt.merge(dates, on=['pid']).copy()
    temp = temp[(temp.cmt_datetime >= temp[str_lb]) & (temp.cmt_datetime < temp[str_ub])]

    result = dates[dates.pid.isin(ta_pid)].copy()
    result = result.merge(_count_by_pid(temp, 'cmt_count_' + name),
                          on=['pid'], how='outer')
    result = result.merge(_count_by_pid(temp[temp.cmt_collaborator == 1], 'cmt_crt_count_' + name),
                          on=['pid'], how='outer')
    result = result.merge(_count_by_pid(temp[temp.cmt_collaborator == 0], 'cmt_bac_count_' + name),
                          on=['pid'], how='outer')

    # including cmt_crt_rate, cmt_avg_len, cmt_avg_rpt_f, cmt_avg_rpt_l, cmt_avg_wt, cmt_email_rate, cmt_name_rate
    # Select the measure columns before summing: summing the whole frame trips
    # over its datetime and text columns on newer pandas versions.
    sums = temp.groupby('pid')[sum_cols].sum().reset_index()
    result = result.merge(sums, on=['pid'], how='outer').rename(
        index=str, columns={col: col + '_' + name for col in sum_cols})

    crt_count = result['cmt_crt_count_' + name]
    # cmt_crt_rate
    result['cmt_crt_rate_' + name] = crt_count / result['cmt_count_' + name]
    # Every other ratio divides a window sum by the creator-comment count.
    # The insertion order below fixes the output column order; keep it.
    for ratio, total in [('cmt_avg_len_', 'cmt_len_'),
                         ('cmt_avg_rpt_f_', 'cmt_rpt_f_'),
                         ('cmt_avg_rpt_l_', 'cmt_rpt_l_'),
                         # ('cmt_avg_wt_', 'cmt_wt_'),
                         ('cmt_email_rate_', 'cmt_email_'),
                         ('cmt_rbn_rate_', 'cmt_rbn_')]:
        result[ratio + name] = result[total + name] / crt_count

    return result

In [100]:
# Window statistics. Each call covers comments dated in [lower, upper).

# Before the reference date: proj_start_date up to first_date.
cmt_0 = inter_cmt('0', 'proj_start_date', 'first_date')

# Cumulative windows, all starting at first_date.
cmt_3 = inter_cmt('3', 'first_date', 'after_d_3')
cmt_6 = inter_cmt('6', 'first_date', 'after_d_6')
cmt_9 = inter_cmt('9', 'first_date', 'after_d_9')
cmt_1 = inter_cmt('1', 'first_date', 'after_d_1')

# Consecutive, non-overlapping windows. cmt_0103 uses the same bounds as cmt_3
# and differs only in its column suffix.
cmt_0103 = inter_cmt('0103', 'first_date', 'after_d_3')
cmt_0406 = inter_cmt('0406', 'after_d_3', 'after_d_6')
cmt_0709 = inter_cmt('0709', 'after_d_6', 'after_d_9')
cmt_1012 = inter_cmt('1012', 'after_d_9', 'after_d_1')

# Two halves. cmt_0106 uses the same bounds as cmt_6.
cmt_0106 = inter_cmt('0106', 'first_date', 'after_d_6')
cmt_0712 = inter_cmt('0712', 'after_d_6', 'after_d_1')

In [101]:
# Assemble one wide table: `pid` plus every column from position 19 onwards of
# the overall statistics and of each window frame, outer-merged on pid in the
# order listed.
interact_cmt = cmt_stat[['pid'] + cmt_stat.columns.tolist()[19:]]
for window_frame in (
    cmt_0,
    cmt_3, cmt_6, cmt_9, cmt_1,
    cmt_0103, cmt_0406, cmt_0709, cmt_1012,
    cmt_0106, cmt_0712,
):
    wanted = ['pid'] + window_frame.columns.tolist()[19:]
    interact_cmt = interact_cmt.merge(window_frame[wanted], on=['pid'], how='outer')

In [102]:
# Map older column names onto the cmt_* naming and turn the index labels into
# strings. Only exact column-name matches are renamed.
column_renames = {
    'rpt_l': 'cmt_rpt_l',
    'rpt_f': 'cmt_rpt_f',
    'num_bac': 'cmt_wt',
    'rep_by_name': 'cmt_rbn',
    'cmt_name_rate': 'cmt_rbn_rate',
}
interact_cmt = interact_cmt.rename(index=str, columns=column_renames)

In [103]:
# Persist the final table as table `interact_cmt` in
# <directory>/dataPrep/source/interact_cmt.db (save_db's default location).
save_db('interact_cmt', interact_cmt)