In [153]:
import numpy as np
import pandas as pd
import os
import sqlite3
import re
import datetime

In [2]:
directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Tools

In [3]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from bs4 import BeautifulSoup
stops = set(stopwords.words("english"))
regex = re.compile('[%s]' % re.escape(string.punctuation))
words = word_tokenize(regex.sub('', 'text sdsd, and sds')) 
words = [word for word in words if word not in stops]

In [5]:
def save_db(db_name, df_name, *path):
    if len(path) == 1:
        conn_save = sqlite3.connect(directory + path[0] + db_name + '.db')
    else:
#         conn_save = sqlite3.connect(directory + '/' + db_name + '.db')
        conn_save = sqlite3.connect(directory + '/dataPrep/source/overall/' + db_name + '.db')
    df_name.to_sql(name=db_name, con=conn_save, if_exists='replace', index=False)

In [9]:
def pid_count(df, *the_rest):
    def _cal(df):
        return len(df.drop_duplicates('pid'))
    print(*list(map(_cal, [df]+list(the_rest))), sep=', ')

In [4]:
def count_length(body):
    soup = BeautifulSoup(body, 'lxml')
    text = re.sub('\n+', ' ', soup.text.strip()).replace('\xa0', '')
    words = word_tokenize(regex.sub('', text)) 
    words = [word for word in words if word not in stops][14:]   
    return len(words)

def image_count(body):
    soup = BeautifulSoup(body, 'lxml')
    tags = soup.find_all('figure')
    image_count = len(tags)
    for tag in tags:
        if tag.find('div', class_='video-player') != None:
            image_count = image_count - 1
    return image_count

def video_count(body):
    soup = BeautifulSoup(body, 'lxml')
    video_count = len(soup.find_all('div', class_='video-player')) + len(soup.find_all('iframe'))
    return video_count

# Data

In [6]:
conn_cmt_rmO_lm_b03 = sqlite3.connect(directory + '/dataPrep/source/overall/' + 'cmt_rmO_lm_b03.db')
cmt_rmO_lm_b03 = pd.read_sql_query("SELECT * FROM cmt_rmO_lm_b03", conn_cmt_rmO_lm_b03)

conn_neg_rmO_lm_b03 = sqlite3.connect(directory + '/dataPrep/source/overall/' + 'neg_rmO_lm_b03.db')
neg_rmO_lm_b03 = pd.read_sql_query("SELECT * FROM neg_rmO_lm_b03", conn_neg_rmO_lm_b03)

In [41]:
conn_proj = sqlite3.connect(directory + '/dataPrep/source/ds_ind_26229/' + 'proj.db')
proj = pd.read_sql_query("SELECT * FROM proj", conn_proj)

conn_proj_body = sqlite3.connect(directory + '/dataPrep/source/body_name/' + 'proj_body.db')
proj_body = pd.read_sql_query('SELECT * FROM proj_body', conn_proj_body)

conn_cmt = sqlite3.connect(directory + '/dataPrep/source/ds_ind_26229/' + 'cmt.db')
cmt = pd.read_sql_query('SELECT * FROM cmt', conn_cmt)

conn_rew = sqlite3.connect(directory + '/dataPrep/source/ds_ind_26229/' + 'rew.db')
rew = pd.read_sql_query('SELECT * FROM rew', conn_rew)

In [55]:
proj = proj[proj.pid.isin(cmt_rmO_lm_b03.pid)].copy()
proj_body = proj_body[proj_body.pid.isin(cmt_rmO_lm_b03.pid)].copy()
cmt = cmt[cmt.pid.isin(cmt_rmO_lm_b03.pid)].copy()
rew = rew[rew.pid.isin(cmt_rmO_lm_b03.pid)].copy()

In [149]:
conn_crt_about = sqlite3.connect(directory + '/dataPrep/source/ds_crt_24404/' + 'crt_about.db')
conn_crt_created = sqlite3.connect(directory + '/dataPrep/source/ds_crt_24404/' + 'crt_created.db')
conn_crt_backed = sqlite3.connect(directory + '/dataPrep/source/ds_crt_24404/' + 'crt_backed.db')

crt_about = pd.read_sql_query("SELECT * FROM crt_about", conn_crt_about)
crt_created = pd.read_sql_query('SELECT * FROM crt_created', conn_crt_created)
crt_backed = pd.read_sql_query('SELECT * FROM crt_backed', conn_crt_backed)

In [203]:
conn_date_fund = sqlite3.connect(directory + '/dataPrep/source/overall/' + 'date_fund.db')
date_fund = pd.read_sql_query('SELECT * FROM date_funding', conn_date_fund)

# Project Factors

In [178]:
# percent_raised
proj['percent_raised'] = proj.proj_amount_pledged / proj.proj_goal

In [172]:
# duration
proj.proj_start_date = proj.proj_start_date.apply(lambda date: str(date))
proj.proj_start_date = proj.proj_start_date.apply(lambda str_date: re.sub('-.[\d]:.[\d]$', '',str_date))
proj.proj_start_date = proj.proj_start_date.apply(lambda str_date: re.sub('T', ' ',str_date))
proj.proj_start_date = proj.proj_start_date.apply(lambda str_date: 
                                                  datetime.datetime.strptime(str_date, "%Y-%m-%d %H:%M:%S"))
proj.proj_end_date = proj.proj_end_date.apply(lambda date: str(date))
proj.proj_end_date = proj.proj_end_date.apply(lambda str_date: re.sub('-.[\d]:.[\d]$', '',str_date))
proj.proj_end_date = proj.proj_end_date.apply(lambda str_date: re.sub('T', ' ',str_date))
proj.proj_end_date = proj.proj_end_date.apply(
    lambda str_date: datetime.datetime.strptime(str_date, "%Y-%m-%d %H:%M:%S"))
proj['duration'] = proj.proj_end_date - proj.proj_start_date

proj_factor = proj[['pid', 'proj_category', 'proj_subcategory', 'proj_location', 'proj_currency', 'proj_goal', 
      'percent_raised', 'duration', 'proj_backer_count']].copy()
proj_factor.duration = proj_factor.duration.apply(lambda td: (td / np.timedelta64(1, 'D')).astype(int))

In [176]:
# has_collaborator
cmt_year_target_crt[cmt_year_target_crt.cmt_creator!=1].groupby('pid').count().cmt_content.count()
only 30 (0.043%) creators have collaborators of their projects

proj_has_col = cmt[(cmt.cmt_collaborator == 1) & (cmt.cmt_creator!=1)].pid.drop_duplicates()
proj_factor['has_collaborator'] = proj_factor.pid.isin(proj_has_col)

In [179]:
# reward_quota_rate, reward_ship_limit_rate
rew = rew.fillna('None').copy()
rew['rew_limited'] = rew.rew_backer_limit != 'None'
rew['ship_limited'] = (rew.rew_ships_to != 'None') & (rew.rew_ships_to != 'Anywhere in the world')

rew_count = rew.groupby('pid').count().rew_id.reset_index().rename(index=str, columns={'rew_id':'rew_count'})
rew_rew_l_count = rew[rew.rew_limited==True].groupby('pid').count().rew_id.reset_index().rename(
    index=str, columns={'rew_id':'rew_rew_l_count'})
rew_ship_l_count = rew[rew.ship_limited==True].groupby('pid').count().rew_id.reset_index().rename(
    index=str, columns={'rew_id':'rew_ship_l_count'})

rew_count = rew_count.merge(rew_rew_l_count, on=['pid'], how='outer')
rew_count = rew_count.merge(rew_ship_l_count, on=['pid'], how='outer')
rew_count = rew_count.fillna(0).copy()
rew_count['reward_quota_rate'] = rew_count.rew_rew_l_count / rew_count.rew_count
rew_count['reward_ship_limit_rate'] = rew_count.rew_ship_l_count / rew_count.rew_count
rew_count = rew_count[['pid', 'reward_quota_rate', 'reward_ship_limit_rate']]

proj_factor = proj_factor.merge(rew_count, on=['pid']).copy()

In [180]:
# body_length
proj_body['body_length'] = proj_body.apply(lambda row: count_length(row.proj_body), axis=1)

# body_image_count
proj_body['body_image_count'] = proj_body.apply(lambda row: image_count(row.proj_body), axis=1)

# body_video_count
proj_body['body_video_count'] = proj_body.apply(lambda row: video_count(row.proj_body), axis=1)


proj_factor = proj_factor.merge(
    proj_body[['pid', 'body_length', 'body_image_count', 'body_video_count']], on=['pid']).copy()

## Early Bird

In [334]:
rew_text = rew[['pid', 'rew_id', 'rew_description']].copy()
rew_text.rew_description = rew_text.rew_description.apply(lambda up_str: up_str.lower())
def check_keywords(test_df, keywords):
    df = test_df.copy()
    df = df.apply(lambda row: any(x in row.rew_description for x in keywords), axis=1)
    return df
early_bird = ['early bird']
rew_text['early_bird'] = check_keywords(rew_text, early_bird)

early_bird = proj_factor[['pid']].merge(rew_text.groupby('pid').sum().early_bird.reset_index().rename(index=str, 
                                            columns={'early_bird':'early_bird_count'}), on=['pid'])
early_bird = early_bird.merge(rew_text.groupby('pid').count().early_bird.reset_index().rename(index=str, 
                                            columns={'early_bird':'rew_count'}), on=['pid'])
early_bird['eb_rate'] = early_bird.early_bird_count / early_bird.rew_count
# save_db('early_bird', early_bird)

In [337]:
proj_factor = proj_factor.merge(early_bird, on='pid')

In [338]:
proj_factor = proj_factor[['pid', 'proj_category', 'proj_subcategory', 'proj_location',
 'proj_currency', 'proj_goal', 'percent_raised', 'duration', 'proj_backer_count',
 'has_collaborator', 'early_bird_count', 'rew_count', 'eb_rate', 'reward_quota_rate',
 'reward_ship_limit_rate', 'body_length', 'body_image_count', 'body_video_count']]

In [339]:
save_db('proj_factor', proj_factor)

# Creators' Experience

In [184]:
len(proj)

2137

In [328]:
crt = crt_about[crt_about.cid.isin(proj.proj_creator_id)].copy()
created = crt_created[crt_created.cid.isin(crt_about.cid)].drop_duplicates()
backed = crt_backed[crt_backed.cid.isin(crt_about.cid)].drop_duplicates()

In [255]:
len(crt)

1961

## Created

In [188]:
# next_suc

## find the created order of project
created_no = pd.DataFrame({'pid': [], 'cid': [], 'no': [], 'state': [], 'next_proj': [], 'next_suc': []})
for i in created.drop_duplicates('cid').cid:
    cid = i
    total = created[created.cid==i].pid.drop_duplicates().count()
    next_suc = False
    for j in created[created.cid==i].pid.drop_duplicates():
        pid = j
        no = total
        total = no - 1
        state = created[created.pid==j].project_state.values[0]
        next_proj = True if no != created[created.cid==i].pid.drop_duplicates().count() else False
        temp = pd.DataFrame({'pid': [pid], 'cid': [cid], 'no': [no], 'state': [state], 
                             'next_proj': [next_proj], 'next_suc': [next_suc]})
        if state == 'successful':
            next_suc = True
        else:
            next_suc = False
        created_no = created_no.append(temp)

In [195]:
## created_state
created_state = pd.DataFrame({'pid': [], 'cid': [], 'no': [], 'state': [], 'crt_suc': [], 
                              'crt_fail': [], 'crt_cancel': []})
for i in created_no.drop_duplicates('cid').cid:
    cid = i
    crt_suc = 0
    crt_fail = 0
    crt_cancel = 0
    for j in created_no[created_no.cid==i].sort_values('no').pid:
        pid = j
        no = created_no[created_no.pid==j].no.values[0]
        state = created_no[created_no.pid==j].state.values[0]
        temp = pd.DataFrame({'pid': [pid], 'cid': [cid], 'no': [no], 'state': [state], 'crt_suc': [crt_suc], 
                             'crt_fail': [crt_fail], 'crt_cancel': [crt_cancel]})
        
        if state == 'successful':
            crt_suc = crt_suc + 1
        elif state == 'failed':
            crt_fail = crt_fail + 1
        elif state == 'canceled':
            crt_cancel = crt_cancel + 1   
        
        created_state = created_state.append(temp)

In [197]:
len(created_state)

42696

In [198]:
len(created_no)

42696

In [199]:
created_state.head()

Unnamed: 0,cid,crt_cancel,crt_fail,crt_suc,no,pid,state
0,brexitthemovie,0.0,0.0,0.0,1.0,1264072338,successful
0,brexitthemovie,0.0,0.0,1.0,2.0,1531926102,successful
0,1209541329,0.0,0.0,0.0,1.0,911532921,successful
0,1579383219,0.0,0.0,0.0,1.0,2063094794,successful
0,plxdevices,0.0,0.0,0.0,1.0,1595337096,successful


## Backed

Since we do have the funding date data of successfully funded projects, we only can count how many successful projects backed by the creator.


In [329]:
# backed
backed = backed.merge(date_fund[['pid', 'proj_end_date']], on=['pid']).copy()
backed = backed.rename(index=str, columns={'pid': 'backed_pid', 'cid': 'proj_creator_id', 
                                           'proj_end_date': 'backed_proj_end_date'}).copy()
proj_crt_backed = proj.merge(backed[['proj_creator_id', 'backed_pid', 'backed_proj_end_date', 
                                     'project_state']], on=['proj_creator_id'])

proj_crt_backed = proj_crt_backed[proj_crt_backed.backed_proj_end_date!='hidden project'].copy()
proj_crt_backed = proj_crt_backed[proj_crt_backed.backed_proj_end_date!='hidden'].copy()

# proj_crt_backed.proj_end_date = proj_crt_backed.proj_end_date.apply(
#     lambda str_date: re.sub('-.[\d]:.[\d]$', '',str_date))
# proj_crt_backed.proj_end_date = proj_crt_backed.proj_end_date.apply(
#     lambda str_date: datetime.datetime.strptime(str_date, "%Y-%m-%dT%H:%M:%S"))

proj_crt_backed.backed_proj_end_date = proj_crt_backed.backed_proj_end_date.apply(
    lambda str_date: re.sub('-.[\d]:.[\d]$', '',str_date))
proj_crt_backed.backed_proj_end_date = proj_crt_backed.backed_proj_end_date.apply(
    lambda str_date: datetime.datetime.strptime(str_date, "%Y-%m-%dT%H:%M:%S"))


# backed_suc_count

proj_crt_backed = proj_crt_backed[proj_crt_backed.backed_proj_end_date < proj_crt_backed.proj_end_date].copy()
proj_crt_backed_count = proj_crt_backed.groupby('pid').count()[['backed_pid']].copy().reset_index()
proj_crt_backed_count = proj_crt_backed_count.rename(index=str, columns={'backed_pid': 'backed_count'}).copy()


## Experience 

In [331]:
crt_exp = proj[['pid','proj_creator_id', 'proj_start_date']
              ].rename(index=str, columns={'proj_creator_id':'cid'}).copy()
crt_exp = crt_exp.merge(crt[['cid', 'joined_date']], on=['cid']).copy()
## but created_count and backed_count are not correspond the number in that times

crt_exp.joined_date = crt_exp.joined_date.apply(lambda str_date: str(str_date))
crt_exp = crt_exp.loc[crt_exp.joined_date != 'None'] # 6 creators has been deleted.

crt_exp.joined_date = crt_exp.joined_date.apply(lambda str_date: re.sub('T.[\d]:.[\d]:.[\d]-.[\d]:.[\d]$', '',str_date))
crt_exp.joined_date = crt_exp.joined_date.apply(
    lambda str_date: datetime.datetime.strptime(str_date, "%Y-%m-%d"))


# firstTime

crt_exp = crt_exp.merge(created_state[['pid', 'no', 'crt_suc']], on=['pid']).copy()
crt_exp = crt_exp.merge(created_no[['pid', 'next_proj', 'next_suc']], on=['pid']).copy()

crt_exp = crt_exp.merge(proj_crt_backed_count, on=['pid'], how='outer').fillna(0).copy()
crt_exp['firstTime'] =  crt_exp.no==1

# created_count

# created_suc_rate

crt_exp = crt_exp[['pid', 'cid', 'firstTime', 'no', 'crt_suc', 'backed_count', 'next_suc']]
crt_exp = crt_exp.rename(index=str, columns={'no': 'created_count', 'crt_suc': 'created_suc_rate'}).copy()

In [333]:
crt_exp = crt_exp.drop_duplicates()
save_db('crt_exp', crt_exp)