In [75]:
import pandas as pd
import sqlite3
import os
import re
import numpy as np

In [2]:
# Project root: the parent of the current working directory. The data paths in this
# notebook are built by string-concatenating sub-paths onto this value.
directory = os.path.abspath(os.pardir)

# Tools

In [3]:
def pid_count(df, *the_rest):
    """Print how many distinct ``pid`` values each given frame contains.

    Accepts one or more DataFrames that have a ``pid`` column and prints the
    counts on a single line, separated by ``', '``, in argument order.
    Returns ``None``.
    """
    counts = [len(frame.drop_duplicates('pid')) for frame in (df, *the_rest)]
    print(*counts, sep=', ')

In [4]:
def save_db(db_name, df_name, *path):
    """Write a DataFrame to a single-table SQLite file under the project root.

    Parameters
    ----------
    db_name : str
        Used both as the file stem (``<db_name>.db``) and as the table name.
    df_name : pandas.DataFrame
        Frame to store. The table is replaced if it already exists and the
        index is not written.
    *path : str
        Optional sub-directory (relative to ``directory``, with leading and
        trailing slashes). It is honoured only when exactly one value is
        given; otherwise ``'/dataPrep/source/'`` is used.
    """
    sub_dir = path[0] if len(path) == 1 else '/dataPrep/source/'
    conn_save = sqlite3.connect(directory + sub_dir + db_name + '.db')
    try:
        df_name.to_sql(name=db_name, con=conn_save, if_exists='replace', index=False)
    finally:
        # Release the file handle even if the write fails; the original left
        # the connection open for the lifetime of the kernel.
        conn_save.close()

# Data

In [172]:
# Load the four construct tables. Each lives in its own SQLite file whose single
# table has the same name as the file. Connections are closed as soon as the table
# has been read so no file handles stay open for the rest of the session.
conn_proj_factor = sqlite3.connect(directory + '/dataPrep/source/constructs/' + 'proj_factor.db')
try:
    proj_factor = pd.read_sql_query("SELECT * FROM proj_factor", conn_proj_factor)
finally:
    conn_proj_factor.close()

conn_crt_exp = sqlite3.connect(directory + '/dataPrep/source/constructs/' + 'crt_exp.db')
try:
    crt_exp = pd.read_sql_query("SELECT * FROM crt_exp", conn_crt_exp)
finally:
    conn_crt_exp.close()

conn_interact_faq_upd = sqlite3.connect(directory + '/dataPrep/source/constructs/' + 'interact_faq_upd.db')
try:
    interact_faq_upd = pd.read_sql_query("SELECT * FROM interact_faq_upd", conn_interact_faq_upd)
finally:
    conn_interact_faq_upd.close()

conn_interact_cmt = sqlite3.connect(directory + '/dataPrep/source/constructs/' + 'interact_cmt.db')
try:
    interact_cmt = pd.read_sql_query("SELECT * FROM interact_cmt", conn_interact_cmt)
finally:
    conn_interact_cmt.close()

In [8]:
# Sanity check: print the number of distinct pids in each of the four construct tables.
pid_count(proj_factor, crt_exp, interact_faq_upd, interact_cmt)

2137, 2137, 2137, 2137


In [138]:
# Load the project table and keep only the projects that also appear in proj_factor.
conn_proj = sqlite3.connect(directory + '/dataPrep/source/ds_ind_26229/' + 'proj.db')
try:
    proj = pd.read_sql_query("SELECT * FROM proj", conn_proj)
finally:
    # Close the connection once the table is in memory (previously left open).
    conn_proj.close()
proj = proj[proj.pid.isin(proj_factor.pid)].copy()

In [32]:
# Load the rmO_lm_b03 table and its `_neg` companion table. Connections are closed
# right after reading (previously they were left open).
conn_rmO_lm_b03 = sqlite3.connect(directory + '/dataPrep/source/constructs/' + 'rmO_lm_b03.db')
try:
    rmO_lm_b03 = pd.read_sql_query("SELECT * FROM rmO_lm_b03", conn_rmO_lm_b03)
finally:
    conn_rmO_lm_b03.close()

conn_rmO_lm_b03_neg = sqlite3.connect(directory + '/dataPrep/source/constructs/' + 'rmO_lm_b03_neg.db')
try:
    rmO_lm_b03_neg = pd.read_sql_query("SELECT * FROM rmO_lm_b03_neg", conn_rmO_lm_b03_neg)
finally:
    conn_rmO_lm_b03_neg.close()

In [46]:
# Sanity check: distinct pid counts for the project table and the two rmO_lm_b03 tables.
# The third argument used to be `neg_rmO_lm_b03`, a name that is never defined in this
# notebook (NameError on a fresh kernel); the frame loaded above is `rmO_lm_b03_neg`.
pid_count(proj, rmO_lm_b03, rmO_lm_b03_neg)

2137, 2137, 762


In [61]:
# Load the dates table; the connection is closed once the table is in memory
# (previously it was left open).
conn_dates = sqlite3.connect(directory + '/dataPrep/source/overall/' + 'dates.db')
try:
    dates = pd.read_sql_query("SELECT * FROM dates", conn_dates)
finally:
    conn_dates.close()

# Merge

In [142]:
# Keep only the id, date and URL columns of `proj` and normalise their values.
proj = proj[['pid', 'proj_start_date', 'proj_end_date', 'proj_url']].copy()

# Drop the trailing "-hh:mm"-shaped suffix from each timestamp string, then parse the
# remaining "YYYY-MM-DDTHH:MM:SS" text. pd.to_datetime replaces the per-row
# datetime.datetime.strptime calls, which raised NameError because `datetime` is never
# imported in this notebook. The pattern is the original one, now written as a raw
# string so `\d` is not treated as an (invalid) string escape.
proj['proj_start_date'] = pd.to_datetime(
    proj['proj_start_date'].str.replace(r'-.[\d]:.[\d]$', '', regex=True),
    format="%Y-%m-%dT%H:%M:%S")
proj['proj_end_date'] = pd.to_datetime(
    proj['proj_end_date'].str.replace(r'-.[\d]:.[\d]$', '', regex=True),
    format="%Y-%m-%dT%H:%M:%S")

# Remove every space character from the project URL.
proj['proj_url'] = proj['proj_url'].str.replace(' ', '', regex=False)

In [178]:
# Inner-join the project frame with the four construct tables on `pid`, one after another.
mergeAll = (
    proj
    .merge(proj_factor, on=['pid'])
    .merge(crt_exp, on=['pid'])
    .merge(interact_faq_upd, on=['pid'])
    .merge(interact_cmt, on=['pid'])
    .copy()
)

In [180]:
# Attach the comment counts and the neg_* counts/rates from rmO_lm_b03 (inner join on `pid`).
mergeAll = mergeAll.merge(
    rmO_lm_b03.loc[:, [
        'pid', 'cmt_total', 'backer_cmt', 'backer_rate',
        'neg_1', 'neg_2', 'neg_3', 'neg_all', 'neg_2_3',
        'neg_1_rate', 'neg_2_rate', 'neg_3_rate', 'neg_all_rate', 'neg_2_3_rate',
    ]],
    on=['pid'],
)

In [182]:
# Flag severe cases: 1 when neg_2_3_rate is greater than half of neg_all_rate, else 0.
# Rows where either rate is missing compare as False and therefore get 0.
mergeAll['severity'] = (
    mergeAll['neg_2_3_rate'] > mergeAll['neg_all_rate'] * 0.5
).astype('int64')

In [183]:
# Calculate the creation period: whole days between `proj_end_date` and `first_date`
# as recorded in the `dates` table.
creation_period = dates[['pid', 'first_date', 'proj_end_date']].copy()

# Parse both "YYYY-MM-DD HH:MM:SS" columns. pd.to_datetime replaces the per-row
# datetime.datetime.strptime calls, which raised NameError because `datetime` is never
# imported in this notebook.
creation_period['first_date'] = pd.to_datetime(
    creation_period['first_date'], format="%Y-%m-%d %H:%M:%S")
creation_period['proj_end_date'] = pd.to_datetime(
    creation_period['proj_end_date'], format="%Y-%m-%d %H:%M:%S")

# Dividing the timedelta by one day gives fractional days; the integer cast truncates
# toward zero, as the original per-row conversion did.
creation_period['waiting_days'] = (
    (creation_period['first_date'] - creation_period['proj_end_date']) / np.timedelta64(1, 'D')
).astype('int64')

mergeAll = mergeAll.merge(creation_period[['pid', 'waiting_days']], on=['pid'])

In [184]:
# Binary indicators for early-bird rewards and shipping limits:
# 1 when the corresponding rate is greater than 0, otherwise 0 (missing rates give 0).
mergeAll['early_bird'] = (mergeAll['eb_rate'] > 0).astype('int64')

mergeAll['ship_limit'] = (mergeAll['reward_ship_limit_rate'] > 0).astype('int64')

In [185]:
# Inspect the column names of the merged frame.
mergeAll.columns.values

array(['pid', 'proj_start_date', 'proj_end_date', 'proj_url',
       'proj_category', 'proj_subcategory', 'proj_location',
       'proj_currency', 'proj_goal', 'percent_raised', 'duration',
       'proj_backer_count', 'has_collaborator', 'early_bird_count',
       'rew_count', 'eb_rate', 'reward_quota_rate',
       'reward_ship_limit_rate', 'body_length', 'body_image_count',
       'body_video_count', 'cid', 'joined_days', 'firstTime',
       'created_count', 'created_suc_rate', 'backed_count', 'next_suc',
       'has_faq', 'has_faq_0', 'has_faq_1', 'upd_count', 'upd_count_b',
       'upd_freq', 'backer_only', 'upd_len', 'upd_image', 'upd_video',
       'upd_avg_len', 'upd_avg_image', 'upd_avg_video', 'upd_count_0',
       'upd_count_a_0', 'upd_freq_0', 'upd_len_0', 'upd_image_0',
       'upd_video_0', 'upd_avg_len_0', 'upd_avg_image_0',
       'upd_avg_video_0', 'upd_count_3', 'upd_count_a_3', 'upd_freq_3',
       'upd_len_3', 'upd_image_3', 'upd_video_3', 'upd_avg_len_3',
       'upd

In [186]:
# Persist the merged frame as table `mergeAll`; with no path argument save_db writes
# to <directory>/dataPrep/source/mergeAll.db.
save_db('mergeAll', mergeAll)

In [187]:
# Select the analysis columns from the merged frame, in three groups:
#   1. project- and creator-level columns,
#   2. the same nine update/comment measures for each period suffix (0, 1, 0106, 0712),
#   3. the neg_* rates plus the severity flag.
dataset = mergeAll[
    [
        'pid', 'proj_start_date', 'proj_end_date', 'waiting_days',
        'proj_category', 'proj_subcategory', 'proj_url', 'percent_raised',
        'duration', 'proj_backer_count', 'early_bird', 'ship_limit',
        'body_length', 'body_image_count', 'body_video_count', 'has_faq',
        'joined_days', 'firstTime', 'created_count', 'backed_count',
    ]
    + [
        stem + '_' + period
        for period in ('0', '1', '0106', '0712')
        for stem in (
            'upd_count',
            'upd_avg_len', 'upd_avg_image', 'upd_avg_video',
            'cmt_crt_rate', 'cmt_avg_len', 'cmt_avg_rpt_l',
            'cmt_email_rate', 'cmt_rbn_rate',
        )
    ]
    + ['neg_1_rate', 'neg_2_rate', 'neg_3_rate', 'neg_2_3_rate', 'neg_all_rate', 'severity']
]

In [191]:
# Rename the selected columns to their final dataset names.
# - `index=str` additionally converts every row label to a string.
# - Suffix `_0` becomes `_BEDD` and suffix `_1` becomes `_AEDD`; the `_0106` and `_0712`
#   suffixes are kept.
# - `upd_avg_image_0106`, `upd_avg_video_0106`, `upd_avg_image_0712` and
#   `upd_avg_video_0712` are not in the mapping, so they keep their original names.
dataset = dataset.rename(index=str, columns={
    'percent_raised':'percentRaised',
    'duration':'projectDuration',    
    'proj_backer_count':'backers',
    'early_bird':'earlyBird',
    'ship_limit':'limit',
    'body_length':'word_count',
    'body_image_count':'image_count', 
    'body_video_count':'video_count',
    'has_faq':'faqs',
    'joined_days': 'joinDays',
    'firstTime':'novice',
    'created_count':'created',
    'backed_count':'backed',
    'upd_count_0':'updates_BEDD',
    'upd_avg_len_0':'upd_avg_word_BEDD', 
    'upd_avg_image_0':'upd_avg_image_BEDD',
    'upd_avg_video_0':'upd_avg_video_BEDD', 
    'cmt_crt_rate_0':'replyRate_BEDD', 
    'cmt_avg_len_0':'avgWord_BEDD',
    'cmt_avg_rpt_l_0':'avgTime_BEDD', 
    'cmt_email_rate_0':'emailRate_BEDD', 
    'cmt_rbn_rate_0':'nameRate_BEDD', 
    'upd_count_1':'updates_AEDD',
    'upd_avg_len_1':'upd_avg_word_AEDD', 
    'upd_avg_image_1':'upd_avg_image_AEDD', 
    'upd_avg_video_1':'upd_avg_video_AEDD',
    'cmt_crt_rate_1':'replyRate_AEDD', 
    'cmt_avg_len_1':'avgWord_AEDD', 
    'cmt_avg_rpt_l_1':'avgTime_AEDD', 
    'cmt_email_rate_1':'emailRate_AEDD',
    'cmt_rbn_rate_1':'nameRate_AEDD', 
    'upd_count_0106':'updates_0106', 
    'upd_avg_len_0106':'upd_avg_word_0106', 
    'cmt_crt_rate_0106':'replyRate_0106',
    'cmt_avg_len_0106':'avgWord_0106', 
    'cmt_avg_rpt_l_0106':'avgTime_0106', 
    'cmt_email_rate_0106':'emailRate_0106', 
    'cmt_rbn_rate_0106':'nameRate_0106',
    'upd_count_0712':'updates_0712', 
    'upd_avg_len_0712':'upd_avg_word_0712', 
    'cmt_crt_rate_0712':'replyRate_0712', 
    'cmt_avg_len_0712':'avgWord_0712',
    'cmt_avg_rpt_l_0712':'avgTime_0712', 
    'cmt_email_rate_0712':'emailRate_0712', 
    'cmt_rbn_rate_0712':'nameRate_0712',
    'neg_all_rate':'complaintRate'})

In [208]:
# Redefine 'novice':
#   - the creator had already created a project -> 0
#   - the creator had never created a project   -> 1 / days since the creator joined KS
# NOTE(review): this presumes `novice` holds 0/1 before the division - confirm.
# The cell overwrites its own input, so running it twice divides by joinDays twice.
dataset['novice'] = dataset['novice'].div(dataset['joinDays'])

# Null Value

In [211]:
# Print the name of every column of `dataset` that contains at least one null,
# counting them in `count`.
count = 0
for i in dataset.columns.values.tolist():
    if dataset[i].isnull().values.any():
        count += 1
        print(i)

updates_BEDD
upd_avg_word_BEDD
upd_avg_image_BEDD
upd_avg_video_BEDD
replyRate_BEDD
avgWord_BEDD
avgTime_BEDD
emailRate_BEDD
nameRate_BEDD
updates_AEDD
upd_avg_word_AEDD
upd_avg_image_AEDD
upd_avg_video_AEDD
replyRate_AEDD
avgWord_AEDD
avgTime_AEDD
emailRate_AEDD
nameRate_AEDD
updates_0106
upd_avg_word_0106
upd_avg_image_0106
upd_avg_video_0106
replyRate_0106
avgWord_0106
avgTime_0106
emailRate_0106
nameRate_0106
updates_0712
upd_avg_word_0712
upd_avg_image_0712
upd_avg_video_0712
replyRate_0712
avgWord_0712
avgTime_0712
emailRate_0712
nameRate_0712
neg_1_rate
neg_2_rate
neg_3_rate
neg_2_3_rate
complaintRate


In [212]:
# Columns whose missing values are filled with 0 by the fill loop in the next cell:
# the per-period update counts and the neg_* / complaint rates.
fill_0 = [
'updates_BEDD',
'updates_AEDD',
'updates_0106',
'updates_0712',
'neg_1_rate','neg_2_rate','neg_3_rate', 'neg_2_3_rate', 'complaintRate'
]

# Columns whose missing values are filled with the column mean by the later fill loop:
# the reply rates, then one group of seven columns per period (BEDD, AEDD, 0106, 0712).
fill_mean = [
    'replyRate_BEDD','replyRate_AEDD','replyRate_0106','replyRate_0712',
    
    'upd_avg_word_BEDD','upd_avg_image_BEDD','upd_avg_video_BEDD',
    'avgWord_BEDD','avgTime_BEDD','emailRate_BEDD','nameRate_BEDD',
    
    'upd_avg_word_AEDD','upd_avg_image_AEDD','upd_avg_video_AEDD',
    'avgWord_AEDD','avgTime_AEDD','emailRate_AEDD','nameRate_AEDD',
    
    'upd_avg_word_0106','upd_avg_image_0106','upd_avg_video_0106',
    'avgWord_0106','avgTime_0106','emailRate_0106','nameRate_0106',
    
    'upd_avg_word_0712','upd_avg_image_0712','upd_avg_video_0712',
    'avgWord_0712','avgTime_0712','emailRate_0712','nameRate_0712']

In [213]:
# Work on a copy of `dataset` and replace missing values with 0 in every `fill_0` column.
dataSet_fill0 = dataset.copy()
dataSet_fill0[fill_0] = dataSet_fill0[fill_0].fillna(0)

In [214]:
# Build a new frame from the zero-filled one in which every `fill_mean` column has its
# missing values replaced by that column's own mean.
dataSet_fillMean = dataSet_fill0.assign(**{
    col: dataSet_fill0[col].fillna(dataSet_fill0[col].mean())
    for col in fill_mean
})

In [215]:
# Re-run the null check on the imputed frame: print any column that still contains nulls.
count = 0
for i in dataSet_fillMean.columns.values.tolist():
    if dataSet_fillMean[i].isnull().values.any():
        count += 1
        print(i)

In [216]:
# Preview the first rows of the imputed dataset.
dataSet_fillMean.head()

Unnamed: 0,pid,proj_start_date,proj_end_date,waiting_days,proj_category,proj_subcategory,proj_url,percentRaised,projectDuration,backers,...,avgWord_0712,avgTime_0712,emailRate_0712,nameRate_0712,neg_1_rate,neg_2_rate,neg_3_rate,neg_2_3_rate,complaintRate,severity
0,156564766,2016-02-10 09:57:01,2016-03-11 09:57:01,21,design,Product Design,https://www.kickstarter.com/projects/541540245...,1.035771,30,508.0,...,40.6,1.186699,0.0,0.8,0.24,0.13,0.03,0.15,0.37,0
1,52884144,2015-12-07 15:17:35,2016-01-19 15:17:35,73,design,Product Design,https://www.kickstarter.com/projects/123777977...,11.327083,43,674.0,...,25.0,3.236111,0.0,1.0,0.150327,0.137255,0.0,0.137255,0.267974,1
2,1770840100,2016-01-16 16:37:37,2016-03-01 16:37:37,31,design,Product Design,https://www.kickstarter.com/projects/optiscoop...,1.0892,45,228.0,...,30.325107,3.776239,0.103495,0.723373,0.454545,0.090909,0.0,0.090909,0.545455,0
3,1907099186,2016-01-05 21:00:53,2016-02-19 21:00:53,11,technology,Hardware,https://www.kickstarter.com/projects/593018456...,3.53048,45,541.0,...,95.777778,0.207773,0.222222,0.555556,0.171515,0.125376,0.054664,0.166499,0.2999,1
4,215985998,2015-04-14 11:01:06,2015-05-15 01:00:00,231,games,Tabletop Games,https://www.kickstarter.com/projects/ruddygame...,2.2392,30,1049.0,...,30.325107,3.776239,0.103495,0.723373,0.053571,0.035714,0.0,0.035714,0.089286,0


In [217]:
# Persist the imputed dataset twice under <directory>/dataPrep/source/: as table
# `dataset_0515` in dataset_0515.db (via save_db) and as dataset_0515.csv encoded
# with `utf-8-sig`.
# NOTE(review): to_csv is called without index=False, so the row labels are written as
# an extra unnamed first column, while save_db stores the table without the index -
# confirm this difference is intended.
save_db('dataset_0515', dataSet_fillMean)
dataSet_fillMean.to_csv(directory + '/dataPrep/source/' + 'dataset_0515' + '.csv', encoding='utf-8-sig')

In [218]:
# Subset of the imputed dataset whose pid also appears in rmO_lm_b03_neg (independent copy).
dataSet_fillMean_neg = dataSet_fillMean.loc[
    dataSet_fillMean['pid'].isin(rmO_lm_b03_neg['pid'])
].copy()

In [219]:
# Persist the `_neg` subset under <directory>/dataPrep/source/: as table
# `dataset_0515_neg` in dataset_0515_neg.db and as dataset_0515_neg.csv (`utf-8-sig`).
# NOTE(review): the CSV includes the row labels as an unnamed first column because
# index=False is not passed - confirm this is intended.
save_db('dataset_0515_neg', dataSet_fillMean_neg)
dataSet_fillMean_neg.to_csv(directory + '/dataPrep/source/' + 'dataset_0515_neg' + '.csv', encoding='utf-8-sig')

In [220]:
# Number of rows whose pid appears in rmO_lm_b03_neg.
len(dataSet_fillMean_neg)

762

In [221]:
# Complement of the `_neg` subset: every row whose pid is not in dataSet_fillMean_neg.
dataSet_fillMean_pos = dataSet_fillMean.loc[
    ~dataSet_fillMean['pid'].isin(dataSet_fillMean_neg['pid'])
]

In [222]:
# Number of rows whose pid is not in the `_neg` subset.
len(dataSet_fillMean_pos)

1375

In [223]:
# Persist the `_pos` subset under <directory>/dataPrep/source/: as table
# `dataset_0515_pos` in dataset_0515_pos.db and as dataset_0515_pos.csv (`utf-8-sig`).
# NOTE(review): the CSV includes the row labels as an unnamed first column because
# index=False is not passed - confirm this is intended.
save_db('dataset_0515_pos', dataSet_fillMean_pos)
dataSet_fillMean_pos.to_csv(directory + '/dataPrep/source/' + 'dataset_0515_pos' + '.csv', encoding='utf-8-sig')