In [2]:
import os
import sqlite3
import pandas as pd
import re
import datetime
import numpy as np
import time
from dateutil.relativedelta import relativedelta

In [3]:
directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))# Connections

# Tools

In [4]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from bs4 import BeautifulSoup
stops = set(stopwords.words("english"))
regex = re.compile('[%s]' % re.escape(string.punctuation))
words = word_tokenize(regex.sub('', 'text sdsd, and sds')) 
words = [word for word in words if word not in stops]

In [185]:
def count_length(body):
    soup = BeautifulSoup(body, 'lxml')
    text = re.sub('\n+', ' ', soup.text.strip()).replace('\xa0', '')
    words = word_tokenize(regex.sub('', text))
    words = [word for word in words if word not in stops]
    return len(words)

def image_count(body):
    soup = BeautifulSoup(body, 'lxml')
    tags = soup.find_all('figure')
    image_count = len(tags)
    for tag in tags:
        if tag.find('div', class_='video-player') != None:
            image_count = image_count - 1
    return image_count

def video_count(body):
    soup = BeautifulSoup(body, 'lxml')
    video_count = len(soup.find_all('div', class_='video-player')) + len(soup.find_all('iframe'))
    return video_count

In [5]:
def save_db(db_name, df_name, *path):
    if len(path) == 1:
        conn_save = sqlite3.connect(directory + path[0] + db_name + '.db')
    else:
#         conn_save = sqlite3.connect(directory + '/' + db_name + '.db')
        conn_save = sqlite3.connect(directory + '/dataPrep/source/overall/' + db_name + '.db')
    df_name.to_sql(name=db_name, con=conn_save, if_exists='replace', index=False)

# Data

In [6]:
conn_proj = sqlite3.connect(directory + '/dataPrep/source/ds_ind_26229/' + 'proj.db')
proj = pd.read_sql_query("SELECT * FROM proj", conn_proj)

conn_crt_exp = sqlite3.connect(directory + '/dataPrep/source/constructs/' + 'crt_exp.db')
crt_exp = pd.read_sql_query("SELECT * FROM crt_exp", conn_crt_exp)

In [7]:
conn_first_last_date = sqlite3.connect(directory + '/dataPrep/source/overall/' + 'first_last_date.db')
first_last_date = pd.read_sql_query("SELECT * FROM first_last_date", conn_first_last_date)

In [8]:
conn_all_date_year = sqlite3.connect(directory + '/dataPrep/source/overall/' + 'all_date_year.db')
all_date_year = pd.read_sql_query("SELECT * FROM all_date_year", conn_all_date_year)

In [9]:
conn_faq_all = sqlite3.connect(directory + '/dataPrep/source/ds_ind_26229/' + 'faq_all.db')
conn_faq_year = sqlite3.connect(directory + '/dataPrep/source/ds_ind_26229/' + 'faq_year.db')
conn_faq_pre = sqlite3.connect(directory + '/dataPrep/source/ds_ind_26229/' + 'faq_pre.db')

faq_all = pd.read_sql_query("SELECT * FROM faq_all", conn_faq_all)
faq_year = pd.read_sql_query("SELECT * FROM faq_year", conn_faq_year)
faq_pre = pd.read_sql_query("SELECT * FROM faq_pre", conn_faq_pre)

In [188]:
conn_upd_all = sqlite3.connect(directory + '/dataPrep/source/ds_ind_26229/' + 'upd_all.db')
upd_all = pd.read_sql_query("SELECT * FROM upd_all", conn_upd_all)

In [187]:
conn_upd_body = sqlite3.connect(directory + '/dataPrep/source/body_name/' + 'upd_body.db')
upd_body = pd.read_sql_query("SELECT * FROM upd_body", conn_upd_body)

In [15]:
len(crt_exp)

2137

In [76]:
len(crt_exp[~crt_exp.pid.isin(upd_body.pid)].pid.drop_duplicates())

4

# has_faq

In [158]:
all_date_year = all_date_year[all_date_year.pid.isin(crt_exp.pid)]
interact_faq_upd = all_date_year[['pid', 'category', 'subcategory']].drop_duplicates().reset_index(drop=True).copy()

# has_faq
proj_has_faq = faq_all.pid.drop_duplicates()
interact_faq_upd['has_faq'] = interact_faq_upd.pid.isin(proj_has_faq)

# has_faq_0
proj_has_faq_0 = faq_pre.pid.drop_duplicates()
interact_faq_upd['has_faq_0'] = interact_faq_upd.pid.isin(proj_has_faq_0)

# has_faq_1
proj_has_faq_1 = faq_year.pid.drop_duplicates()
interact_faq_upd['has_faq_1'] = interact_faq_upd.pid.isin(proj_has_faq_1)

interact_faq_upd = interact_faq_upd[['pid', 'has_faq', 'has_faq_0', 'has_faq_1']]

In [150]:
len(interact_faq_upd)

2137

In [134]:
interact_faq_upd.head(1)

Unnamed: 0,pid,has_faq,has_faq_0,has_faq_1
0,1446685548,False,False,False


# Dates

In [84]:
first_last_date.first_date = first_last_date.first_date.apply(
    lambda str_date: datetime.datetime.strptime(str_date, "%Y-%m-%d %H:%M:%S"))
all_date_year.proj_start_date = all_date_year.proj_start_date.apply(
    lambda str_date: datetime.datetime.strptime(str_date, "%Y-%m-%d"))
all_date_year.proj_end_date = all_date_year.proj_end_date.apply(
    lambda str_date: datetime.datetime.strptime(str_date, "%Y-%m-%d"))

dates = all_date_year[['pid', 'proj_start_date', 'proj_end_date']].drop_duplicates().copy()
dates = dates.merge(first_last_date[['pid', 'first_date']], on=['pid'])
dates['after_d_3'] = dates.first_date.apply(lambda ori_date: ori_date + relativedelta(months=3))
dates['after_d_6'] = dates.first_date.apply(lambda ori_date: ori_date + relativedelta(months=6))
dates['after_d_9'] = dates.first_date.apply(lambda ori_date: ori_date + relativedelta(months=9))
dates['after_d_1'] = dates.first_date.apply(lambda ori_date: ori_date + relativedelta(months=12))

# set intervals for calculating frequencies

dates['interval_0'] = dates.first_date - dates.proj_start_date # before the first delivery date

dates['interval_3'] = dates.after_d_3 - dates.first_date # 1-3
dates['interval_6'] = dates.after_d_6 - dates.first_date # 1-6
dates['interval_9'] = dates.after_d_9 - dates.first_date # 1-9
dates['interval_1'] = dates.after_d_1 - dates.first_date # 1-12

dates['interval'] = dates.interval_0 + dates.interval_1 # whole

dates['interval_0103'] = dates.after_d_3 - dates.first_date # 1-3
dates['interval_0406'] = dates.after_d_6 - dates.after_d_3 # 4-6
dates['interval_0709'] = dates.after_d_9 - dates.after_d_6 # 7-9
dates['interval_1012'] = dates.after_d_1 - dates.first_date # 10-12

dates['interval_0106'] = dates.after_d_6 - dates.first_date # 1-6
dates['interval_0712'] = dates.after_d_1 - dates.after_d_6 # 7-12

dates.interval = dates.interval.apply(lambda td: (td / np.timedelta64(1, 'D')))
dates.interval_0 = dates.interval_0.apply(lambda td: (td / np.timedelta64(1, 'D')))

dates.interval_3 = dates.interval_3.apply(lambda td: (td / np.timedelta64(1, 'D')))
dates.interval_6 = dates.interval_6.apply(lambda td: (td / np.timedelta64(1, 'D')))
dates.interval_9 = dates.interval_9.apply(lambda td: (td / np.timedelta64(1, 'D')))
dates.interval_1 = dates.interval_1.apply(lambda td: (td / np.timedelta64(1, 'D')))

dates.interval_0103 = dates.interval_0103.apply(lambda td: (td / np.timedelta64(1, 'D')))
dates.interval_0406 = dates.interval_0406.apply(lambda td: (td / np.timedelta64(1, 'D')))
dates.interval_0709 = dates.interval_0709.apply(lambda td: (td / np.timedelta64(1, 'D')))
dates.interval_1012 = dates.interval_1012.apply(lambda td: (td / np.timedelta64(1, 'D')))

dates.interval_0106 = dates.interval_0106.apply(lambda td: (td / np.timedelta64(1, 'D')))
dates.interval_0712 = dates.interval_0712.apply(lambda td: (td / np.timedelta64(1, 'D')))

In [85]:
dates_df = dates.copy()
dates_df.proj_start_date = dates_df.apply(lambda row: str(row.proj_start_date), axis=1)
dates_df.proj_end_date = dates_df.apply(lambda row: str(row.proj_end_date), axis=1)
dates_df.first_date = dates_df.apply(lambda row: str(row.first_date), axis=1)
dates_df.after_d_3 = dates_df.apply(lambda row: str(row.after_d_3), axis=1)
dates_df.after_d_6 = dates_df.apply(lambda row: str(row.after_d_6), axis=1)
dates_df.after_d_9 = dates_df.apply(lambda row: str(row.after_d_9), axis=1)
dates_df.after_d_1 = dates_df.apply(lambda row: str(row.after_d_1), axis=1)
# save_db('dates', dates_df)

In [86]:
len(dates)

26229

In [123]:
dates = dates[dates.pid.isin(crt_exp.pid)]

In [191]:
len(dates)

2137

# Update

In [189]:
# drop sys. updates
upd_all['sys'] = upd_all.upd_id.apply(lambda uid: True if 'sys' in uid else False)
upd_all = upd_all[upd_all.sys==False].copy()

# update body

upd_body = upd_body.drop_duplicates()

# For backers only.
upd_body['backer_only'] = upd_body.upd_body=='None' 
backer_only = upd_body[upd_body.backer_only==True].upd_id.tolist()
# stat_backer_only = upd_body.groupby('pid').sum().reset_index().merge(update_count, on=['pid'])
# backer_only_rate = stat_backer_only.backer_only / stat_backer_only.upd_count
upd_body_a = upd_body[~upd_body.upd_id.isin(backer_only)].copy()

# calculate each body
upd_body['body_length'] = upd_body.apply(lambda row: count_length(row.upd_body) 
                                         if row.upd_body != 'None' else 0, axis=1)
upd_body['body_image_count'] = upd_body.apply(lambda row: image_count(row.upd_body), axis=1)
upd_body['body_video_count'] = upd_body.apply(lambda row: video_count(row.upd_body), axis=1)

# all

# upd_count
upd = dates.merge(upd_body.groupby('pid').upd_body.count().reset_index().rename(
    index=str, columns={'upd_body':'upd_count'}), on=['pid'], how='outer')
upd = upd.merge(upd_body_a.groupby('pid').upd_body.count().reset_index().rename(
    index=str, columns={'upd_body':'upd_count_b'}), on=['pid'], how='outer')


# upd_freq
upd['upd_freq'] = upd.interval / upd.upd_count


# upd_avg_len, upd_avg_image, upd_avg_video
upd_stat_all = upd_body.groupby('pid').sum().reset_index().copy() # sum all
upd = upd.merge(upd_stat_all, on=['pid'])
upd = upd.rename(index=str, columns={'body_length': 'upd_len', 'body_image_count': 'upd_image', 
                                     'body_video_count': 'upd_video'})

upd['upd_avg_len'] = upd.upd_len / upd.upd_count_b
upd['upd_avg_image'] = upd.upd_image / upd.upd_count_b
upd['upd_avg_video'] = upd.upd_video / upd.upd_count_b


In [92]:
upd_date_all = upd_all[upd_all.upd_id.isin(upd_body.upd_id)][['pid', 'upd_date', 'upd_id']].copy()
upd_date_all.upd_date = upd_date_all.upd_date.apply(
    lambda str_date: datetime.datetime.strptime(str_date, "%Y-%m-%d %H:%M:%S"))

def inter_upd(name, str_lb, str_ub):
    temp = upd_date_all.merge(dates, on=['pid']).copy()
    temp = temp[(temp.upd_date >= temp[str_lb]) & (temp.upd_date < temp[str_ub])]
    # not backer only
    temp_b = temp[~temp.upd_id.isin(backer_only)].copy()
    
    temp_upd_id = temp[['pid', 'upd_id']].copy()
    
    result = crt_exp[['pid']].merge(dates, on=['pid']).copy()
    result = result.merge(temp.groupby('pid').count().reset_index()[['pid', 'upd_date']].rename(
        index=str, columns={'upd_date':'upd_count_' + name}), on=['pid'], how='outer')
    
    result = result.merge(temp_b.groupby('pid').count().reset_index()[['pid', 'upd_date']].rename(
        index=str, columns={'upd_date':'upd_count_a_' + name}), on=['pid'], how='outer')
    
    # frequency
    result['upd_freq_' + name] = result['interval_' + name] / result['upd_count_' + name]
#     temp = temp.fillna(0)
    
    
    # including body_length, body_image_count, body_video_count
    temp_body = upd_body[upd_body.upd_id.isin(temp_upd_id.upd_id)] 
    temp_body = temp_body.groupby('pid').sum().reset_index().copy()[
        ['pid', 'body_length', 'body_image_count', 'body_video_count']]
    result = result.merge(temp_body, on=['pid'], how='outer').rename(
        index=str, columns={'body_length': 'upd_len_' + name, 
                 'body_image_count': 'upd_image_' + name, 'body_video_count': 'upd_video_' + name})
    
    result['upd_avg_len_' + name] = result['upd_len_' + name] / result['upd_count_a_' + name]
    result['upd_avg_image_' + name] = result['upd_image_' + name] / result['upd_count_a_' + name]
    result['upd_avg_video_' + name] = result['upd_video_' + name] / result['upd_count_a_' + name]
    
    return result

In [125]:
upd_0 = inter_upd('0', 'proj_start_date', 'first_date')

upd_3 = inter_upd('3', 'first_date', 'after_d_3')
upd_6 = inter_upd('6', 'first_date', 'after_d_6')
upd_9 = inter_upd('9', 'first_date', 'after_d_9')
upd_1 = inter_upd('1', 'first_date', 'after_d_1')

upd_0103 = inter_upd('0103', 'first_date', 'after_d_3')
upd_0406 = inter_upd('0406', 'after_d_3', 'after_d_6')
upd_0709 = inter_upd('0709', 'after_d_6', 'after_d_9')
upd_1012 = inter_upd('1012', 'after_d_9', 'after_d_1')

upd_0106 = inter_upd('0106', 'first_date', 'after_d_6')
upd_0712 = inter_upd('0712', 'after_d_6', 'after_d_1')

In [142]:
upd_0709

Unnamed: 0,pid,proj_start_date,first_date,after_d_3,after_d_6,after_d_9,after_d_1,interval_0,interval_3,interval_6,...,interval_0712,upd_count_0709,upd_count_a_0709,upd_freq_0709,upd_len_0709,upd_image_0709,upd_video_0709,upd_avg_len_0709,upd_avg_image_0709,upd_avg_video_0709
0,156564766,2016-02-10,2016-04-01,2016-07-01,2016-10-01,2017-01-01,2017-04-01,51.0,91.0,183.0,...,182.0,,,,,,,,,
1,52884144,2015-12-07,2016-04-01,2016-07-01,2016-10-01,2017-01-01,2017-04-01,116.0,91.0,183.0,...,182.0,,,,,,,,,
2,1770840100,2016-01-16,2016-04-01,2016-07-01,2016-10-01,2017-01-01,2017-04-01,76.0,91.0,183.0,...,182.0,,,,,,,,,
3,1907099186,2016-01-05,2016-03-01,2016-06-01,2016-09-01,2016-12-01,2017-03-01,56.0,92.0,184.0,...,181.0,4.0,,22.750000,0.0,0.0,0.0,,,
4,215985998,2015-04-14,2016-01-01,2016-04-01,2016-07-01,2016-10-01,2017-01-01,262.0,91.0,182.0,...,184.0,,,,,,,,,
5,319905823,2015-11-02,2016-05-01,2016-08-01,2016-11-01,2017-02-01,2017-05-01,181.0,92.0,184.0,...,181.0,1.0,,92.000000,0.0,0.0,0.0,,,
6,145077719,2016-04-27,2016-06-01,2016-09-01,2016-12-01,2017-03-01,2017-06-01,35.0,92.0,183.0,...,182.0,,,,,,,,,
7,1134710774,2015-05-26,2016-04-01,2016-07-01,2016-10-01,2017-01-01,2017-04-01,311.0,91.0,183.0,...,182.0,,,,,,,,,
8,650958390,2016-04-20,2016-06-01,2016-09-01,2016-12-01,2017-03-01,2017-06-01,42.0,92.0,183.0,...,182.0,5.0,5.0,18.000000,950.0,0.0,1.0,190.000000,0.000000,0.2
9,1619196326,2016-04-25,2016-05-01,2016-08-01,2016-11-01,2017-02-01,2017-05-01,6.0,92.0,184.0,...,181.0,2.0,1.0,46.000000,211.0,8.0,0.0,211.000000,8.000000,0.0


In [127]:
len(upd_0)

2137

In [94]:
backer_only_rate = upd_body[upd_body.backer_only==True].groupby(by=['pid']).count()[['upd_body']].reset_index().rename(
    columns={'upd_body':'true'}).merge(upd_body[upd_body.backer_only==False].groupby(
    'pid').count()[['upd_body']].reset_index().rename(columns={'upd_body':'false'}), on='pid')
backer_only_rate['rate'] = backer_only_rate.true /(backer_only_rate.true+backer_only_rate.false)

In [95]:
backer_only_rate.rate.mean()

0.28436961899034863

In [159]:
interact_faq_upd = interact_faq_upd.merge(upd[['pid'] + upd.columns.tolist()[19:]], on=['pid'], how='outer')
interact_faq_upd = interact_faq_upd.merge(upd_0[['pid'] + upd_0.columns.tolist()[19:]], on=['pid'], how='outer')

interact_faq_upd = interact_faq_upd.merge(upd_3[['pid'] + upd_3.columns.tolist()[19:]], on=['pid'], how='outer')
interact_faq_upd = interact_faq_upd.merge(upd_6[['pid'] + upd_6.columns.tolist()[19:]], on=['pid'], how='outer')
interact_faq_upd = interact_faq_upd.merge(upd_9[['pid'] + upd_9.columns.tolist()[19:]], on=['pid'], how='outer')
interact_faq_upd = interact_faq_upd.merge(upd_1[['pid'] + upd_1.columns.tolist()[19:]], on=['pid'], how='outer')

interact_faq_upd = interact_faq_upd.merge(upd_0103[['pid'] + upd_0103.columns.tolist()[19:]], on=['pid'], how='outer')
interact_faq_upd = interact_faq_upd.merge(upd_0406[['pid'] + upd_0406.columns.tolist()[19:]], on=['pid'], how='outer')
interact_faq_upd = interact_faq_upd.merge(upd_0709[['pid'] + upd_0709.columns.tolist()[19:]], on=['pid'], how='outer')
interact_faq_upd = interact_faq_upd.merge(upd_1012[['pid'] + upd_1012.columns.tolist()[19:]], on=['pid'], how='outer')

interact_faq_upd = interact_faq_upd.merge(upd_0106[['pid'] + upd_0106.columns.tolist()[19:]], on=['pid'], how='outer')
interact_faq_upd = interact_faq_upd.merge(upd_0712[['pid'] + upd_0712.columns.tolist()[19:]], on=['pid'], how='outer')

In [160]:
interact_faq_upd.head()

Unnamed: 0,pid,has_faq,has_faq_0,has_faq_1,upd_count,upd_count_b,upd_freq,backer_only,upd_len,upd_image,...,upd_avg_video_0106,upd_count_0712,upd_count_a_0712,upd_freq_0712,upd_len_0712,upd_image_0712,upd_video_0712,upd_avg_len_0712,upd_avg_image_0712,upd_avg_video_0712
0,1369691611,False,False,False,10.0,4.0,47.9,6.0,79.0,0.0,...,,,,,,,,,,
1,1560156136,True,True,False,19.0,13.0,25.263158,6.0,284.0,13.0,...,0.0,,,,,,,,,
2,1122724205,True,True,False,15.0,3.0,26.4,12.0,187.0,6.0,...,0.0,,,,,,,,,
3,1681248498,True,True,False,28.0,28.0,27.857143,0.0,1315.0,41.0,...,0.0,,,,,,,,,
4,1652781117,False,False,False,6.0,4.0,62.166667,2.0,342.0,8.0,...,0.666667,1.0,1.0,183.0,126.0,1.0,0.0,126.0,1.0,0.0


the following only have backer only update...

In [161]:
len(interact_faq_upd)

2137

In [162]:
interact_faq_upd = interact_faq_upd.rename(
        index=str, columns={'avg_len': 'upd_avg_len', 'avg_image': 'upd_avg_image', 'avg_video': 'upd_avg_video'})

In [163]:
save_db('interact_faq_upd', interact_faq_upd)

In [164]:
# we already remove them

# drop unseen projects
# interact_faq_upd_a = interact_faq_upd[~interact_faq_upd.pid.isin(
#     upd_all[~upd_all.pid.isin(upd_body_a.pid)].pid.drop_duplicates().tolist())].copy()

# save_db('interact_faq_upd_a', interact_faq_upd_a)

In [165]:
len(interact_faq_upd_a)

2137