In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import ast
import glob
import tensorflow as tf
from tensorflow import keras

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras import layers

plt.style.use('ggplot')
%matplotlib inline

In [2]:
pd.set_option('display.max_columns', 50)

In [164]:
master = pd.read_csv('data/master_cleaned.csv')

In [165]:
comments = pd.read_csv('data/comments.csv')

In [166]:
dates = pd.read_csv('data/comments_dates.csv', names=['name','url','dates'])

In [167]:
print(comments.shape)
print(dates.shape)

(17642, 3)
(6902, 3)


In [168]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17642 entries, 0 to 17641
Data columns (total 3 columns):
Unnamed: 0    17642 non-null int64
url           17642 non-null object
comments      17642 non-null object
dtypes: int64(1), object(2)
memory usage: 413.6+ KB


In [169]:
comments.drop('Unnamed: 0', axis=1, inplace=True)

In [170]:
comments.head()

Unnamed: 0,url,comments
0,https://www.kickstarter.com/projects/117862918...,[]
1,https://www.kickstarter.com/projects/lizardsho...,[]
2,https://www.kickstarter.com/projects/184368767...,[]
3,https://www.kickstarter.com/projects/376746530...,[]
4,https://www.kickstarter.com/projects/116371417...,[]


In [171]:
comments['comments'][11] = []
comments['comments'][12] = []

In [172]:
urls = pd.read_csv('data/comments_urls_copy.csv')

In [173]:
urls.head()

Unnamed: 0.1,Unnamed: 0,id,name,url
0,0,751419376,Iron Age Kingdoms - First Free To Win Mobile S...,https://www.kickstarter.com/projects/117862918...
1,1,289952460,"""Lizard"" a short film",https://www.kickstarter.com/projects/lizardsho...
2,2,37585826,Total Franchise Football 2016,https://www.kickstarter.com/projects/184368767...
3,3,1727301949,The Trans-American Psychogeographic Literary C...,https://www.kickstarter.com/projects/376746530...
4,4,144196412,The Golden Ticket to the Wonka Factory,https://www.kickstarter.com/projects/116371417...


In [174]:
# merge comments table with url tables to get campaign id
full = pd.merge(urls, comments, how='right', on= 'url', )

In [175]:
full.shape

(17642, 5)

In [176]:
full_with_dates = pd.merge(full, dates, on='url')
full_with_dates.drop(columns=['Unnamed: 0', 'name_y'], inplace=True)

In [177]:
full_with_dates.rename(columns={'name_x': 'name'}, inplace=True)


In [178]:

full_with_dates.head()

Unnamed: 0,id,name,url,comments,dates
0,1769794304,Burma Storybook,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,"['June 30, 2017 10:52 AM PDT', 'May 26, 2017 3..."
1,526305087,The Alchemy of Collaboration,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...","['June 1, 2020 10:55 AM PDT', 'May 21, 2020 9:..."
2,1103726466,Frack This? - The Wyoming Artist Expedition,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...","['May 30, 2012 3:37 PM PDT', 'May 14, 2012 2:4..."
3,2045995373,Cookietownworld,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,"['June 23, 2016 8:55 AM PDT']"
4,1228292914,Dunpets Colors: Monster-catching RPG,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...","['April 17, 2018 6:49 PM PDT', 'January 22, 20..."


In [179]:
full_with_dates.shape

(6911, 5)

## Merge with Master

In [180]:
# full df with comments and dates
df = pd.merge(master, full_with_dates, on='name')
df.drop(columns=['Unnamed: 0', 'country_displayable_name'], inplace=True)
df.shape

(6959, 21)

In [181]:
# master list with outer join, with comments or no comments 
df_full = pd.merge(master, full_with_dates, on='name', how='left')
df_full.drop(columns=['Unnamed: 0','country_displayable_name'], inplace=True)
df_full.shape

(169652, 21)

In [182]:
cols_to_convert = ['deadline', 'launched_at']
for c in cols_to_convert:
    df[c] = pd.to_datetime(df[c])

In [183]:
cols_to_convert = ['deadline', 'launched_at']
for c in cols_to_convert:
    df_full[c] = pd.to_datetime(df_full[c])

In [184]:
df['mid_campaign'] = df['launched_at'] + pd.DateOffset(15)
df['mid_campaign'] = df['mid_campaign'].dt.round('d')
df['dates'] = df['dates'].map(lambda x: x.split("',"))
df['comment_length'] = df['dates'].map(lambda x: len(x))

In [129]:
# df_full['mid_campaign'] = df_full['launched_at'] + pd.DateOffset(15)
# df_full['mid_campaign'] = df_full['mid_campaign'].dt.round('d')
# df_full['dates'] = df_full['dates'].map(lambda x: x.split("',"))
# df_full['comment_length'] = df_full['dates'].map(lambda x: len(x))

In [186]:
df

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates,mid_campaign,comment_length
0,152,A photography and poetry book featuring contem...,"{""id"":278,""name"":""People"",""slug"":""photography/...",2017-05-28 17:00:00,15000.0,2017-05-03 16:57:18,Burma Storybook,15777.00,burma-storybook,True,successful,2017,5,25,photography,13,the Netherlands,1769794304,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,"[['June 30, 2017 10:52 AM PDT, 'May 26, 2017 ...",2017-05-19,2
1,116,100 sumie ink originals by David Mack on top o...,"{""id"":276,""name"":""Fine Art"",""slug"":""photograph...",2018-07-29 21:05:01,8000.0,2018-06-29 21:05:01,The Alchemy of Collaboration,16706.00,the-alchemy-of-collaboration,False,successful,2018,6,30,photography,21,the United States,526305087,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...","[['June 1, 2020 10:55 AM PDT, 'May 21, 2020 9...",2018-07-15,21
2,50,"8 artists explore the collision of art, the en...","{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",2012-06-04 03:59:00,3300.0,2012-05-03 12:13:56,Frack This? - The Wyoming Artist Expedition,3805.00,frack-this-the-wyoming-artist-expedition,False,successful,2012,5,32,art,16,the United States,1103726466,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...","[['May 30, 2012 3:37 PM PDT, 'May 14, 2012 2:...",2012-05-19,4
3,8,Cookietownworld is an interactive website desi...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-08-07 16:15:57,95000.0,2016-06-08 16:15:57,Cookietownworld,3229.00,cookietownworld,False,failed,2016,6,60,games,13,the United States,2045995373,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,"[['June 23, 2016 8:55 AM PDT']]",2016-06-24,1
4,59,Explore a magical world full of monsters and d...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-12-22 20:25:20,15000.0,2017-11-22 20:25:20,Dunpets Colors: Monster-catching RPG,4310.75,dunpets-colors-dungeon-crawl-and-pets,True,failed,2017,11,30,games,22,Spain,1228292914,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...","[['April 17, 2018 6:49 PM PDT, 'January 22, 2...",2017-12-08,18
5,24,I am recreating the I Dream of Jeannie bottle ...,"{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",2013-03-22 18:38:24,1500.0,2013-02-20 19:38:24,My Dream of Jeannie,1780.00,my-dream-of-jeannie,False,successful,2013,2,30,art,17,the United States,1617261531,https://www.kickstarter.com/projects/163694144...,"[""Congratulations on reaching your goal early!...","[['March 17, 2013 3:11 PM PDT, 'March 17, 201...",2013-03-08,5
6,464,Watch our FULL Pilot Short Film Exclusively on...,"{""id"":291,""name"":""Action"",""slug"":""film & video...",2019-12-13 19:00:00,25000.0,2019-11-14 03:01:07,Reign of Judges: FINAL STAND to Fund our Full ...,52760.45,reign-of-judges-final-stand-to-fund-our-full-f...,False,successful,2019,11,30,film & video,13,the United States,540877514,https://www.kickstarter.com/projects/darinsout...,['I gave $25. I wish it was more. I hope that...,"[['December 12, 2019 8:54 PM PST, 'December 1...",2019-11-29,12
7,150,LoveFound | A mind's journey through love | A ...,"{""id"":254,""name"":""Performances"",""slug"":""dance/...",2017-01-11 04:59:00,6000.0,2016-12-07 15:08:28,LoveFound by Jon Rua,13225.00,lovefound-by-jon-rua,True,successful,2016,12,35,dance,23,the United States,506042363,https://www.kickstarter.com/projects/makehisto...,"['Jon, I just watched the film - with earbuds ...","[['January 31, 2017 2:26 PM PST, 'January 9, ...",2016-12-23,3
8,32,"We're going to PAX Australia, who's coming wit...","{""id"":295,""name"":""Festivals"",""slug"":""film & vi...",2018-10-06 14:00:08,3000.0,2018-09-06 14:00:08,Send GenerOZity to the Moon! (Live Performance...,4946.32,send-generozity-to-the-moon-live-performance-a...,False,successful,2018,9,30,film & video,9,Australia,214046313,https://www.kickstarter.com/projects/generozit...,"['All the best guys & girls, see you at PAX!']","[['September 24, 2018 7:25 PM PDT']]",2018-09-22,1
9,305,"2 Days, 1 Outdoor Stage, over 30 performers.\r...","{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2017-07-01 04:04:44,25000.0,2017-06-01 04:04:44,Beats n Bars Festival 2017,29811.00,beats-n-bars-festival-2017,False,successful,2017,6,30,music,21,the United States,1093019685,https://www.kickstarter.com/projects/beatsnbar...,"['Knew you could do it.', 'Congratulations! Th...","[['July 3, 2017 3:23 PM PDT, 'July 1, 2017 8:...",2017-06-16,3


In [185]:
df_full

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.00,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,Canada,,,,
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.00,lizard-a-short-film,False,successful,2019,7,60,film & video,15,the United States,,,,
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.00,total-franchise-football-2016,False,failed,2016,1,60,games,25,the United States,,,,
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.00,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,the United States,,,,
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.00,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,the United States,,,,
5,182,A digital archive of New Orleans hiphop and bo...,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2013-12-22 20:36:09,6000.0,2013-12-02 20:36:09,NOLA Hiphop Archive,6173.05,nola-hiphop-archive,True,successful,2013,12,20,music,18,the United States,,,,
6,114,"For one night only, TOKYOtheCOMPANY performs l...","{""id"":254,""name"":""Performances"",""slug"":""dance/...",2014-09-04 03:59:00,20000.0,2014-08-13 23:45:22,TOKYOtheCOMPANY Presents: Where It Began,20552.00,tokyothecompany-presents-where-it-began,True,successful,2014,8,21,dance,15,the United States,,,,
7,14,What do you get when you mix members of Cypres...,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2014-08-26 00:19:16,3000.0,2014-07-29 00:19:16,"Sen Dog of Cypress Hill presents ""HIP HOP"" by ...",3260.00,sen-dog-of-cypress-hill-presents-hip-hop-by-bi...,True,successful,2014,7,28,music,19,the United States,,,,
8,31,Seattle dance company celebrates six years wit...,"{""id"":254,""name"":""Performances"",""slug"":""dance/...",2018-02-16 19:43:05,1500.0,2018-01-17 19:43:05,Encore Performance,2051.00,encore-performance,False,successful,2018,1,30,dance,11,the United States,,,,
9,1,DJ A.P. has been focusing on his production an...,"{""id"":38,""name"":""Electronic Music"",""slug"":""mus...",2013-08-07 21:10:38,2500.0,2013-07-08 21:10:38,DJ A.P. Debut Studio Album,1.00,dj-ap-debut-studio-album,False,failed,2013,7,30,music,26,the United States,,,,


In [187]:
def extract(cat):
    x = ast.literal_eval(re.search('({.+})', cat).group(0))
    if 'parent_name' not in x.keys():
        return x['slug']
    else:
        return x['parent_name'].lower()

In [191]:
df['category_type'] = df['category'].map(lambda x: extract(x))
df = df[df['state'].isin(['successful', 'failed'])]
df_full['category_type'] = df_full['category'].map(lambda x: extract(x))
df_full = df_full[df_full['state'].isin(['successful', 'failed'])]

In [192]:
print(df.shape)
print(df_full.shape)

(6959, 23)
(169652, 21)


In [136]:
df_full.head()

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates,mid_campaign
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,Canada,,,,0,2017-10-11
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,the United States,,,,0,2019-08-08
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,the United States,,,,0,2016-02-07
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,the United States,,,,0,2012-09-30
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,the United States,,,,0,2017-11-25


In [137]:
df['dates'][2]

["['May 30, 2012 3:37 PM PDT",
 " 'May 14, 2012 2:46 PM PDT",
 " 'May 12, 2012 9:48 AM PDT",
 " 'May 7, 2012 8:00 PM PDT']"]

In [193]:
def to_date(x):
    lst = []
    for i in x:
        i = i.replace("[", '').replace("]", '')
        d = pd.to_datetime(i[:-4])
        lst.append(d)
    return lst

In [194]:
df['dates'] = df['dates'].map(lambda x: to_date(x))

In [195]:
def date_subtract(df):
    lst = []
    for index, row in df.iterrows():
        row_lst =[]
        r = row['dates']
        for i in r:
            if i < row['mid_campaign']:
                row_lst.append(i)
        lst.append(row_lst)
    return lst 

In [196]:
def real_comments(df):
    lst = []
    for index, row in df.iterrows():
        r = eval(row['comments'])
        a = row['pre_mid_campaign_dates_length']
        if a > 0:
            lst.append(r[-a:])
        else:
            lst.append([])
    return lst


# eval(df['comments'][2])[-3:]

In [197]:
def have_comments(x):
    if x > 0:
        return 1
    else:
        return 0

In [198]:
df['pre_mid_campaign_dates'] = date_subtract(df)
df['pre_mid_campaign_dates_length'] = df['pre_mid_campaign_dates'].apply(lambda x: len(x))
df['have_comments'] = df['pre_mid_campaign_dates_length'].apply(lambda x: have_comments(x))

In [199]:
df['pre_mid_campaign_comments'] = real_comments(df)

In [200]:
df.head()

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates,mid_campaign,comment_length,pre_mid_campaign_dates,pre_mid_campaign_dates_length,have_comments,pre_mid_campaign_comments
0,152,A photography and poetry book featuring contem...,"{""id"":278,""name"":""People"",""slug"":""photography/...",2017-05-28 17:00:00,15000.0,2017-05-03 16:57:18,Burma Storybook,15777.0,burma-storybook,True,successful,2017,5,25,photography,13,the Netherlands,1769794304,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,"[2017-06-30 10:52:00, 2017-05-26 03:53:00]",2017-05-19,2,[],0,0,[]
1,116,100 sumie ink originals by David Mack on top o...,"{""id"":276,""name"":""Fine Art"",""slug"":""photograph...",2018-07-29 21:05:01,8000.0,2018-06-29 21:05:01,The Alchemy of Collaboration,16706.0,the-alchemy-of-collaboration,False,successful,2018,6,30,photography,21,the United States,526305087,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...","[2020-06-01 10:55:00, 2020-05-21 09:46:00, 202...",2018-07-15,21,[],0,0,[]
2,50,"8 artists explore the collision of art, the en...","{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",2012-06-04 03:59:00,3300.0,2012-05-03 12:13:56,Frack This? - The Wyoming Artist Expedition,3805.0,frack-this-the-wyoming-artist-expedition,False,successful,2012,5,32,art,16,the United States,1103726466,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...","[2012-05-30 15:37:00, 2012-05-14 14:46:00, 201...",2012-05-19,4,"[2012-05-14 14:46:00, 2012-05-12 09:48:00, 201...",3,1,"[Lori, the kids will want you and Uncle Michae..."
3,8,Cookietownworld is an interactive website desi...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-08-07 16:15:57,95000.0,2016-06-08 16:15:57,Cookietownworld,3229.0,cookietownworld,False,failed,2016,6,60,games,13,the United States,2045995373,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,[2016-06-23 08:55:00],2016-06-24,1,[2016-06-23 08:55:00],1,1,[I think Cookietownworld is an awesome idea. I...
4,59,Explore a magical world full of monsters and d...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-12-22 20:25:20,15000.0,2017-11-22 20:25:20,Dunpets Colors: Monster-catching RPG,4310.75,dunpets-colors-dungeon-crawl-and-pets,True,failed,2017,11,30,games,22,Spain,1228292914,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...","[2018-04-17 18:49:00, 2018-01-22 01:59:00, 201...",2017-12-08,18,"[2017-12-03 12:14:00, 2017-12-03 07:10:00, 201...",11,1,"[Any progress update on the premiere?, Sorry t..."


In [201]:
df.shape

(6959, 27)

In [46]:
eval(df['comments'][2])[-3:]

['Lori, the kids will want you and Uncle Michael to sign their card!  Enjoy your trip.',
 'My $25 pledge was on behalf of Nina Gibbons, who donated to support this project.  Thanks so much for your donation.',
 'Hope you guys have a great trip! My only request-my brother and Lori have to sign my book!']

In [202]:
df.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates', 'mid_campaign',
       'comment_length', 'pre_mid_campaign_dates',
       'pre_mid_campaign_dates_length', 'have_comments',
       'pre_mid_campaign_comments'],
      dtype='object')

In [203]:
df_full.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates'],
      dtype='object')

In [156]:
df_full.shape

(169652, 22)

In [210]:
df_filtered = df[['name', 'have_comments']].copy()

(6959, 2)

In [216]:
df_master = pd.merge(df_full, df_filtered, on='name', how='left')
df_master = df_master[df_master['state'].isin(['successful', 'failed'])]
df_master['category_type'] = df_master['category'].map(lambda x: extract(x))
df_master['have_comments'].fillna(0, inplace=True)

In [221]:
df_master['have_comments'].value_counts()

0.0    166600
1.0      3588
Name: have_comments, dtype: int64

## One Hot Encoding 

In [225]:
df_master.head()

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates,have_comments
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,Canada,,,,,0.0
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,the United States,,,,,0.0
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,the United States,,,,,0.0
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,the United States,,,,,0.0
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,the United States,,,,,0.0


In [226]:
df_master_ohe = pd.get_dummies(df_master[['category_type', 'country_name', 'staff_pick']])

In [227]:
df_master.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates', 'have_comments'],
      dtype='object')

In [234]:
data = pd.merge(df_master, df_master_ohe, left_index=True, right_index=True)
data.shape

(170188, 48)

In [235]:
good_data = data[data['state'].isin(['successful', 'failed'])]
live = data[data['state'].isin(['live'])]
print(good_data.shape)
print(live.shape)

(170188, 48)
(0, 48)


In [236]:
good_data['state'] = good_data['state'].replace(to_replace=['successful', 'failed'], value=[1,0])

In [237]:
model_data = good_data.copy()

In [238]:
model_data.shape

(170188, 48)

In [239]:
# model_data.dropna(inplace=True)
# print(model_data.shape)

In [241]:
model_data.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick_x', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates', 'have_comments',
       'staff_pick_y', 'category_type_art', 'category_type_comics',
       'category_type_crafts', 'category_type_dance', 'category_type_design',
       'category_type_fashion', 'category_type_film & video',
       'category_type_food', 'category_type_games', 'category_type_journalism',
       'category_type_music', 'category_type_photography',
       'category_type_publishing', 'category_type_technology',
       'category_type_theater', 'country_name_Australia',
       'country_name_Canada', 'country_name_France', 'country_name_Germany',
       'country_name_Italy', 'country_name_Mexico', 'country_name_Spain',
       'country_name_other', 'country_name_the Netherlands',
       'count

In [244]:
X = model_data.drop(['backers_count','blurb', 'category', 'deadline', 'launched_at', 
                     'name', 'pledged', 'slug', 'state', 'category_type', 'country_name',
                    'staff_pick_y', 'id', 'url', 'comments', 'dates'],
                   axis=1)
y = model_data['state']


In [245]:
X.shape

(170188, 32)

## Training on Scraped Campaigns Only


In [264]:
df_filtered = df[['name', 'have_comments']].copy()

In [301]:
model_data_ = pd.merge(full, df_filtered, on='name', how='left')
model_data_.fillna(0,inplace=True)


In [277]:
model_data1.head()

Unnamed: 0.1,Unnamed: 0,id,name,url,comments,have_comments
0,0,751419376,Iron Age Kingdoms - First Free To Win Mobile S...,https://www.kickstarter.com/projects/117862918...,[],0.0
1,1,289952460,"""Lizard"" a short film",https://www.kickstarter.com/projects/lizardsho...,[],0.0
2,2,37585826,Total Franchise Football 2016,https://www.kickstarter.com/projects/184368767...,[],0.0
3,3,1727301949,The Trans-American Psychogeographic Literary C...,https://www.kickstarter.com/projects/376746530...,[],0.0
4,4,144196412,The Golden Ticket to the Wonka Factory,https://www.kickstarter.com/projects/116371417...,[],0.0


In [278]:
model_data1 = pd.merge(data, model_data_, on='name', how='right')

In [303]:
model_data1.fillna(0, inplace=True)
# model_data1['state'] = model_data1['state'].apply(lambda x: 1 if x == '')
model_data1['state'] = model_data1['state'].replace(to_replace=['successful', 'failed'], value=[1,0])

In [304]:
X1 = model_data1.drop(['backers_count','blurb', 'category', 'deadline', 
                     'launched_at', 'name', 'pledged', 'slug', 'state', 'category_type', 'country_name', 'id_x', 'url_x',
                       'comments_x', 'dates', 'staff_pick_y', 'Unnamed: 0', 'id_y', 'url_y', 'comments_y', 'have_comments_y'],
                   axis=1)
y1 = model_data1['state']*1

## Modeling

In [246]:
## Old model with added "have comments" features. FULL DATASET
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [248]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.7353985004818201, 0.7583190126542728, 0.7929429061504534)
    Logistic Regression: (0.622441065174983, 0.6177770319404822, 0.9020664869721473)
    Gradient Boost: (0.7413683690977038, 0.7497591700629863, 0.8264314302050151)


In [306]:
## Old model with added "have comments" features. 17K DATAPOINTS, THOSE THAT ARE SCRAPED
X_train, X_test, y_train, y_test = train_test_split(X1, y1)

In [307]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.8157587872730302, 0.8385218931915971, 0.876223221370008)
    Logistic Regression: (0.6591704147926037, 0.6668590113483363, 0.9169531869875694)
    Gradient Boost: (0.8150924537731135, 0.8308149616051523, 0.8870669135149432)


In [319]:
df2 = pd.read_csv('data/comments_2.csv')

In [320]:
def have_comments(x):
    if len(x) > 2:
        return 1
    else:
        return 0

In [321]:
df2['have_comments'] = df2['comments'].apply(lambda x: have_comments(x))

In [325]:
df_comments = df2[df2['have_comments']==1]


In [323]:
df_comments.shape

(667, 399)

In [None]:
df_comments['state'].value_counts()

In [324]:
df_comments[['url', 'comments']].to_csv('df_comments.csv')

In [312]:
x_test = full_with_dates['comments'][:100].values

In [313]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(x_test)
xtest= tokenizer.texts_to_sequences(x_test)


In [314]:
vocab_size=len(tokenizer.word_index)+1

In [315]:
maxlen=10
xtest=pad_sequences(xtest,padding='post', maxlen=maxlen)


In [316]:
model=Sequential()

In [317]:
reconstructed_model = keras.models.load_model('test_model')

OSError: Unable to open file (file read failed: time = Fri Jun  5 11:31:54 2020
, filename = 'test_model', file descriptor = 74, errno = 21, error message = 'Is a directory', buf = 0x7ffeea9c0f68, total read size = 8, bytes this sub-read = 8, bytes actually read = 18446744073709551615, offset = 0)

In [None]:
np.testing.assert_allclose(
  model.predict(xtest),
  reconstructed_model.predict(xtest))