In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import ast
import glob
import tensorflow as tf
from tensorflow import keras

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras import layers

plt.style.use('ggplot')
%matplotlib inline

In [3]:
pd.set_option('display.max_columns', 50)

In [4]:
master = pd.read_csv('data/master_cleaned.csv')

In [5]:
comments = pd.read_csv('data/comments.csv')

In [6]:
dates = pd.read_csv('data/comments_dates.csv', names=['name','url','dates'])

In [7]:
print(comments.shape)
print(dates.shape)

(18870, 3)
(7636, 3)


In [8]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18870 entries, 0 to 18869
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  18870 non-null  int64 
 1   url         18870 non-null  object
 2   comments    18870 non-null  object
dtypes: int64(1), object(2)
memory usage: 442.4+ KB


In [9]:
comments.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
comments.head()

Unnamed: 0,url,comments
0,https://www.kickstarter.com/projects/117862918...,[]
1,https://www.kickstarter.com/projects/lizardsho...,[]
2,https://www.kickstarter.com/projects/184368767...,[]
3,https://www.kickstarter.com/projects/376746530...,[]
4,https://www.kickstarter.com/projects/116371417...,[]


In [11]:
comments['comments'][11] = []
comments['comments'][12] = []

In [12]:
urls = pd.read_csv('data/comments_urls_copy.csv')

In [13]:
urls.head()

Unnamed: 0.1,Unnamed: 0,id,name,url
0,0,751419376,Iron Age Kingdoms - First Free To Win Mobile S...,https://www.kickstarter.com/projects/117862918...
1,1,289952460,"""Lizard"" a short film",https://www.kickstarter.com/projects/lizardsho...
2,2,37585826,Total Franchise Football 2016,https://www.kickstarter.com/projects/184368767...
3,3,1727301949,The Trans-American Psychogeographic Literary C...,https://www.kickstarter.com/projects/376746530...
4,4,144196412,The Golden Ticket to the Wonka Factory,https://www.kickstarter.com/projects/116371417...


In [14]:
# merge comments table with url tables to get campaign id
full = pd.merge(urls, comments, how='right', on= 'url', )

In [15]:
full.shape

(18870, 5)

In [16]:
full_with_dates = pd.merge(full, dates, on='url')
full_with_dates.drop(columns=['Unnamed: 0', 'name_y'], inplace=True)

In [17]:
full_with_dates.rename(columns={'name_x': 'name'}, inplace=True)


In [18]:

full_with_dates.head()

Unnamed: 0,id,name,url,comments,dates
0,1769794304,Burma Storybook,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,"['June 30, 2017 10:52 AM PDT', 'May 26, 2017 3..."
1,526305087,The Alchemy of Collaboration,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...","['June 1, 2020 10:55 AM PDT', 'May 21, 2020 9:..."
2,1103726466,Frack This? - The Wyoming Artist Expedition,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...","['May 30, 2012 3:37 PM PDT', 'May 14, 2012 2:4..."
3,2045995373,Cookietownworld,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,"['June 23, 2016 8:55 AM PDT']"
4,1228292914,Dunpets Colors: Monster-catching RPG,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...","['April 17, 2018 6:49 PM PDT', 'January 22, 20..."


In [19]:
full_with_dates.shape

(7645, 5)

## Merge with Master

In [20]:
# full df with comments and dates
df = pd.merge(master, full_with_dates, on='name')
df.drop(columns=['Unnamed: 0', 'country_displayable_name'], inplace=True)
df.shape

(7702, 21)

In [21]:
# master list with outer join, with comments or no comments 
df_full = pd.merge(master, full_with_dates, on='name', how='left')
df_full.drop(columns=['Unnamed: 0','country_displayable_name'], inplace=True)
df_full.shape

(169652, 21)

In [22]:
cols_to_convert = ['deadline', 'launched_at']
for c in cols_to_convert:
    df[c] = pd.to_datetime(df[c])

In [23]:
cols_to_convert = ['deadline', 'launched_at']
for c in cols_to_convert:
    df_full[c] = pd.to_datetime(df_full[c])

In [24]:
df['mid_campaign'] = df['launched_at'] + pd.DateOffset(15)
df['mid_campaign'] = df['mid_campaign'].dt.round('d')
df['dates'] = df['dates'].map(lambda x: x.split("',"))
df['comment_length'] = df['dates'].map(lambda x: len(x))

In [25]:
# df_full['mid_campaign'] = df_full['launched_at'] + pd.DateOffset(15)
# df_full['mid_campaign'] = df_full['mid_campaign'].dt.round('d')
# df_full['dates'] = df_full['dates'].map(lambda x: x.split("',"))
# df_full['comment_length'] = df_full['dates'].map(lambda x: len(x))

In [26]:
df

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates,mid_campaign,comment_length
0,152,A photography and poetry book featuring contem...,"{""id"":278,""name"":""People"",""slug"":""photography/...",2017-05-28 17:00:00,15000.0,2017-05-03 16:57:18,Burma Storybook,15777.00,burma-storybook,True,successful,2017,5,25,photography,13,the Netherlands,1769794304,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,"[['June 30, 2017 10:52 AM PDT, 'May 26, 2017 ...",2017-05-19,2
1,116,100 sumie ink originals by David Mack on top o...,"{""id"":276,""name"":""Fine Art"",""slug"":""photograph...",2018-07-29 21:05:01,8000.0,2018-06-29 21:05:01,The Alchemy of Collaboration,16706.00,the-alchemy-of-collaboration,False,successful,2018,6,30,photography,21,the United States,526305087,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...","[['June 1, 2020 10:55 AM PDT, 'May 21, 2020 9...",2018-07-15,21
2,50,"8 artists explore the collision of art, the en...","{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",2012-06-04 03:59:00,3300.0,2012-05-03 12:13:56,Frack This? - The Wyoming Artist Expedition,3805.00,frack-this-the-wyoming-artist-expedition,False,successful,2012,5,32,art,16,the United States,1103726466,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...","[['May 30, 2012 3:37 PM PDT, 'May 14, 2012 2:...",2012-05-19,4
3,8,Cookietownworld is an interactive website desi...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-08-07 16:15:57,95000.0,2016-06-08 16:15:57,Cookietownworld,3229.00,cookietownworld,False,failed,2016,6,60,games,13,the United States,2045995373,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,"[['June 23, 2016 8:55 AM PDT']]",2016-06-24,1
4,59,Explore a magical world full of monsters and d...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-12-22 20:25:20,15000.0,2017-11-22 20:25:20,Dunpets Colors: Monster-catching RPG,4310.75,dunpets-colors-dungeon-crawl-and-pets,True,failed,2017,11,30,games,22,Spain,1228292914,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...","[['April 17, 2018 6:49 PM PDT, 'January 22, 2...",2017-12-08,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7697,9,Handcrafted Goodyear Welt Leather Shoes,"{""id"":266,""name"":""Footwear"",""slug"":""fashion/fo...",2018-07-12 17:35:48,3000.0,2018-06-12 17:35:48,Goodyear Welted Leather Shoes,843.00,goodyear-welted-leather-shoes,False,failed,2018,6,30,fashion,5,the United States,1567021590,https://www.kickstarter.com/projects/104193279...,['Here’s hoping you come back for another try....,"[['July 14, 2018 4:36 PM PDT, 'July 14, 2018 ...",2018-06-28,6
7698,37,The Pastmaster time travel series continues.,"{""id"":47,""name"":""Fiction"",""slug"":""publishing/f...",2013-04-03 17:35:55,5000.0,2013-03-04 18:35:55,The Test of Time: Number Six in the Pastmaster...,5186.99,the-test-of-time-number-six-in-the-pastmaster-...,False,successful,2013,3,30,publishing,6,the United States,277103342,https://www.kickstarter.com/projects/897799656...,"[""I just wanted to let you folks know that I h...","[['July 12, 2013 4:54 AM PDT, 'April 14, 2013...",2013-03-20,4
7699,94,"mcSquares Stickies are Static-cling, dry-erase...","{""id"":353,""name"":""Stationery"",""slug"":""crafts/s...",2018-07-29 11:56:47,500.0,2018-07-10 11:56:47,"Stickies are Dry Erase, Static Cling Stickers ...",2735.00,stickies-are-dry-erase-static-cling-stickers-q...,False,successful,2018,7,19,crafts,19,the United States,639105699,https://www.kickstarter.com/projects/mcsquares...,['I use my stickies daily and they make me hap...,"[['October 23, 2019 8:40 PM PDT, 'January 18,...",2018-07-25,11
7700,2,A married woman's desire for a younger man awa...,"{""id"":293,""name"":""Drama"",""slug"":""film & video/...",2015-04-27 23:16:21,50000.0,2015-02-27 00:58:11,"""The Awakening"" Film Adaptation",101.00,the-awakening-5,False,failed,2015,2,60,film & video,20,the United States,1059520520,https://www.kickstarter.com/projects/awakening...,"[""Your story is engaging and I simply couldn't...","[['March 2, 2015 4:26 AM PST']]",2015-03-14,1


In [27]:
df_full

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,Canada,,,,
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,the United States,,,,
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,the United States,,,,
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,the United States,,,,
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,the United States,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169647,48,The bee all end all Kickstarter for crazy bitc...,"{""id"":10,""name"":""Food"",""slug"":""food"",""position...",2019-03-29 22:35:40,550.0,2019-02-27 23:35:40,Bitches Bee Crazy Hives,2145.0,bitches-bee-crazy-hives,False,successful,2019,2,30,food,15,the United States,,,,
169648,179,Detective Tori Jones and her partner Officer R...,"{""id"":33,""name"":""Webseries"",""slug"":""film & vid...",2012-05-19 04:00:00,30000.0,2012-04-15 19:31:32,Lesbian Cops: The Movie - Season 2,30045.5,lesbian-cops-the-movie-season-2-0,False,successful,2012,4,33,film & video,19,the United States,,,,
169649,18,Help me produce the play I have written for my...,"{""id"":285,""name"":""Plays"",""slug"":""theater/plays...",2014-09-30 16:00:00,500.0,2014-08-31 14:03:20,Generations (Senior Project),606.0,generations-senior-project,False,successful,2014,8,30,theater,12,the United States,,,,
169650,0,This documentary asks if the twenty-four hour ...,"{""id"":21,""name"":""Digital Art"",""slug"":""art/digi...",2013-06-07 18:39:19,10000.0,2013-05-08 18:39:19,US: A Story of our United States,0.0,us-a-story-of-our-united-states,False,failed,2013,5,30,art,22,the United States,,,,


In [28]:
def extract(cat):
    x = ast.literal_eval(re.search('({.+})', cat).group(0))
    if 'parent_name' not in x.keys():
        return x['slug']
    else:
        return x['parent_name'].lower()

In [29]:
df['category_type'] = df['category'].map(lambda x: extract(x))
df = df[df['state'].isin(['successful', 'failed'])]
df_full['category_type'] = df_full['category'].map(lambda x: extract(x))
df_full = df_full[df_full['state'].isin(['successful', 'failed'])]

In [30]:
print(df.shape)
print(df_full.shape)

(7702, 23)
(169652, 21)


In [31]:
df_full.head()

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,Canada,,,,
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,the United States,,,,
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,the United States,,,,
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,the United States,,,,
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,the United States,,,,


In [32]:
df['dates'][2]

["['May 30, 2012 3:37 PM PDT",
 " 'May 14, 2012 2:46 PM PDT",
 " 'May 12, 2012 9:48 AM PDT",
 " 'May 7, 2012 8:00 PM PDT']"]

In [33]:
def to_date(x):
    lst = []
    for i in x:
        i = i.replace("[", '').replace("]", '')
        d = pd.to_datetime(i[:-4])
        lst.append(d)
    return lst

In [34]:
df['dates'] = df['dates'].map(lambda x: to_date(x))

In [35]:
def date_subtract(df):
    lst = []
    for index, row in df.iterrows():
        row_lst =[]
        r = row['dates']
        for i in r:
            if i < row['mid_campaign']:
                row_lst.append(i)
        lst.append(row_lst)
    return lst 

In [36]:
def real_comments(df):
    lst = []
    for index, row in df.iterrows():
        r = eval(row['comments'])
        a = row['pre_mid_campaign_dates_length']
        if a > 0:
            lst.append(r[-a:])
        else:
            lst.append([])
    return lst


# eval(df['comments'][2])[-3:]

In [37]:
def have_comments(x):
    if x > 0:
        return 1
    else:
        return 0

In [38]:
df['pre_mid_campaign_dates'] = date_subtract(df)
df['pre_mid_campaign_dates_length'] = df['pre_mid_campaign_dates'].apply(lambda x: len(x))
df['have_comments'] = df['pre_mid_campaign_dates_length'].apply(lambda x: have_comments(x))

In [39]:
df['pre_mid_campaign_comments'] = real_comments(df)

In [40]:
df.head()

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates,mid_campaign,comment_length,pre_mid_campaign_dates,pre_mid_campaign_dates_length,have_comments,pre_mid_campaign_comments
0,152,A photography and poetry book featuring contem...,"{""id"":278,""name"":""People"",""slug"":""photography/...",2017-05-28 17:00:00,15000.0,2017-05-03 16:57:18,Burma Storybook,15777.0,burma-storybook,True,successful,2017,5,25,photography,13,the Netherlands,1769794304,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,"[2017-06-30 10:52:00, 2017-05-26 03:53:00]",2017-05-19,2,[],0,0,[]
1,116,100 sumie ink originals by David Mack on top o...,"{""id"":276,""name"":""Fine Art"",""slug"":""photograph...",2018-07-29 21:05:01,8000.0,2018-06-29 21:05:01,The Alchemy of Collaboration,16706.0,the-alchemy-of-collaboration,False,successful,2018,6,30,photography,21,the United States,526305087,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...","[2020-06-01 10:55:00, 2020-05-21 09:46:00, 202...",2018-07-15,21,[],0,0,[]
2,50,"8 artists explore the collision of art, the en...","{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",2012-06-04 03:59:00,3300.0,2012-05-03 12:13:56,Frack This? - The Wyoming Artist Expedition,3805.0,frack-this-the-wyoming-artist-expedition,False,successful,2012,5,32,art,16,the United States,1103726466,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...","[2012-05-30 15:37:00, 2012-05-14 14:46:00, 201...",2012-05-19,4,"[2012-05-14 14:46:00, 2012-05-12 09:48:00, 201...",3,1,"[Lori, the kids will want you and Uncle Michae..."
3,8,Cookietownworld is an interactive website desi...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-08-07 16:15:57,95000.0,2016-06-08 16:15:57,Cookietownworld,3229.0,cookietownworld,False,failed,2016,6,60,games,13,the United States,2045995373,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,[2016-06-23 08:55:00],2016-06-24,1,[2016-06-23 08:55:00],1,1,[I think Cookietownworld is an awesome idea. I...
4,59,Explore a magical world full of monsters and d...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-12-22 20:25:20,15000.0,2017-11-22 20:25:20,Dunpets Colors: Monster-catching RPG,4310.75,dunpets-colors-dungeon-crawl-and-pets,True,failed,2017,11,30,games,22,Spain,1228292914,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...","[2018-04-17 18:49:00, 2018-01-22 01:59:00, 201...",2017-12-08,18,"[2017-12-03 12:14:00, 2017-12-03 07:10:00, 201...",11,1,"[Any progress update on the premiere?, Sorry t..."


In [41]:
df.shape

(7702, 27)

In [42]:
eval(df['comments'][2])[-3:]

['Lori, the kids will want you and Uncle Michael to sign their card!  Enjoy your trip.',
 'My $25 pledge was on behalf of Nina Gibbons, who donated to support this project.  Thanks so much for your donation.',
 'Hope you guys have a great trip! My only request-my brother and Lori have to sign my book!']

In [43]:
df.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates', 'mid_campaign',
       'comment_length', 'pre_mid_campaign_dates',
       'pre_mid_campaign_dates_length', 'have_comments',
       'pre_mid_campaign_comments'],
      dtype='object')

In [44]:
df_full.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates'],
      dtype='object')

In [45]:
df_full.shape

(169652, 21)

In [46]:
df_filtered = df[['name', 'have_comments']].copy()

In [47]:
df_master = pd.merge(df_full, df_filtered, on='name', how='left')
df_master = df_master[df_master['state'].isin(['successful', 'failed'])]
df_master['category_type'] = df_master['category'].map(lambda x: extract(x))
df_master['have_comments'].fillna(0, inplace=True)

In [48]:
df_master['have_comments'].value_counts()

0.0    166260
1.0      3970
Name: have_comments, dtype: int64

## One Hot Encoding 

In [49]:
df_master.head()

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates,have_comments
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,Canada,,,,,0.0
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,the United States,,,,,0.0
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,the United States,,,,,0.0
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,the United States,,,,,0.0
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,the United States,,,,,0.0


In [50]:
df_master_ohe = pd.get_dummies(df_master[['category_type', 'country_name', 'staff_pick']])

In [51]:
df_master.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates', 'have_comments'],
      dtype='object')

In [52]:
data = pd.merge(df_master, df_master_ohe, left_index=True, right_index=True)
data.shape

(170230, 48)

In [53]:
good_data = data[data['state'].isin(['successful', 'failed'])]
live = data[data['state'].isin(['live'])]
print(good_data.shape)
print(live.shape)

(170230, 48)
(0, 48)


In [54]:
good_data['state'] = good_data['state'].replace(to_replace=['successful', 'failed'], value=[1,0])

In [55]:
model_data = good_data.copy()

In [56]:
model_data.shape

(170230, 48)

In [57]:
# model_data.dropna(inplace=True)
# print(model_data.shape)

In [58]:
model_data.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick_x', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates', 'have_comments',
       'staff_pick_y', 'category_type_art', 'category_type_comics',
       'category_type_crafts', 'category_type_dance', 'category_type_design',
       'category_type_fashion', 'category_type_film & video',
       'category_type_food', 'category_type_games', 'category_type_journalism',
       'category_type_music', 'category_type_photography',
       'category_type_publishing', 'category_type_technology',
       'category_type_theater', 'country_name_Australia',
       'country_name_Canada', 'country_name_France', 'country_name_Germany',
       'country_name_Italy', 'country_name_Mexico', 'country_name_Spain',
       'country_name_other', 'country_name_the Netherlands',
       'count

In [59]:
X = model_data.drop(['backers_count','blurb', 'category', 'deadline', 'launched_at', 
                     'name', 'pledged', 'slug', 'state', 'category_type', 'country_name',
                    'staff_pick_y', 'id', 'url', 'comments', 'dates'],
                   axis=1)
y = model_data['state']


In [60]:
X.shape

(170230, 32)

## Scraped Campaigns Only


In [61]:
df_filtered = df[['name', 'have_comments']].copy()

In [62]:
model_data_ = pd.merge(full, df_filtered, on='name', how='left')
model_data_.fillna(0,inplace=True)


In [63]:
model_data1 = pd.merge(data, model_data_, on='name', how='right')

In [64]:
model_data1.fillna(0, inplace=True)
model_data1['state'] = model_data1['state'].replace(to_replace=['successful', 'failed'], value=[1,0])

In [65]:
X1 = model_data1.drop(['backers_count','blurb', 'category', 'deadline', 
                     'launched_at', 'name', 'pledged', 'slug', 'state', 'category_type', 'country_name', 'id_x', 'url_x',
                       'comments_x', 'dates', 'staff_pick_y', 'Unnamed: 0', 'id_y', 'url_y', 'comments_y', 'have_comments_y'],
                   axis=1)
y1 = model_data1['state']

In [99]:
df_filtered2 = df[['name', 'have_comments', 'pre_mid_campaign_dates_length']].copy()

In [100]:
df_filtered2

Unnamed: 0,name,have_comments,pre_mid_campaign_dates_length
0,Burma Storybook,0,0
1,The Alchemy of Collaboration,0,0
2,Frack This? - The Wyoming Artist Expedition,1,3
3,Cookietownworld,1,1
4,Dunpets Colors: Monster-catching RPG,1,11
...,...,...,...
7697,Goodyear Welted Leather Shoes,1,1
7698,The Test of Time: Number Six in the Pastmaster...,1,1
7699,"Stickies are Dry Erase, Static Cling Stickers ...",1,1
7700,"""The Awakening"" Film Adaptation",1,1


In [101]:
model_data_2 = pd.merge(full, df_filtered2, on='name', how='left')
model_data_2.fillna(0,inplace=True)

In [102]:
model_data2 = pd.merge(data, model_data_2, on='name', how='right')

In [104]:
model_data2.fillna(0, inplace=True)
model_data2['state'] = model_data2['state'].replace(to_replace=['successful', 'failed'], value=[1,0])

In [108]:
X2 = model_data2.drop(['backers_count','blurb', 'category', 'deadline', 
                     'launched_at', 'name', 'pledged', 'slug', 'state', 'category_type', 'country_name', 'id_x', 'url_x',
                       'comments_x', 'dates', 'staff_pick_y', 'Unnamed: 0', 'id_y', 'url_y', 'comments_y', 'have_comments_y'],
                   axis=1)
y2 = model_data2['state']

## Modeling

In [66]:
## Old model with added "have comments" features. FULL DATASET
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [67]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.7397904036843836, 0.7645704923148943, 0.795478529101388)
    Logistic Regression: (0.6221157009257954, 0.6200538811247684, 0.8967854533647212)
    Gradient Boost: (0.7439259363691902, 0.7545764470466167, 0.8264875395730172)


In [68]:
## Old model with added "have comments" features. 17K DATAPOINTS, THOSE THAT ARE SCRAPED
X_train, X_test, y_train, y_test = train_test_split(X1, y1)

In [69]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.8128627450980392, 0.8371138996138996, 0.8700777526962629)
    Logistic Regression: (0.656156862745098, 0.6630336058128974, 0.9154752947078003)
    Gradient Boost: (0.8037647058823529, 0.8263358778625954, 0.8688236769500878)


In [110]:
## Old model with added "have comments" and "pre_mid_campaign_dates_length" features. 17K DATAPOINTS, THOSE THAT ARE SCRAPED
X_train, X_test, y_train, y_test = train_test_split(X2, y2)

In [111]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.8203921568627451, 0.8391923990498812, 0.8830292426893277)
    Logistic Regression: (0.6734117647058824, 0.6726029861485878, 0.9345163709072731)
    Gradient Boost: (0.8072156862745098, 0.8264248704663213, 0.8770307423144214)


## LSTM

In [161]:
df_lstm = df[df['pre_mid_campaign_comments'].apply(lambda x: len(x)>2)]


In [162]:
def string_convert(x):
    str1 = ''.join(str(e) for e in x)
    return str1 

In [163]:
df_lstm['pre_mid_campaign_comments'] = df_lstm['pre_mid_campaign_comments'].apply(lambda x: string_convert(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [164]:
x = df_lstm['pre_mid_campaign_comments'].values

In [170]:
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(x)
xtest= tokenizer.texts_to_sequences(x) 

In [171]:
maxlen=20
xtest=pad_sequences(xtest,padding='post', maxlen=maxlen)

In [172]:
xtest

array([[37, 65, 18, ...,  2, 16, 70],
       [72, 94, 78, ...,  8, 63,  4],
       [45, 58, 32, ..., 67, 33, 49],
       ...,
       [95,  1, 47, ..., 70,  1, 67],
       [52,  4,  3, ..., 23,  1, 11],
       [93, 30, 50, ..., 24, 12, 31]], dtype=int32)

In [175]:
print(xtest[0])
print(x[0])

[37 65 18 84  7 20 75  8 95 17  4 46 16 99 16  5 17  2 16 70]
Lori, the kids will want you and Uncle Michael to sign their card!  Enjoy your trip.My $25 pledge was on behalf of Nina Gibbons, who donated to support this project.  Thanks so much for your donation.Hope you guys have a great trip! My only request-my brother and Lori have to sign my book!


In [176]:
vocab_size=len(tokenizer.word_index)+1
vocab_size

22856

In [177]:
model=Sequential()

In [178]:
my_model = keras.models.load_model('my_half_model')

In [182]:
predict = my_model.predict(xtest)

In [183]:
predict

array([[0.50482345],
       [0.96277493],
       [0.6321287 ],
       ...,
       [0.44323578],
       [0.7173919 ],
       [0.8861469 ]], dtype=float32)

In [184]:
df_lstm['sentiment_predict'] = predict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [185]:
df_lstm

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates,mid_campaign,comment_length,pre_mid_campaign_dates,pre_mid_campaign_dates_length,have_comments,pre_mid_campaign_comments,sentiment_predict
2,50,"8 artists explore the collision of art, the en...","{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",2012-06-04 03:59:00,3300.0,2012-05-03 12:13:56,Frack This? - The Wyoming Artist Expedition,3805.00,frack-this-the-wyoming-artist-expedition,False,successful,2012,5,32,art,16,the United States,1103726466,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...","[2012-05-30 15:37:00, 2012-05-14 14:46:00, 201...",2012-05-19,4,"[2012-05-14 14:46:00, 2012-05-12 09:48:00, 201...",3,1,"Lori, the kids will want you and Uncle Michael...",0.504823
4,59,Explore a magical world full of monsters and d...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-12-22 20:25:20,15000.0,2017-11-22 20:25:20,Dunpets Colors: Monster-catching RPG,4310.75,dunpets-colors-dungeon-crawl-and-pets,True,failed,2017,11,30,games,22,Spain,1228292914,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...","[2018-04-17 18:49:00, 2018-01-22 01:59:00, 201...",2017-12-08,18,"[2017-12-03 12:14:00, 2017-12-03 07:10:00, 201...",11,1,Any progress update on the premiere?Sorry that...,0.962775
6,464,Watch our FULL Pilot Short Film Exclusively on...,"{""id"":291,""name"":""Action"",""slug"":""film & video...",2019-12-13 19:00:00,25000.0,2019-11-14 03:01:07,Reign of Judges: FINAL STAND to Fund our Full ...,52760.45,reign-of-judges-final-stand-to-fund-our-full-f...,False,successful,2019,11,30,film & video,13,the United States,540877514,https://www.kickstarter.com/projects/darinsout...,['I gave $25. I wish it was more. I hope that...,"[2019-12-12 20:54:00, 2019-12-12 20:45:00, 201...",2019-11-29,12,"[2019-11-18 17:30:00, 2019-11-18 17:29:00, 201...",5,1,"If I had the means to fund the entire project,...",0.632129
11,108,A collection of 97 colour photographs showcasi...,"{""id"":280,""name"":""Photobooks"",""slug"":""photogra...",2017-01-31 18:08:20,6500.0,2016-12-27 18:08:20,Iceland Impressions: photographs by Iwona and ...,6645.00,iceland-impressions-photographs-by-iwona-and-adam,False,successful,2016,12,35,photography,17,other,643790953,https://www.kickstarter.com/projects/iceland-i...,"['Afternoon.', '', 'Just received my copy of t...","[2017-04-26 14:45:00, 2017-03-22 13:53:00, 201...",2017-01-12,15,"[2017-01-05 11:48:00, 2017-01-05 01:50:00, 201...",4,1,Stunning images. I'm looking forward to this b...,0.828198
14,163,Coming out of the Calgary Gay History Research...,"{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",2014-09-16 05:59:00,16200.0,2014-08-25 15:34:36,Writing Calgary's Gay History,20051.00,writing-calgarys-gay-history,True,successful,2014,8,22,publishing,19,Canada,2032631534,https://www.kickstarter.com/projects/485626109...,['Congratulations Kevin on what you have achie...,"[2018-07-31 11:29:00, 2014-09-16 08:57:00, 201...",2014-09-10,10,"[2014-09-09 21:15:00, 2014-09-09 21:14:00, 201...",3,1,"Love love love, hearts hearts hearts!<3YES !!!!!",0.678488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7685,105,James may have been bitten off more than he ca...,"{""id"":252,""name"":""Graphic Novels"",""slug"":""comi...",2016-10-13 11:56:05,1500.0,2016-09-12 11:56:05,Blue - An Australian Cattle Dog Story,3066.00,blue-an-australian-cattle-dog-story,True,successful,2016,9,31,comics,19,Australia,29693622,https://www.kickstarter.com/projects/191742873...,"['British Columbia, Canada here! Received my ...","[2017-03-24 18:00:00, 2017-03-24 11:18:00, 201...",2016-09-27,27,"[2016-09-18 08:17:00, 2016-09-18 08:17:00, 201...",10,1,Stephan fantastic will drop you a message late...,0.250960
7687,44,"Shaka Rising, saga of a legendary Zulu leader,...","{""id"":252,""name"":""Graphic Novels"",""slug"":""comi...",2018-01-11 23:31:04,3000.0,2017-12-12 23:31:04,Shaka Rising: A Legend of the Warrior Prince,3145.00,shaka-rising-a-legend-of-the-warrior-prince,False,successful,2017,12,30,comics,22,the United States,1469636716,https://www.kickstarter.com/projects/kingshaka...,"['Looking forward to this! 👍', 'Recieved min...","[2018-03-07 15:52:00, 2018-03-06 09:14:00, 201...",2017-12-28,14,"[2017-12-18 14:35:00, 2017-12-18 13:25:00, 201...",4,1,"Looks good!Scott Mitchell Rosenberg, from Plat...",0.097364
7689,286,HORIZON is a 144-page collection of fantasy co...,"{""id"":249,""name"":""Anthologies"",""slug"":""comics/...",2014-06-04 20:34:31,7000.0,2014-05-05 20:34:31,Horizon Anthology,8955.41,horizon-anthology,True,successful,2014,5,30,comics,19,the United States,22826244,https://www.kickstarter.com/projects/impkingco...,['Thanks so much Jeremy!! Just wanted to make ...,"[2014-12-03 20:18:00, 2014-12-03 20:16:00, 201...",2014-05-21,19,"[2014-05-20 02:56:00, 2014-05-18 09:41:00, 201...",5,1,Now you guys should put the stretch goals at t...,0.443236
7694,549,The current Sam and Fuzzy series continues wit...,"{""id"":252,""name"":""Graphic Novels"",""slug"":""comi...",2016-12-22 03:00:00,35000.0,2016-11-23 17:52:10,Sam & Fuzzy Vol 5 and New Series Hardcover Set,66299.00,sam-and-fuzzy-vol-5-and-new-series-hardcover-set,True,successful,2016,11,28,comics,23,the United States,1840494906,https://www.kickstarter.com/projects/168734274...,['Hi :) I also did not receive my items. If po...,"[2018-01-02 14:43:00, 2017-12-19 03:15:00, 201...",2016-12-09,80,"[2016-12-02 12:22:00, 2016-11-30 18:29:00, 201...",19,1,Does the Hardcover Completionist tier come wit...,0.717392


In [94]:
trained_model

<tensorflow.python.keras.saving.saved_model.load.Sequential at 0x7f8991f05310>

In [98]:
real_model = keras.models.load_model('my_model')

TypeError: Error converting shape to a TensorShape: Dimension value must be integer or None or have an __index__ method, got 'class_name'.

In [132]:
df_comments = pd.read_csv('data/df_comments.csv', usecols=[0,1,2] )

  interactivity=interactivity, compiler=compiler, result=result)


In [133]:
def have_comments(x):
    if len(x)>2:
        return 1
    else:
        return 0

In [134]:
df_comments['have_comments'] = df_comments.apply(lambda x: have_comments(x))

In [135]:
df_comments

Unnamed: 0.1,Unnamed: 0,url,comments,have_comments
0,0,https://www.kickstarter.com/projects/11648486/...,[],
1,0,https://www.kickstarter.com/projects/210813336...,"['My cards arrived in Oregon, USA. They arriv...",
2,0,https://www.kickstarter.com/projects/ketonatur...,"['Hi Dan, I pledged $15 so I can get a copy of...",
3,0,https://www.kickstarter.com/projects/156496507...,[],
4,0,https://www.kickstarter.com/projects/christoph...,['It has been a year since the last update - i...,
...,...,...,...,...
1225,0,https://www.kickstarter.com/projects/208162880...,"['Please, is it possible to mount a leather or...",
1226,0,https://www.kickstarter.com/projects/big-blank...,['@creator I purchased both the blanket and an...,
1227,0,https://www.kickstarter.com/projects/128928984...,"['Impressed with micro usb GoPro version, now ...",
1228,0,https://www.kickstarter.com/projects/605374786...,[],


In [143]:
comments = df_comments[df_comments['have_comments'] == 1]

In [144]:
comments[['url', 'comments']].to_csv('comments_2.csv')