In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import ast
import glob
import tensorflow as tf
from tensorflow import keras

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras import layers

plt.style.use('ggplot')
%matplotlib inline

In [2]:
pd.set_option('display.max_columns', 50)

In [3]:
master = pd.read_csv('data/master_cleaned.csv')

In [4]:
comments = pd.read_csv('data/comments.csv')

In [5]:
dates = pd.read_csv('data/comments_dates.csv', names=['name','url','dates'])

In [6]:
print(comments.shape)
print(dates.shape)

(17642, 3)
(6902, 3)


In [7]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17642 entries, 0 to 17641
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  17642 non-null  int64 
 1   url         17642 non-null  object
 2   comments    17642 non-null  object
dtypes: int64(1), object(2)
memory usage: 413.6+ KB


In [8]:
comments.drop('Unnamed: 0', axis=1, inplace=True)

In [9]:
comments.head()

Unnamed: 0,url,comments
0,https://www.kickstarter.com/projects/117862918...,[]
1,https://www.kickstarter.com/projects/lizardsho...,[]
2,https://www.kickstarter.com/projects/184368767...,[]
3,https://www.kickstarter.com/projects/376746530...,[]
4,https://www.kickstarter.com/projects/116371417...,[]


In [10]:
comments['comments'][11] = []
comments['comments'][12] = []

In [11]:
urls = pd.read_csv('data/comments_urls_copy.csv')

In [12]:
urls.head()

Unnamed: 0.1,Unnamed: 0,id,name,url
0,0,751419376,Iron Age Kingdoms - First Free To Win Mobile S...,https://www.kickstarter.com/projects/117862918...
1,1,289952460,"""Lizard"" a short film",https://www.kickstarter.com/projects/lizardsho...
2,2,37585826,Total Franchise Football 2016,https://www.kickstarter.com/projects/184368767...
3,3,1727301949,The Trans-American Psychogeographic Literary C...,https://www.kickstarter.com/projects/376746530...
4,4,144196412,The Golden Ticket to the Wonka Factory,https://www.kickstarter.com/projects/116371417...


In [13]:
# merge comments table with url tables to get campaign id
full = pd.merge(urls, comments, how='right', on= 'url', )

In [14]:
full.shape

(17642, 5)

In [15]:
full_with_dates = pd.merge(full, dates, on='url')
full_with_dates.drop(columns=['Unnamed: 0', 'name_y'], inplace=True)

In [16]:
full_with_dates.rename(columns={'name_x': 'name'}, inplace=True)


In [17]:

full_with_dates.head()

Unnamed: 0,id,name,url,comments,dates
0,1769794304,Burma Storybook,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,"['June 30, 2017 10:52 AM PDT', 'May 26, 2017 3..."
1,526305087,The Alchemy of Collaboration,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...","['June 1, 2020 10:55 AM PDT', 'May 21, 2020 9:..."
2,1103726466,Frack This? - The Wyoming Artist Expedition,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...","['May 30, 2012 3:37 PM PDT', 'May 14, 2012 2:4..."
3,2045995373,Cookietownworld,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,"['June 23, 2016 8:55 AM PDT']"
4,1228292914,Dunpets Colors: Monster-catching RPG,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...","['April 17, 2018 6:49 PM PDT', 'January 22, 20..."


In [18]:
full_with_dates.shape

(6911, 5)

## Merge with Master

In [19]:
# full df with comments and dates
df = pd.merge(master, full_with_dates, on='name')
df.drop(columns=['Unnamed: 0', 'country_displayable_name'], inplace=True)
df.shape

(6959, 21)

In [20]:
# master list with outer join, with comments or no comments 
df_full = pd.merge(master, full_with_dates, on='name', how='left')
df_full.drop(columns=['Unnamed: 0','country_displayable_name'], inplace=True)
df_full.shape

(169652, 21)

In [21]:
cols_to_convert = ['deadline', 'launched_at']
for c in cols_to_convert:
    df[c] = pd.to_datetime(df[c])

In [22]:
cols_to_convert = ['deadline', 'launched_at']
for c in cols_to_convert:
    df_full[c] = pd.to_datetime(df_full[c])

In [23]:
df['mid_campaign'] = df['launched_at'] + pd.DateOffset(15)
df['mid_campaign'] = df['mid_campaign'].dt.round('d')
df['dates'] = df['dates'].map(lambda x: x.split("',"))
df['comment_length'] = df['dates'].map(lambda x: len(x))

In [24]:
# df_full['mid_campaign'] = df_full['launched_at'] + pd.DateOffset(15)
# df_full['mid_campaign'] = df_full['mid_campaign'].dt.round('d')
# df_full['dates'] = df_full['dates'].map(lambda x: x.split("',"))
# df_full['comment_length'] = df_full['dates'].map(lambda x: len(x))

In [25]:
df

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates,mid_campaign,comment_length
0,152,A photography and poetry book featuring contem...,"{""id"":278,""name"":""People"",""slug"":""photography/...",2017-05-28 17:00:00,15000.0,2017-05-03 16:57:18,Burma Storybook,15777.00,burma-storybook,True,successful,2017,5,25,photography,13,the Netherlands,1769794304,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,"[['June 30, 2017 10:52 AM PDT, 'May 26, 2017 ...",2017-05-19,2
1,116,100 sumie ink originals by David Mack on top o...,"{""id"":276,""name"":""Fine Art"",""slug"":""photograph...",2018-07-29 21:05:01,8000.0,2018-06-29 21:05:01,The Alchemy of Collaboration,16706.00,the-alchemy-of-collaboration,False,successful,2018,6,30,photography,21,the United States,526305087,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...","[['June 1, 2020 10:55 AM PDT, 'May 21, 2020 9...",2018-07-15,21
2,50,"8 artists explore the collision of art, the en...","{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",2012-06-04 03:59:00,3300.0,2012-05-03 12:13:56,Frack This? - The Wyoming Artist Expedition,3805.00,frack-this-the-wyoming-artist-expedition,False,successful,2012,5,32,art,16,the United States,1103726466,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...","[['May 30, 2012 3:37 PM PDT, 'May 14, 2012 2:...",2012-05-19,4
3,8,Cookietownworld is an interactive website desi...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-08-07 16:15:57,95000.0,2016-06-08 16:15:57,Cookietownworld,3229.00,cookietownworld,False,failed,2016,6,60,games,13,the United States,2045995373,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,"[['June 23, 2016 8:55 AM PDT']]",2016-06-24,1
4,59,Explore a magical world full of monsters and d...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-12-22 20:25:20,15000.0,2017-11-22 20:25:20,Dunpets Colors: Monster-catching RPG,4310.75,dunpets-colors-dungeon-crawl-and-pets,True,failed,2017,11,30,games,22,Spain,1228292914,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...","[['April 17, 2018 6:49 PM PDT, 'January 22, 2...",2017-12-08,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6954,39,"A year ago, he was left alone. A young German ...","{""id"":32,""name"":""Shorts"",""slug"":""film & video/...",2018-07-01 15:23:51,4750.0,2018-06-05 15:23:51,A Year Ago Yesterday: A Short Film,4750.00,a-year-ago-yesterday-a-short-film,False,successful,2018,6,26,film & video,23,the United States,1301583231,https://www.kickstarter.com/projects/492701766...,['Very Proud of you Tori!! May God Bless YOU a...,"[['June 26, 2018 3:48 AM PDT']]",2018-06-21,1
6955,26,Help us transform our center into a self-susta...,"{""id"":314,""name"":""Spaces"",""slug"":""food/spaces""...",2018-05-23 04:04:24,10000.0,2018-04-23 04:04:24,Sustainable PopUp Community at Elf Works Retre...,571.00,sustainable-popup-community-at-elf-works-retre...,False,failed,2018,4,30,food,15,the United States,2070909818,https://www.kickstarter.com/projects/elfworksp...,"[""I nominate Sarah Lipuma and Jean Noel Nesta ...","[['April 25, 2018 4:58 PM PDT, 'April 25, 201...",2018-05-08,6
6956,188,Charting 12 years of Abbey Road Studio wall gr...,"{""id"":7,""name"":""Design"",""slug"":""design"",""posit...",2019-10-31 10:07:46,5000.0,2019-10-10 10:07:46,Abbey Road Graffiti Book,7960.00,abbey-road-graffiti-book,False,successful,2019,10,21,design,16,other,94455712,https://www.kickstarter.com/projects/abbeyrd1/...,"['A neat book, thanks!']","[['May 18, 2020 4:39 AM PDT']]",2019-10-25,1
6957,26,A woman returns home to the family she abandon...,"{""id"":293,""name"":""Drama"",""slug"":""film & video/...",2016-01-13 20:59:01,3000.0,2015-12-14 20:59:01,Mary Whitman Comes Home,3000.00,mary-whitman-comes-home,False,successful,2015,12,30,film & video,12,the United States,715308276,https://www.kickstarter.com/projects/981957749...,"[""Saw that you are fully funded!! So excited f...","[['January 13, 2016 3:06 PM PST, 'January 13,...",2015-12-30,4


In [26]:
df_full

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,Canada,,,,
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,the United States,,,,
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,the United States,,,,
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,the United States,,,,
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,the United States,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169647,48,The bee all end all Kickstarter for crazy bitc...,"{""id"":10,""name"":""Food"",""slug"":""food"",""position...",2019-03-29 22:35:40,550.0,2019-02-27 23:35:40,Bitches Bee Crazy Hives,2145.0,bitches-bee-crazy-hives,False,successful,2019,2,30,food,15,the United States,,,,
169648,179,Detective Tori Jones and her partner Officer R...,"{""id"":33,""name"":""Webseries"",""slug"":""film & vid...",2012-05-19 04:00:00,30000.0,2012-04-15 19:31:32,Lesbian Cops: The Movie - Season 2,30045.5,lesbian-cops-the-movie-season-2-0,False,successful,2012,4,33,film & video,19,the United States,,,,
169649,18,Help me produce the play I have written for my...,"{""id"":285,""name"":""Plays"",""slug"":""theater/plays...",2014-09-30 16:00:00,500.0,2014-08-31 14:03:20,Generations (Senior Project),606.0,generations-senior-project,False,successful,2014,8,30,theater,12,the United States,,,,
169650,0,This documentary asks if the twenty-four hour ...,"{""id"":21,""name"":""Digital Art"",""slug"":""art/digi...",2013-06-07 18:39:19,10000.0,2013-05-08 18:39:19,US: A Story of our United States,0.0,us-a-story-of-our-united-states,False,failed,2013,5,30,art,22,the United States,,,,


In [27]:
def extract(cat):
    x = ast.literal_eval(re.search('({.+})', cat).group(0))
    if 'parent_name' not in x.keys():
        return x['slug']
    else:
        return x['parent_name'].lower()

In [28]:
df['category_type'] = df['category'].map(lambda x: extract(x))
df = df[df['state'].isin(['successful', 'failed'])]
df_full['category_type'] = df_full['category'].map(lambda x: extract(x))
df_full = df_full[df_full['state'].isin(['successful', 'failed'])]

In [29]:
print(df.shape)
print(df_full.shape)

(6959, 23)
(169652, 21)


In [30]:
df_full.head()

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,Canada,,,,
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,the United States,,,,
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,the United States,,,,
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,the United States,,,,
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,the United States,,,,


In [31]:
df['dates'][2]

["['May 30, 2012 3:37 PM PDT",
 " 'May 14, 2012 2:46 PM PDT",
 " 'May 12, 2012 9:48 AM PDT",
 " 'May 7, 2012 8:00 PM PDT']"]

In [32]:
def to_date(x):
    lst = []
    for i in x:
        i = i.replace("[", '').replace("]", '')
        d = pd.to_datetime(i[:-4])
        lst.append(d)
    return lst

In [33]:
df['dates'] = df['dates'].map(lambda x: to_date(x))

In [34]:
def date_subtract(df):
    lst = []
    for index, row in df.iterrows():
        row_lst =[]
        r = row['dates']
        for i in r:
            if i < row['mid_campaign']:
                row_lst.append(i)
        lst.append(row_lst)
    return lst 

In [35]:
def real_comments(df):
    lst = []
    for index, row in df.iterrows():
        r = eval(row['comments'])
        a = row['pre_mid_campaign_dates_length']
        if a > 0:
            lst.append(r[-a:])
        else:
            lst.append([])
    return lst


# eval(df['comments'][2])[-3:]

In [36]:
def have_comments(x):
    if x > 0:
        return 1
    else:
        return 0

In [37]:
df['pre_mid_campaign_dates'] = date_subtract(df)
df['pre_mid_campaign_dates_length'] = df['pre_mid_campaign_dates'].apply(lambda x: len(x))
df['have_comments'] = df['pre_mid_campaign_dates_length'].apply(lambda x: have_comments(x))

In [38]:
df['pre_mid_campaign_comments'] = real_comments(df)

In [39]:
df.head()

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates,mid_campaign,comment_length,pre_mid_campaign_dates,pre_mid_campaign_dates_length,have_comments,pre_mid_campaign_comments
0,152,A photography and poetry book featuring contem...,"{""id"":278,""name"":""People"",""slug"":""photography/...",2017-05-28 17:00:00,15000.0,2017-05-03 16:57:18,Burma Storybook,15777.0,burma-storybook,True,successful,2017,5,25,photography,13,the Netherlands,1769794304,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,"[2017-06-30 10:52:00, 2017-05-26 03:53:00]",2017-05-19,2,[],0,0,[]
1,116,100 sumie ink originals by David Mack on top o...,"{""id"":276,""name"":""Fine Art"",""slug"":""photograph...",2018-07-29 21:05:01,8000.0,2018-06-29 21:05:01,The Alchemy of Collaboration,16706.0,the-alchemy-of-collaboration,False,successful,2018,6,30,photography,21,the United States,526305087,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...","[2020-06-01 10:55:00, 2020-05-21 09:46:00, 202...",2018-07-15,21,[],0,0,[]
2,50,"8 artists explore the collision of art, the en...","{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",2012-06-04 03:59:00,3300.0,2012-05-03 12:13:56,Frack This? - The Wyoming Artist Expedition,3805.0,frack-this-the-wyoming-artist-expedition,False,successful,2012,5,32,art,16,the United States,1103726466,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...","[2012-05-30 15:37:00, 2012-05-14 14:46:00, 201...",2012-05-19,4,"[2012-05-14 14:46:00, 2012-05-12 09:48:00, 201...",3,1,"[Lori, the kids will want you and Uncle Michae..."
3,8,Cookietownworld is an interactive website desi...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-08-07 16:15:57,95000.0,2016-06-08 16:15:57,Cookietownworld,3229.0,cookietownworld,False,failed,2016,6,60,games,13,the United States,2045995373,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,[2016-06-23 08:55:00],2016-06-24,1,[2016-06-23 08:55:00],1,1,[I think Cookietownworld is an awesome idea. I...
4,59,Explore a magical world full of monsters and d...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-12-22 20:25:20,15000.0,2017-11-22 20:25:20,Dunpets Colors: Monster-catching RPG,4310.75,dunpets-colors-dungeon-crawl-and-pets,True,failed,2017,11,30,games,22,Spain,1228292914,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...","[2018-04-17 18:49:00, 2018-01-22 01:59:00, 201...",2017-12-08,18,"[2017-12-03 12:14:00, 2017-12-03 07:10:00, 201...",11,1,"[Any progress update on the premiere?, Sorry t..."


In [40]:
df.shape

(6959, 27)

In [41]:
eval(df['comments'][2])[-3:]

['Lori, the kids will want you and Uncle Michael to sign their card!  Enjoy your trip.',
 'My $25 pledge was on behalf of Nina Gibbons, who donated to support this project.  Thanks so much for your donation.',
 'Hope you guys have a great trip! My only request-my brother and Lori have to sign my book!']

In [42]:
df.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates', 'mid_campaign',
       'comment_length', 'pre_mid_campaign_dates',
       'pre_mid_campaign_dates_length', 'have_comments',
       'pre_mid_campaign_comments'],
      dtype='object')

In [43]:
df_full.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates'],
      dtype='object')

In [44]:
df_full.shape

(169652, 21)

In [45]:
df_filtered = df[['name', 'have_comments']].copy()

In [46]:
df_master = pd.merge(df_full, df_filtered, on='name', how='left')
df_master = df_master[df_master['state'].isin(['successful', 'failed'])]
df_master['category_type'] = df_master['category'].map(lambda x: extract(x))
df_master['have_comments'].fillna(0, inplace=True)

In [47]:
df_master['have_comments'].value_counts()

0.0    166600
1.0      3588
Name: have_comments, dtype: int64

## One Hot Encoding 

In [48]:
df_master.head()

Unnamed: 0,backers_count,blurb,category,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,country_name,id,url,comments,dates,have_comments
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,Canada,,,,,0.0
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,the United States,,,,,0.0
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,the United States,,,,,0.0
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,the United States,,,,,0.0
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,the United States,,,,,0.0


In [49]:
df_master_ohe = pd.get_dummies(df_master[['category_type', 'country_name', 'staff_pick']])

In [50]:
df_master.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates', 'have_comments'],
      dtype='object')

In [51]:
data = pd.merge(df_master, df_master_ohe, left_index=True, right_index=True)
data.shape

(170188, 48)

In [52]:
good_data = data[data['state'].isin(['successful', 'failed'])]
live = data[data['state'].isin(['live'])]
print(good_data.shape)
print(live.shape)

(170188, 48)
(0, 48)


In [53]:
good_data['state'] = good_data['state'].replace(to_replace=['successful', 'failed'], value=[1,0])

In [54]:
model_data = good_data.copy()

In [55]:
model_data.shape

(170188, 48)

In [56]:
# model_data.dropna(inplace=True)
# print(model_data.shape)

In [57]:
model_data.columns

Index(['backers_count', 'blurb', 'category', 'deadline', 'goal', 'launched_at',
       'name', 'pledged', 'slug', 'staff_pick_x', 'state', 'launched_year',
       'launched_month', 'campaign_length', 'category_type', 'blurb_length',
       'country_name', 'id', 'url', 'comments', 'dates', 'have_comments',
       'staff_pick_y', 'category_type_art', 'category_type_comics',
       'category_type_crafts', 'category_type_dance', 'category_type_design',
       'category_type_fashion', 'category_type_film & video',
       'category_type_food', 'category_type_games', 'category_type_journalism',
       'category_type_music', 'category_type_photography',
       'category_type_publishing', 'category_type_technology',
       'category_type_theater', 'country_name_Australia',
       'country_name_Canada', 'country_name_France', 'country_name_Germany',
       'country_name_Italy', 'country_name_Mexico', 'country_name_Spain',
       'country_name_other', 'country_name_the Netherlands',
       'count

In [58]:
X = model_data.drop(['backers_count','blurb', 'category', 'deadline', 'launched_at', 
                     'name', 'pledged', 'slug', 'state', 'category_type', 'country_name',
                    'staff_pick_y', 'id', 'url', 'comments', 'dates'],
                   axis=1)
y = model_data['state']


In [59]:
X.shape

(170188, 32)

## Scraped Campaigns Only


In [60]:
df_filtered = df[['name', 'have_comments']].copy()

In [61]:
model_data_ = pd.merge(full, df_filtered, on='name', how='left')
model_data_.fillna(0,inplace=True)


In [63]:
model_data1 = pd.merge(data, model_data_, on='name', how='right')

In [64]:
model_data1.fillna(0, inplace=True)
# model_data1['state'] = model_data1['state'].apply(lambda x: 1 if x == '')
model_data1['state'] = model_data1['state'].replace(to_replace=['successful', 'failed'], value=[1,0])

In [65]:
X1 = model_data1.drop(['backers_count','blurb', 'category', 'deadline', 
                     'launched_at', 'name', 'pledged', 'slug', 'state', 'category_type', 'country_name', 'id_x', 'url_x',
                       'comments_x', 'dates', 'staff_pick_y', 'Unnamed: 0', 'id_y', 'url_y', 'comments_y', 'have_comments_y'],
                   axis=1)
y1 = model_data1['state']*1

## Modeling

In [66]:
## Old model with added "have comments" features. FULL DATASET
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [67]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.735022445765859, 0.7570814120199946, 0.7960721998125738)
    Logistic Regression: (0.6242978353350412, 0.6222361880820431, 0.887503565171332)
    Gradient Boost: (0.7385714621477425, 0.7451768488745981, 0.830949761642831)


In [68]:
## Old model with added "have comments" features. 17K DATAPOINTS, THOSE THAT ARE SCRAPED
X_train, X_test, y_train, y_test = train_test_split(X1, y1)

In [69]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.8112610361485924, 0.8312298935906953, 0.8816272965879265)
    Logistic Regression: (0.6621689155422289, 0.6691343963553531, 0.9251968503937008)
    Gradient Boost: (0.8125937031484258, 0.8270401948842875, 0.8910761154855643)


## LSTM

In [105]:
test_input = df['comments'][:5].values

In [108]:
tokenizer = Tokenizer(num_words=2)
tokenizer.fit_on_texts(test_input)
# xtrain= tokenizer.texts_to_sequences(x_train)
xtest= tokenizer.texts_to_sequences(test_input) 

In [110]:
maxlen=10
# xtrain=pad_sequences(xtrain,padding='post', maxlen=maxlen)
xtest=pad_sequences(xtest,padding='post', maxlen=maxlen)

In [111]:
print(xtest[2])
print(test_input[2])

[1 1 1 1 1 1 0 0 0 0]
['To the Artists:', 'I encourage each artist to be open minded regarding both the environmental consequences and the benefits of oil/gas development.', '', 'I believe you will find that residents of Wyoming, by in large, understand that there is a balance. Also keep in mind that producers are highly regulated in Wyoming and are held accountable, to a large extent, for remediating  damages and restoring of disturbances. I challenge these artists to take into consideration that there is a cost and a benefit to the development in the Red Desert.', 'Lori, the kids will want you and Uncle Michael to sign their card!  Enjoy your trip.', 'My $25 pledge was on behalf of Nina Gibbons, who donated to support this project.  Thanks so much for your donation.', 'Hope you guys have a great trip! My only request-my brother and Lori have to sign my book!']


In [112]:
vocab_size=len(tokenizer.word_index)+1

In [113]:
model=Sequential()

In [114]:
trained_model = keras.models.load_model('test_model')

In [118]:
np.testing.assert_allclose(
  trained_model.predict(xtest),
  trained_model.predict(xtest))

In [119]:
predict = trained_model.predict(xtest)

In [120]:
predict

array([[0.19210683],
       [0.00038114],
       [0.03252702],
       [0.19210683],
       [0.00038114]], dtype=float32)

In [94]:
trained_model

<tensorflow.python.keras.saving.saved_model.load.Sequential at 0x7f8991f05310>

In [98]:
real_model = keras.models.load_model('my_model')

TypeError: Error converting shape to a TensorShape: Dimension value must be integer or None or have an __index__ method, got 'class_name'.