In [145]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import ast
import nltk
import glob

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

from nltk.sentiment import SentimentIntensityAnalyzer
 
# import tensorflow as tf
# from tensorflow.keras import layers
# keras = tf.keras

plt.style.use('ggplot')
%matplotlib inline

In [244]:
pd.set_option('display.max_columns', 50)

In [245]:
master = pd.read_csv('data/master_cleaned.csv')

In [246]:
comments = pd.read_csv('data/comments.csv')

In [247]:
dates = pd.read_csv('data/comments_dates.csv', names=['name','url','dates'])

In [248]:
comments.shape

(17642, 3)

In [249]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17642 entries, 0 to 17641
Data columns (total 3 columns):
Unnamed: 0    17642 non-null int64
url           17642 non-null object
comments      17642 non-null object
dtypes: int64(1), object(2)
memory usage: 413.6+ KB


In [250]:
def have_comments(x):
    if len(x) > 2:
        return 1
    else:
        return 0

In [251]:
comments['have_comments'] = comments['comments'].map(lambda x: have_comments(x))


In [252]:
comments

Unnamed: 0.1,Unnamed: 0,url,comments,have_comments
0,0,https://www.kickstarter.com/projects/117862918...,[],0
1,0,https://www.kickstarter.com/projects/lizardsho...,[],0
2,0,https://www.kickstarter.com/projects/184368767...,[],0
3,0,https://www.kickstarter.com/projects/376746530...,[],0
4,0,https://www.kickstarter.com/projects/116371417...,[],0
5,0,https://www.kickstarter.com/projects/153660549...,[],0
6,0,https://www.kickstarter.com/projects/205142684...,[],0
7,0,https://www.kickstarter.com/projects/1fifty1/s...,[],0
8,0,https://www.kickstarter.com/projects/relaydanc...,[],0
9,0,https://www.kickstarter.com/projects/132725698...,[],0


In [253]:
comments['comments'][11] = []
comments['comments'][12] = []

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [259]:
urls = pd.read_csv('data/comments_urls_copy.csv')

In [260]:
urls.head()

Unnamed: 0.1,Unnamed: 0,id,name,url
0,0,751419376,Iron Age Kingdoms - First Free To Win Mobile S...,https://www.kickstarter.com/projects/117862918...
1,1,289952460,"""Lizard"" a short film",https://www.kickstarter.com/projects/lizardsho...
2,2,37585826,Total Franchise Football 2016,https://www.kickstarter.com/projects/184368767...
3,3,1727301949,The Trans-American Psychogeographic Literary C...,https://www.kickstarter.com/projects/376746530...
4,4,144196412,The Golden Ticket to the Wonka Factory,https://www.kickstarter.com/projects/116371417...


In [261]:
# merge comments table with url tables to get campaign id
full = pd.merge(urls, comments, how='right', on= 'url', )

In [262]:
full.shape

(17642, 7)

In [263]:
full_with_dates = pd.merge(full, dates, on='url')
full_with_dates.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y', 'name_y'], inplace=True)

In [264]:
full_with_dates.rename(columns={'name_x': 'name'}, inplace=True)


In [265]:
full_with_dates

Unnamed: 0,id,name,url,comments,have_comments,dates
0,1769794304,Burma Storybook,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,1,"['June 30, 2017 10:52 AM PDT', 'May 26, 2017 3..."
1,526305087,The Alchemy of Collaboration,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...",1,"['June 1, 2020 10:55 AM PDT', 'May 21, 2020 9:..."
2,1103726466,Frack This? - The Wyoming Artist Expedition,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...",1,"['May 30, 2012 3:37 PM PDT', 'May 14, 2012 2:4..."
3,2045995373,Cookietownworld,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,1,"['June 23, 2016 8:55 AM PDT']"
4,1228292914,Dunpets Colors: Monster-catching RPG,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...",1,"['April 17, 2018 6:49 PM PDT', 'January 22, 20..."
5,1617261531,My Dream of Jeannie,https://www.kickstarter.com/projects/163694144...,"[""Congratulations on reaching your goal early!...",1,"['March 17, 2013 3:11 PM PDT', 'March 17, 2013..."
6,540877514,Reign of Judges: FINAL STAND to Fund our Full ...,https://www.kickstarter.com/projects/darinsout...,['I gave $25. I wish it was more. I hope that...,1,"['December 12, 2019 8:54 PM PST', 'December 12..."
7,506042363,LoveFound by Jon Rua,https://www.kickstarter.com/projects/makehisto...,"['Jon, I just watched the film - with earbuds ...",1,"['January 31, 2017 2:26 PM PST', 'January 9, 2..."
8,214046313,Send GenerOZity to the Moon! (Live Performance...,https://www.kickstarter.com/projects/generozit...,"['All the best guys & girls, see you at PAX!']",1,"['September 24, 2018 7:25 PM PDT']"
9,1093019685,Beats n Bars Festival 2017,https://www.kickstarter.com/projects/beatsnbar...,"['Knew you could do it.', 'Congratulations! Th...",1,"['July 3, 2017 3:23 PM PDT', 'July 1, 2017 8:4..."


## Merge with Master

In [266]:
# full df with comments and dates
df = pd.merge(master, full_with_dates, on='name')
df.drop(columns=['Unnamed: 0', 'country_name'], inplace=True)
df.shape

(2518, 22)

In [267]:
# master list with outer join, with comments or no comments 
df_full = pd.merge(master, full, on='name', how='left')
df_full.drop(columns=['Unnamed: 0', 'Unnamed: 0_x', 'Unnamed: 0_y','country_name'], inplace=True)
df_full.shape

(169626, 21)

In [268]:
df_full.head()

Unnamed: 0,backers_count,blurb,category,country_displayable_name,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,id,url,comments,have_comments
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",Canada,2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,751419400.0,https://www.kickstarter.com/projects/117862918...,[],0.0
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",the United States,2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,289952500.0,https://www.kickstarter.com/projects/lizardsho...,[],0.0
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",the United States,2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,37585830.0,https://www.kickstarter.com/projects/184368767...,[],0.0
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",the United States,2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,1727302000.0,https://www.kickstarter.com/projects/376746530...,[],0.0
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",the United States,2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,144196400.0,https://www.kickstarter.com/projects/116371417...,[],0.0


In [269]:
df_full['have_comments'].value_counts()

0.0    10185
1.0     7581
Name: have_comments, dtype: int64

In [270]:
cols_to_convert = ['deadline', 'launched_at']
for c in cols_to_convert:
    df[c] = pd.to_datetime(df[c])

In [271]:
cols_to_convert = ['deadline', 'launched_at']
for c in cols_to_convert:
    df_full[c] = pd.to_datetime(df_full[c])

In [272]:
df['mid_campaign'] = df['launched_at'] + pd.DateOffset(15)
df['mid_campaign'] = df['mid_campaign'].dt.round('d')
df['dates'] = df['dates'].map(lambda x: x.split("',"))
df['comment_length'] = df['dates'].map(lambda x: len(x))

In [273]:
df_full['mid_campaign'] = df_full['launched_at'] + pd.DateOffset(15)
df_full['mid_campaign'] = df_full['mid_campaign'].dt.round('d')


In [274]:
def country_cat(x):
    country_list = ['the United States', 'the United Kingdom ', 'Canada', 'Australia',
                   'Germany', 'France', 'Mexico', 'Italy', 'Spain', 'the Netherlands']
    for country in x:
        if country in country_list:
            return country
        else:
            return 'other'

In [275]:
def extract(cat):
    x = ast.literal_eval(re.search('({.+})', cat).group(0))
    if 'parent_name' not in x.keys():
        return x['slug']
    else:
        return x['parent_name'].lower()

In [276]:
df['country_name'] = df['country_displayable_name'].map(lambda x: country_cat([x]))
df_full['country_name'] = df_full['country_displayable_name'].map(lambda x: country_cat([x]))

In [277]:
df['category_type'] = df['category'].map(lambda x: extract(x))
df = df[df['state'].isin(['successful', 'failed'])]

In [278]:
df_full['category_type'] = df_full['category'].map(lambda x: extract(x))
df_full = df_full[df_full['state'].isin(['successful', 'failed'])]

In [281]:
df_full.head()

Unnamed: 0,backers_count,blurb,category,country_displayable_name,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,id,url,comments,have_comments,mid_campaign,country_name
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",Canada,2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,751419400.0,https://www.kickstarter.com/projects/117862918...,[],0.0,2017-10-11,Canada
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",the United States,2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,289952500.0,https://www.kickstarter.com/projects/lizardsho...,[],0.0,2019-08-08,the United States
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",the United States,2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,37585830.0,https://www.kickstarter.com/projects/184368767...,[],0.0,2016-02-07,the United States
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",the United States,2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,1727302000.0,https://www.kickstarter.com/projects/376746530...,[],0.0,2012-09-30,the United States
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",the United States,2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,144196400.0,https://www.kickstarter.com/projects/116371417...,[],0.0,2017-11-25,the United States


In [282]:
df['dates'][2]

["['May 30, 2012 3:37 PM PDT",
 " 'May 14, 2012 2:46 PM PDT",
 " 'May 12, 2012 9:48 AM PDT",
 " 'May 7, 2012 8:00 PM PDT']"]

In [283]:
def to_date(x):
    lst = []
    for i in x:
        i = i.replace("[", '').replace("]", '')
        d = pd.to_datetime(i[:-4])
        lst.append(d)
    return lst

In [284]:
df['dates'] = df['dates'].map(lambda x: to_date(x))


In [285]:
def date_subtract(df):
    lst = []
    for index, row in df.iterrows():
        row_lst =[]
        r = row['dates']
        for i in r:
            if i < row['mid_campaign']:
                row_lst.append(i)
        lst.append(row_lst)
    return lst 

In [286]:
def real_comments(df):
    lst = []
    for index, row in df.iterrows():
        r = eval(row['comments'])
        a = row['pre_mid_campaign_dates_length']
        if a > 0:
            lst.append(r[-a:])
        else:
            lst.append([])
    return lst


# eval(df['comments'][2])[-3:]

In [287]:
df['pre_mid_campaign_dates'] = date_subtract(df)
df['pre_mid_campaign_dates_length'] = df['pre_mid_campaign_dates'].apply(lambda x: len(x))

In [288]:
df['pre_mid_campaign_comments'] = real_comments(df)

In [289]:
df

Unnamed: 0,backers_count,blurb,category,country_displayable_name,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,id,url,comments,have_comments,dates,mid_campaign,comment_length,country_name,pre_mid_campaign_dates,pre_mid_campaign_dates_length,pre_mid_campaign_comments
0,152,A photography and poetry book featuring contem...,"{""id"":278,""name"":""People"",""slug"":""photography/...",the Netherlands,2017-05-28 17:00:00,15000.0,2017-05-03 16:57:18,Burma Storybook,15777.00,burma-storybook,True,successful,2017,5,25,photography,13,1769794304,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,1,"[2017-06-30 10:52:00, 2017-05-26 03:53:00]",2017-05-19,2,the Netherlands,[],0,[]
1,116,100 sumie ink originals by David Mack on top o...,"{""id"":276,""name"":""Fine Art"",""slug"":""photograph...",the United States,2018-07-29 21:05:01,8000.0,2018-06-29 21:05:01,The Alchemy of Collaboration,16706.00,the-alchemy-of-collaboration,False,successful,2018,6,30,photography,21,526305087,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...",1,"[2020-06-01 10:55:00, 2020-05-21 09:46:00, 202...",2018-07-15,21,the United States,[],0,[]
2,50,"8 artists explore the collision of art, the en...","{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",the United States,2012-06-04 03:59:00,3300.0,2012-05-03 12:13:56,Frack This? - The Wyoming Artist Expedition,3805.00,frack-this-the-wyoming-artist-expedition,False,successful,2012,5,32,art,16,1103726466,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...",1,"[2012-05-30 15:37:00, 2012-05-14 14:46:00, 201...",2012-05-19,4,the United States,"[2012-05-14 14:46:00, 2012-05-12 09:48:00, 201...",3,"[Lori, the kids will want you and Uncle Michae..."
3,8,Cookietownworld is an interactive website desi...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",the United States,2016-08-07 16:15:57,95000.0,2016-06-08 16:15:57,Cookietownworld,3229.00,cookietownworld,False,failed,2016,6,60,games,13,2045995373,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,1,[2016-06-23 08:55:00],2016-06-24,1,the United States,[2016-06-23 08:55:00],1,[I think Cookietownworld is an awesome idea. I...
4,59,Explore a magical world full of monsters and d...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",Spain,2017-12-22 20:25:20,15000.0,2017-11-22 20:25:20,Dunpets Colors: Monster-catching RPG,4310.75,dunpets-colors-dungeon-crawl-and-pets,True,failed,2017,11,30,games,22,1228292914,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...",1,"[2018-04-17 18:49:00, 2018-01-22 01:59:00, 201...",2017-12-08,18,Spain,"[2017-12-03 12:14:00, 2017-12-03 07:10:00, 201...",11,"[Any progress update on the premiere?, Sorry t..."
5,24,I am recreating the I Dream of Jeannie bottle ...,"{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",the United States,2013-03-22 18:38:24,1500.0,2013-02-20 19:38:24,My Dream of Jeannie,1780.00,my-dream-of-jeannie,False,successful,2013,2,30,art,17,1617261531,https://www.kickstarter.com/projects/163694144...,"[""Congratulations on reaching your goal early!...",1,"[2013-03-17 15:11:00, 2013-03-17 13:30:00, 201...",2013-03-08,5,the United States,[],0,[]
6,464,Watch our FULL Pilot Short Film Exclusively on...,"{""id"":291,""name"":""Action"",""slug"":""film & video...",the United States,2019-12-13 19:00:00,25000.0,2019-11-14 03:01:07,Reign of Judges: FINAL STAND to Fund our Full ...,52760.45,reign-of-judges-final-stand-to-fund-our-full-f...,False,successful,2019,11,30,film & video,13,540877514,https://www.kickstarter.com/projects/darinsout...,['I gave $25. I wish it was more. I hope that...,1,"[2019-12-12 20:54:00, 2019-12-12 20:45:00, 201...",2019-11-29,12,the United States,"[2019-11-18 17:30:00, 2019-11-18 17:29:00, 201...",5,[If I had the means to fund the entire project...
7,150,LoveFound | A mind's journey through love | A ...,"{""id"":254,""name"":""Performances"",""slug"":""dance/...",the United States,2017-01-11 04:59:00,6000.0,2016-12-07 15:08:28,LoveFound by Jon Rua,13225.00,lovefound-by-jon-rua,True,successful,2016,12,35,dance,23,506042363,https://www.kickstarter.com/projects/makehisto...,"['Jon, I just watched the film - with earbuds ...",1,"[2017-01-31 14:26:00, 2017-01-09 22:44:00, 201...",2016-12-23,3,the United States,[],0,[]
8,32,"We're going to PAX Australia, who's coming wit...","{""id"":295,""name"":""Festivals"",""slug"":""film & vi...",Australia,2018-10-06 14:00:08,3000.0,2018-09-06 14:00:08,Send GenerOZity to the Moon! (Live Performance...,4946.32,send-generozity-to-the-moon-live-performance-a...,False,successful,2018,9,30,film & video,9,214046313,https://www.kickstarter.com/projects/generozit...,"['All the best guys & girls, see you at PAX!']",1,[2018-09-24 19:25:00],2018-09-22,1,Australia,[],0,[]
9,305,"2 Days, 1 Outdoor Stage, over 30 performers.\r...","{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",the United States,2017-07-01 04:04:44,25000.0,2017-06-01 04:04:44,Beats n Bars Festival 2017,29811.00,beats-n-bars-festival-2017,False,successful,2017,6,30,music,21,1093019685,https://www.kickstarter.com/projects/beatsnbar...,"['Knew you could do it.', 'Congratulations! Th...",1,"[2017-07-03 15:23:00, 2017-07-01 08:47:00, 201...",2017-06-16,3,the United States,[2017-06-07 12:26:00],1,[Congratulations! Thanks for putting your visi...


In [290]:
a = date_subtract(df)

In [291]:
len(a)

2518

In [187]:
df['comments'][4]

'[\'Any updates about the premier? :)\', \'Any progress update on the premiere?\', "Sorry that you didn\'t reach your funding goal and glad to hear you will still be working on it. Will there be a way to pay for the removal of ads in the version you will be releasing?", \'What happens to those players who signed up to play on the premiere on iOS? Will it continue receiving updates or just time out?\', \'If this doesn’t get funded will it not happen?\', \'Can you post updates to the updates section?\', \'This game is so fun! Hope more people will catch this good game :)\', \'Ahh ok thanks for the clarification :)\', \'Will the progress made on the demo version be transferable to the final game? Or would we have to start over again?\', "That is excellent news, thank you! I\'m hoping for the pc version but would play it on my phone if that goal isn\'t reached. Either way, fun game so far!", \'If the game makes it to the steam version, will we have more controls available than just the arr

In [188]:
df['comments'][2]

"['To the Artists:', 'I encourage each artist to be open minded regarding both the environmental consequences and the benefits of oil/gas development.', '', 'I believe you will find that residents of Wyoming, by in large, understand that there is a balance. Also keep in mind that producers are highly regulated in Wyoming and are held accountable, to a large extent, for remediating  damages and restoring of disturbances. I challenge these artists to take into consideration that there is a cost and a benefit to the development in the Red Desert.', 'Lori, the kids will want you and Uncle Michael to sign their card!  Enjoy your trip.', 'My $25 pledge was on behalf of Nina Gibbons, who donated to support this project.  Thanks so much for your donation.', 'Hope you guys have a great trip! My only request-my brother and Lori have to sign my book!']"

In [None]:
df['pre_mid_campaign_dates'][2]

In [None]:
df['pre_mid_campaign_dates'][5]

In [None]:
df['dates'][4]

In [None]:
eval(df['comments'][2])[-3:]

In [None]:
eval(df['comments'][5])[-11:]

In [None]:
df['comments'][2]

## One Hot Encoding 

In [189]:
df_full.head()

Unnamed: 0,backers_count,blurb,category,country_displayable_name,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,id,url,comments,have_comments,mid_campaign,country_name
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",Canada,2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,751419400.0,https://www.kickstarter.com/projects/117862918...,[],0.0,2017-10-11,Canada
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",the United States,2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,289952500.0,https://www.kickstarter.com/projects/lizardsho...,[],0.0,2019-08-08,the United States
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",the United States,2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,37585830.0,https://www.kickstarter.com/projects/184368767...,[],0.0,2016-02-07,the United States
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",the United States,2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,1727302000.0,https://www.kickstarter.com/projects/376746530...,[],0.0,2012-09-30,the United States
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",the United States,2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,144196400.0,https://www.kickstarter.com/projects/116371417...,[],0.0,2017-11-25,the United States


In [190]:
df_full_ohe = pd.get_dummies(df_full[['category_type', 'country_name', 'staff_pick']])

In [191]:
df_full.columns

Index(['backers_count', 'blurb', 'category', 'country_displayable_name',
       'deadline', 'goal', 'launched_at', 'name', 'pledged', 'slug',
       'staff_pick', 'state', 'launched_year', 'launched_month',
       'campaign_length', 'category_type', 'blurb_length', 'id', 'url',
       'comments', 'have_comments', 'mid_campaign', 'country_name'],
      dtype='object')

In [193]:
data = pd.merge(df_full, df_full_ohe, left_index=True, right_index=True)
data.shape

(169626, 49)

In [194]:
good_data = data[data['state'].isin(['successful', 'failed'])]
live = data[data['state'].isin(['live'])]
print(good_data.shape)
print(live.shape)

(169626, 49)
(0, 49)


In [195]:
good_data['state'] = good_data['state'].replace(to_replace=['successful', 'failed'], value=[1,0])

In [196]:
model_data = good_data.copy()

In [301]:
model_data.shape

(17748, 49)

In [302]:
model_data.dropna(inplace=True)
print(model_data.shape)

(17748, 49)


In [303]:
model_data['have_comments'].value_counts()

0.0    10175
1.0     7573
Name: have_comments, dtype: int64

In [199]:
X = model_data.drop(['backers_count','blurb', 'category', 'country_displayable_name', 'deadline', 
                     'launched_at', 'name', 'pledged', 'slug', 'state', 'category_type', 'country_name',
                    'staff_pick_y', 'id', 'url', 'comments', 'mid_campaign', 'country_name'],
                   axis=1)
y = model_data['state']


In [200]:
X.columns

Index(['goal', 'staff_pick_x', 'launched_year', 'launched_month',
       'campaign_length', 'blurb_length', 'have_comments', 'category_type_art',
       'category_type_comics', 'category_type_crafts', 'category_type_dance',
       'category_type_design', 'category_type_fashion',
       'category_type_film & video', 'category_type_food',
       'category_type_games', 'category_type_journalism',
       'category_type_music', 'category_type_photography',
       'category_type_publishing', 'category_type_technology',
       'category_type_theater', 'country_name_Australia',
       'country_name_Canada', 'country_name_France', 'country_name_Germany',
       'country_name_Italy', 'country_name_Mexico', 'country_name_Spain',
       'country_name_other', 'country_name_the Netherlands',
       'country_name_the United States'],
      dtype='object')

## Adding Features

In [304]:
df1 = df[['url','pre_mid_campaign_dates_length']]

In [305]:
df.shape

(2518, 28)

In [306]:
model_data1 = pd.merge(model_data, df1, on='url', how='outer')

In [307]:
model_data1.shape

(17858, 50)

In [309]:
model_data1.fillna(0,inplace=True)

In [310]:
X1 = model_data1.drop(['backers_count','blurb', 'category', 'country_displayable_name', 'deadline', 
                     'launched_at', 'name', 'pledged', 'slug', 'state', 'category_type', 'country_name',
                    'staff_pick_y', 'id', 'url', 'comments', 'mid_campaign', 'country_name', 'have_comments'],
                   axis=1)
y1 = model_data1['state']

In [311]:
X1.shape

(17858, 32)

## Modeling

In [206]:
## Old model with added "have comments" features
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [120]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.7977424556553789, 0.8154848046309696, 0.8596491228070176)
    Logistic Regression: (0.7523612070951393, 0.8216216216216217, 0.7536231884057971)
    Gradient Boost: (0.8039622206864778, 0.8228217280349982, 0.8607932875667429)


In [312]:
## Old model with added "mid campaign comments length" features
X_train, X_test, y_train, y_test = train_test_split(X1, y1)

In [313]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.7328107502799552, 0.7622837370242215, 0.8132152085640457)
    Logistic Regression: (0.6468085106382979, 0.6476264997391757, 0.9165743816906607)
    Gradient Boost: (0.7460246360582307, 0.7654196157735086, 0.8383167220376523)


In [315]:
## Old model, same dataset, without added "mid campaign comments length" features
X1 = model_data1.drop(['backers_count','blurb', 'category', 'country_displayable_name', 'deadline', 
                     'launched_at', 'name', 'pledged', 'slug', 'state', 'category_type', 'country_name',
                    'staff_pick_y', 'id', 'url', 'comments', 'mid_campaign', 'country_name', 'have_comments',
                      'pre_mid_campaign_dates_length'],
                   axis=1)
y1 = model_data1['state']

In [316]:
X_train, X_test, y_train, y_test = train_test_split(X1, y1)

In [317]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.7339305711086226, 0.7627533193570929, 0.8109212481426449)
    Logistic Regression: (0.6353863381858903, 0.6378238341968911, 0.9145616641901931)
    Gradient Boost: (0.7462486002239642, 0.7611390284757119, 0.8439821693907875)


In [None]:
df_comments = df[df['have_comments']==1]
df_comments

In [None]:
df_comments.shape

In [None]:
df_comments['state'].value_counts()

In [None]:
df_comments[['url', 'comments']].to_csv('df_comments.csv')