In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import ast
import glob
import tensorflow as tf
from tensorflow import keras

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras import layers

plt.style.use('ggplot')
%matplotlib inline

In [6]:
pd.set_option('display.max_columns', 50)

In [7]:
master = pd.read_csv('data/master_cleaned.csv')

In [8]:
comments = pd.read_csv('data/comments.csv')

In [9]:
dates = pd.read_csv('data/comments_dates.csv', names=['name','url','dates'])

In [10]:
print(comments.shape)
print(dates.shape)

(17642, 3)
(4068, 3)


In [11]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17642 entries, 0 to 17641
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  17642 non-null  int64 
 1   url         17642 non-null  object
 2   comments    17642 non-null  object
dtypes: int64(1), object(2)
memory usage: 413.6+ KB


In [17]:
comments.drop('Unnamed: 0', axis=1, inplace=True)

In [18]:
comments.head()

Unnamed: 0,url,comments,have_comments
0,https://www.kickstarter.com/projects/117862918...,[],0
1,https://www.kickstarter.com/projects/lizardsho...,[],0
2,https://www.kickstarter.com/projects/184368767...,[],0
3,https://www.kickstarter.com/projects/376746530...,[],0
4,https://www.kickstarter.com/projects/116371417...,[],0


In [19]:
comments['comments'][11] = []
comments['comments'][12] = []

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [20]:
urls = pd.read_csv('data/comments_urls_copy.csv')

In [21]:
urls.head()

Unnamed: 0.1,Unnamed: 0,id,name,url
0,0,751419376,Iron Age Kingdoms - First Free To Win Mobile S...,https://www.kickstarter.com/projects/117862918...
1,1,289952460,"""Lizard"" a short film",https://www.kickstarter.com/projects/lizardsho...
2,2,37585826,Total Franchise Football 2016,https://www.kickstarter.com/projects/184368767...
3,3,1727301949,The Trans-American Psychogeographic Literary C...,https://www.kickstarter.com/projects/376746530...
4,4,144196412,The Golden Ticket to the Wonka Factory,https://www.kickstarter.com/projects/116371417...


In [22]:
# merge comments table with url tables to get campaign id
full = pd.merge(urls, comments, how='right', on= 'url', )

In [23]:
full.shape

(17642, 6)

In [28]:
full_with_dates = pd.merge(full, dates, on='url')
full_with_dates.drop(columns=['Unnamed: 0', 'name_y'], inplace=True)

In [29]:
full_with_dates.rename(columns={'name_x': 'name'}, inplace=True)


In [31]:
full_with_dates.head()

Unnamed: 0,id,name,url,comments,have_comments,dates
0,1769794304,Burma Storybook,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,1,"['June 30, 2017 10:52 AM PDT', 'May 26, 2017 3..."
1,526305087,The Alchemy of Collaboration,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...",1,"['June 1, 2020 10:55 AM PDT', 'May 21, 2020 9:..."
2,1103726466,Frack This? - The Wyoming Artist Expedition,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...",1,"['May 30, 2012 3:37 PM PDT', 'May 14, 2012 2:4..."
3,2045995373,Cookietownworld,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,1,"['June 23, 2016 8:55 AM PDT']"
4,1228292914,Dunpets Colors: Monster-catching RPG,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...",1,"['April 17, 2018 6:49 PM PDT', 'January 22, 20..."


## Merge with Master

In [32]:
# full df with comments and dates
df = pd.merge(master, full_with_dates, on='name')
df.drop(columns=['Unnamed: 0', 'country_name'], inplace=True)
df.shape

(4113, 22)

In [36]:
# master list with outer join, with comments or no comments 
df_full = pd.merge(master, full, on='name', how='left')
df_full.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y','country_name'], inplace=True)
df_full.shape

(169626, 21)

In [37]:
df_full.head()

Unnamed: 0,backers_count,blurb,category,country_displayable_name,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,id,url,comments,have_comments
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",Canada,2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,751419400.0,https://www.kickstarter.com/projects/117862918...,[],0.0
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",the United States,2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,289952500.0,https://www.kickstarter.com/projects/lizardsho...,[],0.0
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",the United States,2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,37585830.0,https://www.kickstarter.com/projects/184368767...,[],0.0
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",the United States,2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,1727302000.0,https://www.kickstarter.com/projects/376746530...,[],0.0
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",the United States,2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,144196400.0,https://www.kickstarter.com/projects/116371417...,[],0.0


In [38]:
df_full['have_comments'].value_counts()

0.0    10185
1.0     7581
Name: have_comments, dtype: int64

In [39]:
cols_to_convert = ['deadline', 'launched_at']
for c in cols_to_convert:
    df[c] = pd.to_datetime(df[c])

In [40]:
cols_to_convert = ['deadline', 'launched_at']
for c in cols_to_convert:
    df_full[c] = pd.to_datetime(df_full[c])

In [41]:
df['mid_campaign'] = df['launched_at'] + pd.DateOffset(15)
df['mid_campaign'] = df['mid_campaign'].dt.round('d')
df['dates'] = df['dates'].map(lambda x: x.split("',"))
df['comment_length'] = df['dates'].map(lambda x: len(x))

In [42]:
df_full['mid_campaign'] = df_full['launched_at'] + pd.DateOffset(15)
df_full['mid_campaign'] = df_full['mid_campaign'].dt.round('d')


In [43]:
def country_cat(x):
    country_list = ['the United States', 'the United Kingdom ', 'Canada', 'Australia',
                   'Germany', 'France', 'Mexico', 'Italy', 'Spain', 'the Netherlands']
    for country in x:
        if country in country_list:
            return country
        else:
            return 'other'

In [44]:
def extract(cat):
    x = ast.literal_eval(re.search('({.+})', cat).group(0))
    if 'parent_name' not in x.keys():
        return x['slug']
    else:
        return x['parent_name'].lower()

In [45]:
df['country_name'] = df['country_displayable_name'].map(lambda x: country_cat([x]))
df_full['country_name'] = df_full['country_displayable_name'].map(lambda x: country_cat([x]))

In [46]:
df['category_type'] = df['category'].map(lambda x: extract(x))
df = df[df['state'].isin(['successful', 'failed'])]

In [47]:
df_full['category_type'] = df_full['category'].map(lambda x: extract(x))
df_full = df_full[df_full['state'].isin(['successful', 'failed'])]

In [48]:
df_full.head()

Unnamed: 0,backers_count,blurb,category,country_displayable_name,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,id,url,comments,have_comments,mid_campaign,country_name
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",Canada,2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,751419400.0,https://www.kickstarter.com/projects/117862918...,[],0.0,2017-10-11,Canada
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",the United States,2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,289952500.0,https://www.kickstarter.com/projects/lizardsho...,[],0.0,2019-08-08,the United States
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",the United States,2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,37585830.0,https://www.kickstarter.com/projects/184368767...,[],0.0,2016-02-07,the United States
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",the United States,2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,1727302000.0,https://www.kickstarter.com/projects/376746530...,[],0.0,2012-09-30,the United States
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",the United States,2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,144196400.0,https://www.kickstarter.com/projects/116371417...,[],0.0,2017-11-25,the United States


In [49]:
df['dates'][2]

["['May 30, 2012 3:37 PM PDT",
 " 'May 14, 2012 2:46 PM PDT",
 " 'May 12, 2012 9:48 AM PDT",
 " 'May 7, 2012 8:00 PM PDT']"]

In [50]:
def to_date(x):
    lst = []
    for i in x:
        i = i.replace("[", '').replace("]", '')
        d = pd.to_datetime(i[:-4])
        lst.append(d)
    return lst

In [51]:
df['dates'] = df['dates'].map(lambda x: to_date(x))

In [52]:
def date_subtract(df):
    lst = []
    for index, row in df.iterrows():
        row_lst =[]
        r = row['dates']
        for i in r:
            if i < row['mid_campaign']:
                row_lst.append(i)
        lst.append(row_lst)
    return lst 

In [53]:
def real_comments(df):
    lst = []
    for index, row in df.iterrows():
        r = eval(row['comments'])
        a = row['pre_mid_campaign_dates_length']
        if a > 0:
            lst.append(r[-a:])
        else:
            lst.append([])
    return lst


# eval(df['comments'][2])[-3:]

In [122]:
def have_comments(x):
    if x > 0:
        return 1
    else:
        return 0

In [123]:
df['pre_mid_campaign_dates'] = date_subtract(df)
df['pre_mid_campaign_dates_length'] = df['pre_mid_campaign_dates'].apply(lambda x: len(x))
df['have_comments'] = df['pre_mid_campaign_dates_length'].apply(lambda x: have_comments(x))

In [124]:
df['pre_mid_campaign_comments'] = real_comments(df)

In [125]:
df.head()

Unnamed: 0,backers_count,blurb,category,country_displayable_name,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,id,url,comments,have_comments,dates,mid_campaign,comment_length,country_name,pre_mid_campaign_dates,pre_mid_campaign_dates_length,pre_mid_campaign_comments
0,152,A photography and poetry book featuring contem...,"{""id"":278,""name"":""People"",""slug"":""photography/...",the Netherlands,2017-05-28 17:00:00,15000.0,2017-05-03 16:57:18,Burma Storybook,15777.0,burma-storybook,True,successful,2017,5,25,photography,13,1769794304,https://www.kickstarter.com/projects/618155130...,['Oh! My! The book has arrived and I am speech...,0,"[2017-06-30 10:52:00, 2017-05-26 03:53:00]",2017-05-19,2,the Netherlands,[],0,[]
1,116,100 sumie ink originals by David Mack on top o...,"{""id"":276,""name"":""Fine Art"",""slug"":""photograph...",the United States,2018-07-29 21:05:01,8000.0,2018-06-29 21:05:01,The Alchemy of Collaboration,16706.0,the-alchemy-of-collaboration,False,successful,2018,6,30,photography,21,526305087,https://www.kickstarter.com/projects/punchgrap...,"[""I know you read the messages. I've been chas...",0,"[2020-06-01 10:55:00, 2020-05-21 09:46:00, 202...",2018-07-15,21,the United States,[],0,[]
2,50,"8 artists explore the collision of art, the en...","{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",the United States,2012-06-04 03:59:00,3300.0,2012-05-03 12:13:56,Frack This? - The Wyoming Artist Expedition,3805.0,frack-this-the-wyoming-artist-expedition,False,successful,2012,5,32,art,16,1103726466,https://www.kickstarter.com/projects/331530956...,"['To the Artists:', 'I encourage each artist t...",1,"[2012-05-30 15:37:00, 2012-05-14 14:46:00, 201...",2012-05-19,4,the United States,"[2012-05-14 14:46:00, 2012-05-12 09:48:00, 201...",3,"[Lori, the kids will want you and Uncle Michae..."
3,8,Cookietownworld is an interactive website desi...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",the United States,2016-08-07 16:15:57,95000.0,2016-06-08 16:15:57,Cookietownworld,3229.0,cookietownworld,False,failed,2016,6,60,games,13,2045995373,https://www.kickstarter.com/projects/cookietow...,['I think Cookietownworld is an awesome idea. ...,1,[2016-06-23 08:55:00],2016-06-24,1,the United States,[2016-06-23 08:55:00],1,[I think Cookietownworld is an awesome idea. I...
4,59,Explore a magical world full of monsters and d...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",Spain,2017-12-22 20:25:20,15000.0,2017-11-22 20:25:20,Dunpets Colors: Monster-catching RPG,4310.75,dunpets-colors-dungeon-crawl-and-pets,True,failed,2017,11,30,games,22,1228292914,https://www.kickstarter.com/projects/gugamesde...,"['Any updates about the premier? :)', 'Any pro...",1,"[2018-04-17 18:49:00, 2018-01-22 01:59:00, 201...",2017-12-08,18,Spain,"[2017-12-03 12:14:00, 2017-12-03 07:10:00, 201...",11,"[Any progress update on the premiere?, Sorry t..."


In [126]:
a = date_subtract(df)

In [127]:
len(a)

4113

In [None]:
df['pre_mid_campaign_dates'][2]

In [None]:
df['pre_mid_campaign_dates'][5]

In [None]:
df['dates'][4]

In [None]:
eval(df['comments'][2])[-3:]

In [None]:
eval(df['comments'][5])[-11:]

In [None]:
df['comments'][2]

## One Hot Encoding 

In [128]:
df_full.head()

Unnamed: 0,backers_count,blurb,category,country_displayable_name,deadline,goal,launched_at,name,pledged,slug,staff_pick,state,launched_year,launched_month,campaign_length,category_type,blurb_length,id,url,comments,have_comments,mid_campaign,country_name
0,5,A Free to Win Online Mobile Strategy Game to e...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",Canada,2017-10-09 14:41:03,1000.0,2017-09-25 14:41:03,Iron Age Kingdoms - First Free To Win Mobile S...,105.0,iron-age-kingdoms-first-free-to-win-mobile-str...,False,failed,2017,9,14,games,24,751419400.0,https://www.kickstarter.com/projects/117862918...,[],0.0,2017-10-11,Canada
1,19,"Lizard is a dark comedy and modern western, th...","{""id"":11,""name"":""Film & Video"",""slug"":""film & ...",the United States,2019-09-21 23:39:48,1100.0,2019-07-23 23:39:48,"""Lizard"" a short film",2358.0,lizard-a-short-film,False,successful,2019,7,60,film & video,15,289952500.0,https://www.kickstarter.com/projects/lizardsho...,[],0.0,2019-08-08,the United States
2,5,My friend and I are making an app that has nev...,"{""id"":272,""name"":""Mobile Games"",""slug"":""games/...",the United States,2016-03-22 21:11:17,25000.0,2016-01-22 22:11:17,Total Franchise Football 2016,172.0,total-franchise-football-2016,False,failed,2016,1,60,games,25,37585830.0,https://www.kickstarter.com/projects/184368767...,[],0.0,2016-02-07,the United States
3,23,"More than a bike trip, a state of mind. And a ...","{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",the United States,2012-09-25 02:33:06,500.0,2012-09-15 02:33:06,The Trans-American Psychogeographic Literary C...,501.0,the-trans-american-psychogeographic-literary-c...,False,successful,2012,9,10,publishing,12,1727302000.0,https://www.kickstarter.com/projects/376746530...,[],0.0,2012-09-30,the United States
4,0,This is the birth of a revolutionary artist......,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",the United States,2017-12-11 02:43:11,2500.0,2017-11-10 02:43:11,The Golden Ticket to the Wonka Factory,0.0,the-golden-ticket-to-the-wonka-factory,False,failed,2017,11,31,music,23,144196400.0,https://www.kickstarter.com/projects/116371417...,[],0.0,2017-11-25,the United States


In [129]:
df_full_ohe = pd.get_dummies(df_full[['category_type', 'country_name', 'staff_pick']])

In [130]:
df_full.columns

Index(['backers_count', 'blurb', 'category', 'country_displayable_name',
       'deadline', 'goal', 'launched_at', 'name', 'pledged', 'slug',
       'staff_pick', 'state', 'launched_year', 'launched_month',
       'campaign_length', 'category_type', 'blurb_length', 'id', 'url',
       'comments', 'have_comments', 'mid_campaign', 'country_name'],
      dtype='object')

In [131]:
data = pd.merge(df_full, df_full_ohe, left_index=True, right_index=True)
data.shape

(169626, 49)

In [132]:
good_data = data[data['state'].isin(['successful', 'failed'])]
live = data[data['state'].isin(['live'])]
print(good_data.shape)
print(live.shape)

(169626, 49)
(0, 49)


In [133]:
good_data['state'] = good_data['state'].replace(to_replace=['successful', 'failed'], value=[1,0])

In [134]:
model_data = good_data.copy()

In [135]:
model_data.shape

(169626, 49)

In [136]:
model_data.dropna(inplace=True)
print(model_data.shape)

(17766, 49)


In [137]:
model_data['have_comments'].value_counts()

0.0    10185
1.0     7581
Name: have_comments, dtype: int64

In [138]:
X = model_data.drop(['backers_count','blurb', 'category', 'country_displayable_name', 'deadline', 
                     'launched_at', 'name', 'pledged', 'slug', 'state', 'category_type', 'country_name',
                    'staff_pick_y', 'id', 'url', 'comments', 'mid_campaign', 'country_name'],
                   axis=1)
y = model_data['state']


In [158]:
X.shape

(17766, 32)

## Adding Features

In [140]:
df1 = df[['url','pre_mid_campaign_dates_length']]

In [141]:
df.shape

(4113, 28)

In [142]:
model_data1 = pd.merge(model_data, df1, on='url', how='outer')

In [143]:
model_data1.shape

(17930, 50)

In [159]:
model_data1.fillna(0,inplace=True)

In [160]:
X1 = model_data1.drop(['backers_count','blurb', 'category', 'country_displayable_name', 'deadline', 
                     'launched_at', 'name', 'pledged', 'slug', 'state', 'category_type', 'country_name',
                    'staff_pick_y', 'id', 'url', 'comments', 'mid_campaign', 'country_name', 'pre_mid_campaign_dates_length'],
                   axis=1)
y1 = model_data1['state']

In [161]:
X1.shape

(17930, 32)

## Modeling

In [152]:
## Old model with added "have comments" features. FULL DATASET
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [157]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.8090950022512382, 0.8292166549047283, 0.8658806190125277)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


    Logistic Regression: (0.7537145429986493, 0.8380634390651085, 0.739867354458364)
    Gradient Boost: (0.8043674020711391, 0.827708703374778, 0.8585114222549742)


In [163]:
## Old model with added "have comments" features. 17K DATAPOINTS, THOSE THAT ARE SCRAPED
X_train, X_test, y_train, y_test = train_test_split(X1, y1)

In [164]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.8061565915681463, 0.8158172231985941, 0.8705926481620405)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


    Logistic Regression: (0.6671871514610752, 0.6583917970858068, 0.9152288072018004)
    Gradient Boost: (0.8070488512157038, 0.816075816075816, 0.872093023255814)


In [169]:
## Old model, same dataset, added "mid campaign comments length" features
X1 = model_data1.drop(['backers_count','blurb', 'category', 'country_displayable_name', 'deadline', 
                     'launched_at', 'name', 'pledged', 'slug', 'state', 'category_type', 'country_name',
                    'staff_pick_y', 'id', 'url', 'comments', 'mid_campaign', 'country_name'],
                   axis=1)
y1 = model_data1['state']

In [170]:
X_train, X_test, y_train, y_test = train_test_split(X1, y1)

In [171]:
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), precision_score(y_test, y_predict), recall_score(y_test, y_predict)
print("Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Gradient Boost:", get_scores(GradientBoostingClassifier, X_train, X_test, y_train, y_test))

Model, Accuracy, Precision, Recall
    Random Forest: (0.8025875529779166, 0.8164948453608247, 0.8712871287128713)
    Logistic Regression: (0.6466651795672541, 0.6464258262874711, 0.9251925192519251)
    Gradient Boost: (0.8095025652464868, 0.8257391304347826, 0.8705537220388706)


In [None]:
df_comments = df[df['have_comments']==1]
df_comments

In [None]:
df_comments.shape

In [None]:
df_comments['state'].value_counts()

In [None]:
df_comments[['url', 'comments']].to_csv('df_comments.csv')

In [113]:
x_test = full_with_dates['comments'][:100].values

In [114]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(x_test)
xtest= tokenizer.texts_to_sequences(x_test)


In [115]:
vocab_size=len(tokenizer.word_index)+1

In [116]:
maxlen=10
xtest=pad_sequences(xtest,padding='post', maxlen=maxlen)


In [117]:
model=Sequential()

In [118]:
reconstructed_model = keras.models.load_model('test_model')

In [120]:
np.testing.assert_allclose(
  model.predict(xtest),
  reconstructed_model.predict(xtest))

UnimplementedError:  Cast int32 to string is not supported
	 [[node Cast (defined at <ipython-input-120-b1de6021920f>:2) ]] [Op:__inference_distributed_function_25825]

Function call stack:
distributed_function
