This notebook implements ELMo text embeddings for the Kickstarter success-prediction data, for potential use as input to a RandomForestClassifier (RFC) or a neural network. The ELMo-based RFC did not outperform a hyperparameter-tuned RFC using TF-IDF vectorization, so it was ultimately not used in the deployed model.

In [None]:
# Imports 
import pandas as pd
import numpy as np

# Install category_encoders (used later to ordinal-encode the category column)
!pip install category_encoders


In [None]:
# Import data from GitHub repo
# Data sourced via https://webrobots.io/kickstarter-datasets/
# NOTE: pd.read_csv fetches the CSV straight from this URL, so the cell
# needs network access every time it is run.
url = "https://raw.githubusercontent.com/kickstarter-success-aoobg/DS/master/machine_learning/data/Kickstarter.csv"
kickstarter = pd.read_csv(url)

In [None]:
# Split into train & test data
# The split is stratified on the raw "state" column (before wrangle() filters
# it down to successful/failed) and uses a fixed random_state so the same
# rows land in each split on every run. No test_size is passed, so
# train_test_split falls back to its default hold-out fraction.
from sklearn.model_selection import train_test_split

train, test = train_test_split(kickstarter, 
                               stratify=kickstarter['state'], 
                               random_state=7)


In [None]:
# Define wrangle function

def wrangle(X):
  """
  Wrangles kickstarter data in preparation for binary classification
  prediction of success or failure.

  Parameters
  ----------
  X : pandas.DataFrame
      Raw Kickstarter rows. Must contain the columns "state", "deadline",
      "launched_at", "goal", "fx_rate", "name", "blurb" and "category",
      where "category" holds one JSON object string per row.

  Returns
  -------
  pandas.DataFrame
      A new frame (the input is never modified) restricted to rows whose
      state is "successful" or "failed" and whose campaign lasted fewer
      than 61 days, with these columns added or replaced:
      "success" (1/0), "campaign_length" (days), "usd_goal",
      "combined_text" (name + blurb) and "category" (parent category name,
      or the category's own name when it has no parent).
  """
  import json  # local import keeps the function usable on its own

  SECONDS_PER_DAY = 86400
  # Kickstarter caps campaigns at 60 days; 61 leaves room for unix-time
  # differences that land between 60 and 61 days.
  MAX_CAMPAIGN_DAYS = 61

  # Reduce to only binary classification of "successful" or "failed".
  # The explicit .copy() gives us an independent frame, so the column
  # assignments below neither touch the caller's data nor trigger
  # SettingWithCopyWarning.
  X = X[X["state"].isin(["successful", "failed"])].copy()

  # Convert target to binary variable
  X["success"] = X["state"].map({"successful": 1, "failed": 0})

  # Create "Campaign Length" column & convert Unix seconds to number of days
  X["campaign_length"] = (X["deadline"] - X["launched_at"]) / SECONDS_PER_DAY

  # Drop any campaigns lasting longer than the allowed 60 days
  X = X[X["campaign_length"] < MAX_CAMPAIGN_DAYS].copy()

  # Convert goal column to USD
  X["usd_goal"] = X["goal"] * X["fx_rate"]

  # Combine text columns for use in NLP modeling. Missing name/blurb values
  # are treated as empty strings so a single NaN does not turn the whole
  # combined text into NaN (which the embedding step cannot consume).
  X["combined_text"] = (
      X["name"].fillna("") + " " + X["blurb"].fillna("")
  ).str.strip()

  def _top_level_category(raw):
    """Return the parent category name, or the own name if there is none."""
    info = json.loads(raw)
    return info.get("parent_name") or info["name"]

  # Pull category data from json category column
  X["category"] = X["category"].map(_top_level_category)

  return X

In [None]:
# Wrangle data
# NOTE: train/test are rebound to their wrangled versions, so this cell is
# not safe to re-run on its own: wrangle() parses the raw JSON "category"
# column, which the first pass replaces with plain category names. Re-run
# the split cell first if this needs to be executed again.
train = wrangle(train)
test = wrangle(test)

In [None]:
# Begin ELMo implementation
# Code adapted from: 
# https://www.analyticsvidhya.com/blog/2019/03/learn-to-use-elmo-to-extract-
# features-from-text/?utm_source=blog&utm_medium=top-pretrained-models-nlp-article

import tensorflow_hub as hub
# import tensorflow as tf
# The TF1 compatibility API is imported as `tf`, so every later `tf.*` call
# in this notebook (tf.Session, tf.global_variables_initializer, ...) is the
# v1 graph-mode version.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Disable eager execution for compatibility with elmo model 
# NOTE(review): `tf` is already the compat.v1 module and v2 behavior was
# switched off on the line above, so this call looks redundant - confirm
# before removing.
tf.compat.v1.disable_eager_execution()

# Load elmo3
# NOTE(review): trainable=True is passed, but no cell visible in this notebook
# fine-tunes the module; it is only used for feature extraction here.
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
# Define ELMo function
def elmo_vect(x, mean_pool=False):
  '''
  Takes text input and generates ELMo embedding outputs.

  Parameters
  ----------
  x : iterable of str
      Texts to embed (for example a pandas Series, a numpy array or a list).
  mean_pool : bool, default False
      When False, the module's "elmo" output is returned as-is (one vector
      per token, i.e. a 3D array). When True, that output is averaged over
      axis 1 so each text is represented by a single vector (2D array).

  Returns
  -------
  numpy.ndarray
      The evaluated embeddings for every text in `x`.

  Notes
  -----
  Every call opens a fresh session and re-runs the variable and table
  initializers, so prefer a few large batches over many small calls.
  '''

  # list() works for any iterable of strings, not only objects that
  # expose .tolist().
  embeddings = elmo(
    list(x),
    signature='default',
    as_dict=True)["elmo"]

  if mean_pool:
    # Collapse the token axis -> one fixed-size vector per text.
    embeddings = tf.reduce_mean(embeddings, axis=1)

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    return sess.run(embeddings)

In [None]:
# Create batches of data to be used in ELMo -- produces 2D output
# list_train = [train["combined_text"][i:i+500] for i in range(0, train.shape[0],500)]
# list_test = [test["combined_text"][i:i+500] for i in range(0, test.shape[0],500)]

In [None]:
# Extract ELMo embeddings for 2D
# elmo_train = [elmo_vect(x) for x in list_train]
# elmo_test = [elmo_vect(x) for x in list_test]

#Extract embeddings for 3D output
# using 1500 samples as is max RAM will allow in single batch
# NOTE: only the first 1500 rows of each split are embedded here, so
# elmo_train covers a subset of `train`. The slice on `test` returns every
# row when the split has fewer than 1500 rows.
elmo_train = elmo_vect(train["combined_text"][0:1500])
elmo_test = elmo_vect(test["combined_text"][0:1500])


In [None]:
# Look at shape of 3D embeddings
# Presumably the axes are (texts, tokens, embedding size) - confirm against
# the ELMo module documentation.
print(elmo_train.shape)
print(elmo_test.shape)

(1500, 39, 1024)
(879, 39, 1024)


In [None]:
# Pickle 3D embeddings for later use w/o recomputing them
# Files are written to the current working directory; joblib.dump returns
# the list of file names it wrote, which is what the cell displays.
import joblib
joblib.dump(elmo_train, 'elmo_train_embeddings3D.pkl')
joblib.dump(elmo_test, 'elmo_test_embeddings3D.pkl')

['elmo_test_embeddings3D.pkl']

In [None]:
# Build the 2D embeddings (one vector per text) for the full train/test sets.
# Without this cell elmo_train_full / elmo_test_full are never defined on a
# fresh kernel and every later cell that uses them fails.
ELMO_BATCH_SIZE = 500  # rows embedded per elmo_vect call, to bound memory use

def _elmo_mean_pooled(texts, batch_size=ELMO_BATCH_SIZE):
  """Embed `texts` batch by batch and average each text over its tokens.

  Returns a 2D array with one row per text. Averaging happens per batch,
  so the per-token (3D) output of only one batch is held in memory at a time.
  """
  pooled_batches = [
      elmo_vect(texts[start:start + batch_size]).mean(axis=1)
      for start in range(0, texts.shape[0], batch_size)
  ]
  # Concatenate batches for 2D output
  return np.concatenate(pooled_batches)

elmo_train_full = _elmo_mean_pooled(train["combined_text"])
elmo_test_full = _elmo_mean_pooled(test["combined_text"])

In [None]:
# Shape of 2D embeddings
# Requires elmo_train_full / elmo_test_full from the 2D concatenation step.
# The row counts should equal len(train) and len(test) respectively.
print(elmo_train_full.shape)
print(elmo_test_full.shape)

(2634, 1024)
(879, 1024)


In [None]:
# Pickle 2D embeddings for later use w/o recomputing them
# Files are written to the current working directory and are the ones the
# RandomForestClassifier section loads back in.
import joblib
joblib.dump(elmo_train_full, 'elmo_train_embeddings.pkl')
joblib.dump(elmo_test_full, 'elmo_test_embeddings.pkl')

['elmo_test_embeddings.pkl']

# RandomForestClassifier with 2D Embeddings

In [None]:
# Load pickled 2D embeddings (these are arrays, not models)
# The files must already exist in the working directory, i.e. the pickling
# cell above has to have been run at least once.
# NOTE: joblib.load unpickles arbitrary objects - only load files you created.
import joblib
elmo_train_full = joblib.load('elmo_train_embeddings.pkl')
elmo_test_full = joblib.load('elmo_test_embeddings.pkl')

In [None]:
# Define X matrix and y target to prepare for baseline model
features = ["combined_text", "campaign_length", "category", "usd_goal"]
# A plain column name (not a one-element list) makes y a 1-D Series.
# Selecting with a list yields an (n, 1) DataFrame, which scikit-learn
# accepts only with a "column-vector y was passed" warning on every fit.
target = "success"

# .copy() gives independent frames: later cells add prediction columns to
# X_train / X_test, and those writes must not target a slice of train / test.
X_train = train[features].copy()
y_train = train[target]

X_test = test[features].copy()
y_test = test[target]

In [None]:
# Fit RFC on the 2D ELMo embeddings
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed (same seed as the train/test split) so the forest,
# and every score derived from it, is reproducible across runs.
model = RandomForestClassifier(max_depth=15, n_estimators=10, random_state=7)
# np.ravel hands scikit-learn a 1-D target whether y_train is a Series or a
# single-column DataFrame.
model.fit(elmo_train_full, np.ravel(y_train))

  """


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Mean accuracy of the embedding RFC on the rows it was trained on
model.score(elmo_train_full, y_train)

0.9863325740318907

In [None]:
# Mean accuracy of the embedding RFC on the held-out test rows; the gap to
# the train score above shows how strongly the forest overfits.
model.score(elmo_test_full, y_test)

0.6370875995449374

In [None]:
# Take sample of X_train / X_test
# X_train_sampled = X_train
# X_test_sampled = X_test

In [None]:
# Create new columns in df with prediction & probability by row
from sklearn.model_selection import cross_val_predict

# Train rows: use OUT-OF-FOLD probabilities. Scoring the training rows with a
# model that was fit on those same rows yields near-perfect probabilities
# that leak the label into the downstream model, which then looks excellent
# in cross-validation but does not generalize. cross_val_predict clones
# `model`, so the already-fitted instance is left untouched.
train_oof_proba = cross_val_predict(
    model,
    elmo_train_full,
    np.ravel(y_train),
    cv=5,
    method="predict_proba",
)[:, 1]
X_train["nlp_pred"] = (train_oof_proba > 0.5).astype(int)
X_train["nlp_proba"] = train_oof_proba

# Test rows: never seen during fitting, so the model trained on the full
# training set can score them directly.
X_test["nlp_pred"] = model.predict(elmo_test_full)
X_test["nlp_proba"] = model.predict_proba(elmo_test_full)[:,1]


In [None]:
# Categorical encode category column
# The encoder is fit on the training categories only and then re-used on the
# test set, so the test data does not influence the mapping.
# NOTE(review): ordinal codes impose an arbitrary order on categories;
# presumably acceptable for the tree-based model used below - confirm.
import category_encoders as ce

encoder = ce.OrdinalEncoder()
X_train["category"] = encoder.fit_transform(X_train["category"])
X_test["category"] = encoder.transform(X_test["category"])

In [None]:
# Generate df to run through final model
# Only the NLP probability is carried forward as a text feature; the raw
# "combined_text" and the hard "nlp_pred" label are left out.
final_features = ["campaign_length", "category", "usd_goal", "nlp_proba"]
X_train_final = X_train[final_features]
X_test_final = X_test[final_features]

In [None]:
# Create final model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# NOTE: this rebinds `model`, replacing the embedding RFC fitted earlier with
# a fresh, unfitted estimator that serves as the grid-search template.
model = RandomForestClassifier()

# 4 x 3 = 12 parameter combinations, each evaluated with 5-fold CV.
parameters = {
    'n_estimators': (5, 10, 20, 50),
    'max_depth': (5, 10, 15)
}

# n_jobs=-1 runs the fits in parallel on all available cores.
grid_search_final = GridSearchCV(model, parameters, cv=5, n_jobs=-1, verbose=True)
grid_search_final.fit(X_train_final, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.8s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [None]:
# Look at best score 
# This is the mean cross-validated score of the best parameter combination,
# computed on training folds only - it is not a test-set score.
grid_search_final.best_score_

0.9870902807339054

In [None]:
# Look at best params
# The parameter combination from `parameters` with the highest mean CV score.
grid_search_final.best_params_

{'max_depth': 5, 'n_estimators': 5}

In [None]:
# Refit with best params (embedding)
# Build the classifier from the grid search's winning parameters rather than
# hand-typed values, so the refit can never drift from what the search chose.
# random_state is fixed so the reported scores are reproducible.
clf = RandomForestClassifier(**grid_search_final.best_params_, random_state=7)

# np.ravel hands scikit-learn a 1-D target whether y_train is a Series or a
# single-column DataFrame.
clf.fit(X_train_final, np.ravel(y_train))

# Look at train score
print("Train score:", clf.score(X_train_final, y_train))

# Look at test score
print("Test score:", clf.score(X_test_final, y_test))

  after removing the cwd from sys.path.


Train score: 1.0
Test score: 0.6325369738339022
