In [1]:
# important basic imports

import ssl

# NOTE(review): this makes every default HTTPS context in the process skip
# certificate verification (presumably a workaround so downloads such as
# nltk.download succeed) — confirm it is still required before sharing.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    # only patch when this Python build exposes the private helper
    ssl._create_default_https_context = _create_unverified_https_context

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import string
# !pip install nltk
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from textblob import TextBlob
import matplotlib.pyplot as plt
import matplotlib.colors as clrs
%matplotlib inline

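# if nltk is not installed or its data files are missing, uncomment the
# `!pip install nltk` / `nltk.download(...)` lines above and re-run this cell
# NOTE(review): WordNetLemmatizer is used later and presumably also needs nltk.download('wordnet') — verify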

In [2]:
# relevant imports

# data prep
from sklearn import preprocessing
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from scipy import stats

# nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

# import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# import matplotlib
from matplotlib.pyplot import scatter

In [3]:
# load the raw Kickstarter projects export into a DataFrame
RAW_DATA_PATH = "./ks-projects-201801.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [4]:
# restrict the data to campaigns whose state is 'successful' or 'failed'
final_states = ('successful', 'failed')
# compare on the string form of the state, so missing values never match
has_final_state = df['state'].astype(str).isin(final_states)
df = df[has_final_state].copy()

In [5]:
# fill nan/missing values with correct values
# Assign the filled Series back to the column: calling fillna(inplace=True) on
# a column selection is chained assignment, which does not modify `df` when
# pandas Copy-on-Write is active and is deprecated in recent pandas releases.
df['usd pledged'] = df['usd pledged'].fillna(df['usd_pledged_real'])
# a missing campaign name becomes the empty string so text features can be computed
df['name'] = df['name'].fillna('')

In [6]:
# make new duration and cross-product feature for testing in model
df['deadline'] = pd.to_datetime(df['deadline'])
df['launched'] = pd.to_datetime(df['launched'])
# .dt.days yields the whole-day component of each timedelta; the previous
# .astype('timedelta64[D]') raises on pandas >= 2.0, which only supports
# s/ms/us/ns resolutions for timedelta64 casts.
df['duration_days'] = (df['deadline'] - df['launched']).dt.days
# interaction term: funding goal (USD) multiplied by campaign length in days
df['goal*days'] = df['usd_goal_real'] * df['duration_days']

In [7]:
# remove outliers - do or not do?
# df1 = df[(np.abs(stats.zscore(df[['usd_goal_real']]) < 3))]
# df2 = df[(np.abs(stats.zscore(df[['duration_days']]) < 3))]

# df = pd.merge(df1, df2, how='inner')
# new_num_rows = len(df)
# pct_data_kept = new_num_rows/num_rows
# print(pct_data_kept)

In [8]:
# cast the listed text columns to the pandas 'category' dtype
for categorical_col in ('state', 'main_category', 'category', 'currency', 'country'):
    df[categorical_col] = df[categorical_col].astype('category')

In [9]:
# format name column and add word count/name sentiment features
df['name'] = df['name'].astype(str)
# Split on runs of whitespace: name.split(" ") counted an empty name as one
# word (''.split(' ') == ['']) and produced extra empty "words" for repeated
# spaces, inflating the count. str.split() with no separator avoids both.
df['name_word_count'] = df['name'].str.split().str.len()
# TextBlob polarity score of the raw name text
df['name_sentiment'] = df['name'].map(lambda name: TextBlob(name).sentiment.polarity)

In [10]:
# one-hot encode main_category and country, each prefixed with its source column name
main_categories_cols, country_cols = (
    pd.get_dummies(df[source_col], prefix=source_col)
    for source_col in ('main_category', 'country')
)
# append the indicator columns to the right of the existing frame
df = pd.concat((df, main_categories_cols, country_cols), axis='columns')

In [11]:
# snapshot the engineered frame (before tfidf columns are added) to disk, index included
df.to_csv(path_or_buf='all_features_no_tfidf.csv')

In [12]:
all_features = df.columns.tolist()

# feature groups: columns excluded as "unknown", columns excluded as
# "unnecessary", the numeric columns that get standardized later (fts),
# and BASE = every remaining column, which is what the models use
unknown_features = ['pledged', 'backers', 'usd pledged', 'usd_pledged_real', 'currency']
unnecessary_features = ['ID', 'deadline', 'launched', 'category', 'goal', 'main_category', 'country']
fts = ['usd_goal_real', 'duration_days', 'goal*days']

# keep the original column order while filtering out both excluded groups
excluded_features = set(unknown_features) | set(unnecessary_features)
base_features = [feat for feat in all_features if feat not in excluded_features]
df1 = df.loc[:, base_features]

In [13]:
# 60/20/20 split into training / validation / testing sets
x = df1.drop(columns='state')
y = df1['state']  # target variable

# first hold out 20% for testing, then take 25% of the remaining 80%
# (i.e. 20% of the total) for validation
x_rest, x_test, y_rest, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_rest, y_rest, test_size=0.25, random_state=1)

In [14]:
# normalize numeric data using min_max normalization
# x_train[fts] = (x_train[fts]-x_train[fts].min())/(x_train[fts].max()-x_train[fts].min())
# x_val[fts] = (x_val[fts]-x_val[fts].min())/(x_val[fts].max()-x_val[fts].min())
# x_test[fts] = (x_test[fts]-x_test[fts].min())/(x_test[fts].max()-x_test[fts].min())

In [15]:
# standardize numeric data
# Scale every split with the TRAINING mean/std. Standardizing validation and
# test data with their own statistics puts the three splits on different
# scales, so a model fitted on x_train sees inconsistently scaled inputs.
# The statistics are captured first because x_train[fts] is overwritten below.
fts_mean = x_train[fts].mean()
fts_std = x_train[fts].std()
x_train[fts] = (x_train[fts] - fts_mean) / fts_std
x_val[fts] = (x_val[fts] - fts_mean) / fts_std
x_test[fts] = (x_test[fts] - fts_mean) / fts_std

In [16]:
# report (rows, columns) for the train, validation and test feature frames, in that order
for split_frame in (x_train, x_val, x_test):
    print(split_frame.shape)

(199005, 44)
(66335, 44)
(66335, 44)


In [17]:
# setup preprocessing and tokenization for tfidf vectorizer
# Stored as a set: the tokenizer tests every token for membership, which is
# O(1) on a set versus a linear scan of the list nltk returns.
stopwords = set(nltk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punctuation = string.punctuation

# lowercase and remove punctuation
def preprocessing(name):
    """Return ``name`` lowercased with every ``string.punctuation`` character removed."""
    # translation table that maps each punctuation character to None (= delete)
    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    lowered = name.lower()
    return lowered.translate(strip_punctuation)

# tokenize the name and lemmatize its tokens
def tokenizing(name):
    """Tokenize ``name`` with ``word_tokenize`` and return the lemmas of the kept tokens.

    Tokens found in ``stopwords`` and tokens of two characters or fewer are
    discarded; the remaining tokens are lemmatized in their original order.
    """
    lemmas = []
    for token in word_tokenize(name):
        if len(token) <= 2 or token in stopwords:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [18]:
# configure the tfidf vectorizer, then fit it on the training names only
# NOTE(review): scikit-learn documents that a custom `preprocessor` overrides
# the strip_accents/lowercase stage, so strip_accents='ascii' presumably has
# no effect here — confirm.
tfidf_params = dict(
    analyzer='word',
    preprocessor=preprocessing,
    tokenizer=tokenizing,
    strip_accents='ascii',
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=100,           # drop terms that appear in fewer than 100 names
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**tfidf_params)

tfidf = vectorizer.fit_transform(x_train['name'])

In [19]:
# tfidf for training data
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() exists since 1.0. Prefer the new API and fall back
# to the old one so the cell runs on either side of the change.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_terms = vectorizer.get_feature_names_out()
else:
    vocabulary_terms = vectorizer.get_feature_names()
tfidf_words = [("tfidf_" + word) for word in vocabulary_terms]
x_train_tfidf = pd.DataFrame(tfidf.toarray(), columns=tfidf_words)

# give both frames a fresh 0..n-1 index so the column-wise concat pairs rows by position
x_train.reset_index(drop=True, inplace=True)
x_train_tfidf.reset_index(drop=True, inplace=True)

x_train_new = pd.concat([x_train, x_train_tfidf], axis=1)
print(x_train_new.shape)

(199005, 1450)


In [20]:
# validation tfidf features, produced by the already-fitted vectorizer (transform only)
tfidf_val = vectorizer.transform(x_val['name'])
x_val_tfidf = pd.DataFrame(data=tfidf_val.toarray(), columns=tfidf_words).reset_index(drop=True)

# both frames need a 0..n-1 index so the column-wise concat pairs rows by position
x_val = x_val.reset_index(drop=True)

x_val_new = pd.concat((x_val, x_val_tfidf), axis='columns')
print(x_val_new.shape)

(66335, 1450)


In [21]:
# test tfidf features, produced by the already-fitted vectorizer (transform only)
tfidf_test = vectorizer.transform(x_test['name'])
x_test_tfidf = pd.DataFrame(data=tfidf_test.toarray(), columns=tfidf_words).reset_index(drop=True)

# both frames need a 0..n-1 index so the column-wise concat pairs rows by position
x_test = x_test.reset_index(drop=True)

x_test_new = pd.concat((x_test, x_test_tfidf), axis='columns')
print(x_test_new.shape)

(66335, 1450)


In [22]:
# renumber each target Series 0..n-1, in place, discarding the old index labels
for target_split in (y_train, y_val, y_test):
    target_split.reset_index(drop=True, inplace=True)

In [23]:
# drop the raw 'name' text column from each feature frame
x_train_new = x_train_new.drop(columns=['name'])
x_val_new = x_val_new.drop(columns=['name'])
x_test_new = x_test_new.drop(columns=['name'])

In [24]:
# write the training features and targets to CSV without the index column
for train_obj, train_path in ((x_train_new, 'x_train.csv'), (y_train, 'y_train.csv')):
    train_obj.to_csv(train_path, index=False)

In [25]:
# write the validation features and targets to CSV without the index column
for val_obj, val_path in ((x_val_new, 'x_val.csv'), (y_val, 'y_val.csv')):
    val_obj.to_csv(val_path, index=False)

In [26]:
# write the testing features and targets to CSV without the index column
for test_obj, test_path in ((x_test_new, 'x_test.csv'), (y_test, 'y_test.csv')):
    test_obj.to_csv(test_path, index=False)

In [27]:
# display the (rows, columns) of the training feature frame
x_train_new.shape

(199005, 1449)

In [28]:
# below is code that I wrote that we're not using anymore

In [29]:
# one hot encoding
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# main_categories = x_train['main_category']
# print(np.unique(main_categories))
# integer encode
# label_encoder = LabelEncoder()
# integer_encoded = label_encoder.fit_transform(main_categories)
# print(len(integer_encoded))
# binary encode
# onehot_encoder = OneHotEncoder(sparse=False)
# integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
# onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
# print(onehot_encoded.shape)

In [30]:
# turn names into strings, remove punctuation, and tokenize 
# table = str.maketrans(dict.fromkeys(string.punctuation))
# lowercase_names = df['name'].astype(str).str.lower()
# token_lists = [nltk.word_tokenize(name.translate(table)) for name in lowercase_names]

In [31]:
# remove stopwords
# def remove_stopwords(token_list):
#     return [token for token in token_list if not token in stopwords]

# df['name_tokens'] = [remove_stopwords(token_list) for token_list in token_lists]

In [32]:
# create array of all words
# words = [word for word_list in df['name_tokens'] for word in word_list]

In [33]:
# word to vec
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove_input_file = 'glove.6B.100d.txt'
# word2vec_output_file = 'glove.6B.100d.txt.word2vec'
# glove2word2vec(glove_input_file, word2vec_output_file)