### Kickstarter dataset project - Simple training

In [None]:
import pandas as pd
import plotly as plt
import numpy as np

pd.options.display.max_rows = 4000

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

import plotly.express as px 
import plotly.subplots as tls
import plotly
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

SEED = 42

<a id='Q1'></a>

# Reading and cleaning up the dataframe

### I am omitting this step for the sake of clarity. This part would read the original dataset, clean it up, and add relevant features where the original ones are not suitable.

# Going from the final dataframe

In [None]:
df = pd.read_csv('out2.zip', compression='zip')

In [None]:
df.shape

In [None]:
# Remove the original-currency amount columns
# (presumably superseded by usd_goal / real_usd_pledged -- confirm against the cleaning step).
raw_amount_columns = ['goal', 'pledged', 'usd pledged']
df = df.drop(columns=raw_amount_columns)

In [None]:
# Summary statistics for every column.
# `deadline` / `launched` are still plain strings here (they are only parsed with
# pd.to_datetime in a later cell), so `datetime_is_numeric=True` had no effect;
# the keyword was removed in pandas 2.0 and raises TypeError there, so it is dropped.
df.describe(include='all')

In [None]:
# Cast the date columns to datetimes and the id/amount columns to numbers.
for date_col in ("deadline", "launched"):
    df[date_col] = pd.to_datetime(df[date_col])

for numeric_col in ("ID", "backers", "real_usd_pledged", "usd_goal"):
    df[numeric_col] = pd.to_numeric(df[numeric_col])

In [None]:
df.dtypes

In [None]:
df.isnull().any()

In [None]:
df[df['country'].isnull()].head()

In [None]:
df[df['country'].isnull()].shape

Let's drop these rows: they have 0 backers, no country and no `usd pledged` value, which suggests an error when the data was collected.

In [None]:
# Keep only the rows that have a country
df = df[df['country'].notnull()]

In [None]:
# Rows labelled 'failed' although the pledged amount reached the goal are inconsistent: drop them
# and renumber the index from 0.
goal_met_but_failed = (df['real_usd_pledged'] >= df['usd_goal']) & (df['state'] == 'failed')
df = df.loc[~goal_met_but_failed].reset_index(drop=True)

In [None]:
df.isnull().any()

In [None]:
df.shape

In [None]:
df.duplicated().sum()

In [None]:
counts = df['name'].value_counts().rename_axis('name').reset_index(name='counts')

In [None]:
duplicate_names = df[df['name'].isin(counts[counts['counts']>1].name.tolist())]

In [None]:
duplicate_names.shape

In [None]:
duplicate_names.sort_values(by=['name']).head()

I'll leave it as it is, but it's interesting to see that some duplicates seem genuine, others seem to be about the same project revamped/relaunched and others are also another rendition of the same project (play at theater and video for instance...).

It would be interesting to know more about the motives and mindset of people creating these projects 'again' (for example, a renewed need for funds), and whether some of them are reboots of past successful projects (possible hoaxes?).

Overall, it still can be integrated in our model as we want to predict the success/failure of a campaign regardless.

## Distribution of goals and pledges

In [None]:
df.dtypes

In [None]:
# # Campaign length
# df['campaign_days'] = df['deadline'] - df['launched']
# df['campaign_days'] = df['campaign_days'].dt.round('d').dt.days # Rounding to nearest days, then showing as number only

# # Launch day of week
# df['launch_day'] = df['launched'].dt.day_name()

# # Deadline day of week
# df['deadline_day'] = df['deadline'].dt.day_name()

# # Launch month
# df['launch_month'] = df['launched'].dt.month_name()

# # Deadline month
# df['deadline_month'] = df['deadline'].dt.month_name()

In [None]:
df.groupby('main_category').main_category.count().sort_values(ascending=False)

In [None]:
# Importing the required libraries
import matplotlib.pyplot as plt
from matplotlib import cm

# Creating a dataframe grouped by category with columns for failed and successful
cat_df = pd.get_dummies(df.set_index('main_category').state).groupby('main_category').sum()
# Share of successful projects per category (counts normalised across each row)
success_share = cat_df.div(cat_df.sum(axis=1), axis=0).successful

# One (series, title) pair per bar chart, in display order
by_category = df.groupby('main_category')
panels = [
    (by_category.category.count(), 'Number of projects'),
    (by_category.usd_goal.median(), 'Median project goal ($)'),
    (by_category.real_usd_pledged.median(), 'Median pledged per project ($)'),
    (success_share, 'Proportion of successful projects'),
    (by_category.backers.median(), 'Median backers per project'),
]

# Plotting
fig, axes = plt.subplots(3, 2, figsize=(12,12))
axes = axes.ravel()
color = cm.CMRmap(np.linspace(0.1,0.8,df.main_category.nunique()))

for ax, (series, title) in zip(axes, panels):
    series.plot(kind='bar', ax=ax, color=color)
    ax.set_title(title)
    ax.set_xlabel('')

# The 3x2 grid has six slots for five charts: hide the leftover axes instead of
# showing an empty frame.
for unused_ax in axes[len(panels):]:
    unused_ax.set_visible(False)

fig.subplots_adjust(hspace=0.6)
plt.show()

In [None]:
# Checking the distributions of continuous features
df[df.describe().columns].hist(figsize=(12,10))

We take the log to better see the distributions as we have outliers in both cases.

In [None]:
df_failed = df[df["state"] == "failed"]
df_sucess = df[df["state"] == "successful"]


def _log_goal_histogram(projects, label):
    """Build a semi-transparent, probability-normalised histogram of log(usd_goal + 1)."""
    return go.Histogram(
        x=np.log(projects['usd_goal']+1),
        opacity=0.60, nbinsx=30, name=label, histnorm='probability'
    )


# Overlay both outcome groups on the same axes so the two distributions can be compared
fig = go.Figure(
    data=[
        _log_goal_histogram(df_failed, 'Goals Failed'),
        _log_goal_histogram(df_sucess, 'Goals Sucessful'),
    ],
    layout=go.Layout(barmode='overlay', title=go.layout.Title(text="Distributions of usd_goal")),
)

iplot(fig)

Based on the above histogram, it seems the failed projects tend to have higher values (so higher goals)

In [None]:
import plotly.express as px
fig = px.box(df, x="main_category", y="usd_goal")
fig.show()

## Feature engineering

Variables for the logistic regression:
* len(name) to take into account the name of the project
* whether the name contains a word written entirely in upper case
* if the name contains ! or ?
* number of words in name
* whether the name contains non-alphanumeric characters
* duration between launch and deadline
* month of launch_date

Others 

* goal in usd
* category (1-hot encoded)
* main category (1-hot encoded)
* country (1-hot encoding)

to predict target variable state

In [None]:
def getDelta(a,b):
    '''Return the number of whole days in ``a - b`` (used as deadline minus launch).'''
    elapsed = a - b
    return elapsed.days

# Duration of the project in whole days (deadline minus launch).
# Vectorised datetime subtraction gives the same values as calling getDelta row by row,
# without a Python-level function call per row.
df['duration'] = (df['deadline'] - df['launched']).dt.days

In [None]:
# Calendar month of the launch date
df['month'] = df['launched'].dt.month
# Launch period as an unpadded "<year>-<month>" string (e.g. "2015-3")
df['year_month'] = df['launched'].map(lambda x: str(x.year) + "-" + str(x.month))

In [None]:
import re

def has_non_chars(name):
    """Flag project names that contain a non-alphanumeric symbol.

    Letters, digits and whitespace are treated as ordinary name content.
    '!' and '?' are ignored as well, because they are captured by the separate
    has_exclamation_interrogation feature. The previous ``isalpha`` check
    flagged digits and the spaces between words, so nearly every multi-word
    name returned 1.

    Args:
        name: the project title. Non-string values (e.g. NaN for a missing
            title) are treated as an empty name.

    Returns:
        1 if at least one other character (punctuation, symbol, ...) is
        present, else 0.
    """
    if not isinstance(name, str):
        return 0
    for c in name:
        if not (c.isalnum() or c.isspace() or c in '!?'):
            return 1
    return 0

def has_exclamation_interrogation(name):
    """Return 1 when the name contains '!' or '?', otherwise 0."""
    return int(any(mark in name for mark in ("!", "?")))

def has_upper(name):
    """Return 1 if any space-separated word of the name is fully upper case, else 0.

    A word only counts when more than one character remains after removing
    non-word characters, so a lone capital such as "A" or "I!" is ignored.
    """
    def _is_shouted(token):
        stripped = re.sub(r'\W+', '', token)
        return token.isupper() and len(stripped) > 1

    return 1 if any(_is_shouted(token) for token in name.split(' ')) else 0

In [None]:
df['len_name'] = df.name.str.len()

In [None]:
df['name_nb_words'] = df.name.apply(lambda x: len(str(x).split(' ')))

In [None]:
df['name_non_chars'] = df.name.apply(has_non_chars)

In [None]:
df['name_has_symbol'] = df.name.apply(has_exclamation_interrogation)

In [None]:
df['name_upper'] = df.name.apply(has_upper)

In [None]:
df['cat_full'] = df[["main_category","category"]].agg('-'.join, axis=1)

In [None]:
df.head()

<a id='Q2'></a>

## I. Let's prepare the dataset to train the model

In [None]:
df.columns

In [None]:
# Modelling frame: leave out the identifier, the free-text name, the raw dates and
# the backers / pledged amounts (presumably because those are only known once the
# campaign has run -- confirm).
excluded_columns = ['ID','name','deadline','launched','year_month', 'backers', 'real_usd_pledged']
ks = df.drop(columns=excluded_columns).copy()

In [None]:
ks.columns

usd_goal is heavily skewed (see the distributions above), so let's replace it with a log-transformed version.

In [None]:
ks['usd_goal_corrected'] = np.log1p(ks['usd_goal'])

In [None]:
# Binary target: 1 for successful campaigns, 0 for failed ones
state_codes = {'successful': 1, 'failed': 0}
ks['state'] = ks['state'].map(state_codes)

## 2. More data exploration

In [None]:
# Correlate the numeric (and boolean) columns only. `ks` still holds string columns
# at this point (country, currency, category, main_category, cat_full), and since
# pandas 2.0 DataFrame.corr() no longer drops them silently but raises instead.
corr = ks.select_dtypes(include=['number', 'bool']).corr()
dims = (16, 10)
fig, ax = plt.subplots(figsize = dims)
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,ax = ax, cmap="Blues")

In [None]:
# Columns removed before modelling:
#   name_nb_words            - highly correlated with len_name
#   currency                 - explained by the country
#   category, main_category  - already encoded in cat_full
redundant_columns = ['name_nb_words', 'currency', 'category', 'main_category']
ks = ks.drop(columns=redundant_columns)

In [None]:
ks.columns

In [None]:
ks.state.value_counts(normalize=True)

We can consider the dataset reasonably balanced, given the roughly 60/40 % split between the two classes.

In [None]:
ks.dtypes

In [None]:
ks.describe(include='all')

## II. Model training

In [None]:
ks.dtypes

In [None]:
ks = pd.get_dummies(ks)

In [None]:
ks.columns #get_dummies create one-hot encoded columns. every unique category gets a column of 1/0

In [None]:
y = ks.state
X = ks.drop(['state','usd_goal'], axis = 1)

In [None]:
# Transforming the data
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=list(X.columns))
X.head()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

In [None]:
# Report the dimensions of each split
split_parts = (
    ('x_train.shape:', X_train),
    ('y_train.shape:', y_train),
    ('x_test.shape :', X_test),
    ('y_test.shape :', y_test),
)
for label, part in split_parts:
    print(label, part.shape)

In [None]:
# Fitting a logistic regression model with default parameters
# NOTE(review): with the many one-hot columns the default iteration limit may
# produce a ConvergenceWarning -- confirm on a fresh run and raise max_iter if so.
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

In [None]:
# Making predictions
y_hat_train = logreg.predict(X_train)
y_hat_test = logreg.predict(X_test)

In [None]:
# Logistic regression scores
print("Logistic regression score for training set:", round(logreg.score(X_train, y_train),5))
print("Logistic regression score for test set:", round(logreg.score(X_test, y_test),5))
print("\nClassification report:")
print(classification_report(y_test, y_hat_test))

In [None]:
import itertools

def plot_cf(y_true, y_pred, class_names=None, model_name=None):
    """Plot a confusion matrix as an annotated heat map on the current figure.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        class_names: optional display names for the tick labels, one per
            class, given in sorted label order. Defaults to the labels
            themselves.
        model_name: optional model name appended to the plot title.
    """
    # Fix the label order explicitly so matrix rows/columns and tick labels always agree
    # (the former `set(y_true)` was unordered and discarded the caller's class_names).
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    cf = confusion_matrix(y_true, y_pred, labels=labels)
    plt.imshow(cf, cmap=plt.cm.Blues)
    # Turn the style's grid off; the old `b=` keyword was removed in matplotlib 3.7.
    plt.grid(False)
    if model_name:
        plt.title("Confusion Matrix: {}".format(model_name))
    else:
        plt.title("Confusion Matrix")
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

    if class_names is None:
        class_names = labels
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)

    # Cells darker than half of the maximum count get white text for readability
    thresh = cf.max() / 2.

    for i, j in itertools.product(range(cf.shape[0]), range(cf.shape[1])):
        plt.text(j, i, cf[i, j], horizontalalignment='center', color='white' if cf[i, j] > thresh else 'black')

    plt.colorbar()

In [None]:
# Confusion matrix
plot_cf(y_test, y_hat_test)

In [None]:
from sklearn.metrics import roc_curve, auc

# Plotting the AUC-ROC
# `logreg` has already been fitted on the training set in an earlier cell, so it is
# scored directly here instead of being trained a second time.
y_score = logreg.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

print('AUC:', round(auc(fpr, tpr),5))

plt.figure(figsize=(10,8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()