##Analysis Questions
##### Can you categorize the factors for a sucessful and unsuccesful project? Classification
##### Can you predict if a project will succeed or fail? Logistic Regression
##### Can you predict how much a backer will pledge to a project based on various factors? Linear Regression
##### What are the most important factors for a successful project and unsuccessful project?

In [2]:
import pandas as pd
import numpy as np
import chardet

In [3]:
# Reading error with UTF-8 unable to decode the file
ks16a = pd.read_csv("/dbfs/FileStore/tables/ks_projects_201612-284ce.csv")

In [4]:
# Code based on https://www.kaggle.com/rtatman/data-cleaning-challenge-character-encodings
# Use chardet to detect character encoding: chardet shows Windows-1252 encoding

with open("/dbfs/FileStore/tables/ks_projects_201612-284ce.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))
    
print(result)


In [5]:
# Read CSV file with encolding Windows-1252
ks16a = pd.read_csv("/dbfs/FileStore/tables/ks_projects_201612-284ce.csv", encoding='Windows-1252')



In [6]:
ks16a.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,


In [7]:
ks16a.columns

In [8]:
# Column names have a space, rename column names to have no spaces 
ks16a.columns = ['ID', 'name', 'category', 'main_category', 'currency', 'deadline', \
       'goal', 'launched', 'pledged', 'state', 'backers', 'country', \
       'usd_pledged', 'c_13', 'c_14', 'c_15', 'c_16']
    

In [9]:
# Cast all columns to string
ks16a["ID"] = ks16a["ID"].astype(str)
ks16a["name"] = ks16a["name"].astype(str)
ks16a["category"] = ks16a["category"].astype(str)
ks16a["main_category"] = ks16a["main_category"].astype(str)
ks16a["currency"] = ks16a["currency"].astype(str)
ks16a["deadline"] = ks16a["deadline"].astype(str)
ks16a["goal"] = ks16a["goal"].astype(str)
ks16a["launched"] = ks16a["launched"].astype(str)
ks16a["pledged"] = ks16a["pledged"].astype(str)
ks16a["state"] = ks16a["state"].astype(str)
ks16a["backers"] = ks16a["backers"].astype(str) 
ks16a["country"] = ks16a["country"].astype(str)                                              
ks16a["usd_pledged"] = ks16a["usd_pledged"].astype(str)
ks16a["c_13"] = ks16a["c_13"].astype(str)
ks16a["c_14"] = ks16a["c_14"].astype(str)
ks16a["c_15"] = ks16a["c_15"].astype(str)
ks16a["c_16"] = ks16a["c_16"].astype(str)

In [10]:
# Create a dataframe in Spark
ks16 = spark.createDataFrame(ks16a)

In [11]:
# Import functions/datatypes for timestamp, integer, and double
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Drop extra columns
ks16 = ks16.drop('c_13','c_14','c_15','c_16')


# Cast from string to integer and double
ks16 = ks16.withColumn("ID", ks16['ID'].cast(IntegerType()))
ks16 = ks16.withColumn("goal", ks16['goal'].cast(IntegerType()))
ks16 = ks16.withColumn("pledged", ks16['pledged'].cast(DoubleType()))
ks16 = ks16.withColumn("backers", ks16['backers'].cast(IntegerType()))
ks16 = ks16.withColumn("usd_pledged", ks16['usd_pledged'].cast(DoubleType()))


In [12]:
from pyspark.sql.functions import isnan, when, count, col
display(ks16.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks16.columns]))

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged
0,0,0,0,0,0,632,0,624,0,623,0,4413


In [13]:
# Drop all nulls from the data frame
ks16 = ks16.dropna()

In [14]:
from pyspark.sql.functions import isnan, when, count, col
display(ks16.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks16.columns]))

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged
0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
import time
import datetime

# Convert Date/Time to just Date
ks16 = ks16.withColumn("deadline", to_date(unix_timestamp("deadline", "yyyy-MM-dd").cast("timestamp")))
ks16 = ks16.withColumn("launched", to_date(unix_timestamp("launched", "yyyy-MM-dd").cast("timestamp")))

In [16]:
display(ks16).take(5)

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000,2015-08-11,0.0,failed,0,GB,0.0
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000,2013-01-12,220.0,failed,3,US,220.0
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000,2012-03-17,1.0,failed,1,US,1.0
1000011046,Community Film Project: The Art of Neighborhood Filmmaking,Film & Video,Film & Video,USD,2015-08-29,19500,2015-07-04,1283.0,canceled,14,US,1283.0
1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000,2016-02-26,52375.0,successful,224,US,52375.0
1000023410,Support Solar Roasted Coffee & Green Energy! SolarCoffee.co,Food,Food,USD,2014-12-21,1000,2014-12-01,1205.0,successful,16,US,1205.0
1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000,2016-02-01,453.0,failed,40,US,453.0
1000034518,SPIN - Premium Retractable In-Ear Headphones with Mic,Product Design,Design,USD,2014-05-29,125000,2014-04-24,8233.0,canceled,58,US,8233.0
100004195,STUDIO IN THE SKY - A Documentary Feature Film (Canceled),Documentary,Film & Video,USD,2014-08-10,65000,2014-07-11,6240.57,canceled,43,US,6240.57
100004721,Of Jesus and Madmen,Nonfiction,Publishing,CAD,2013-10-09,2500,2013-09-09,0.0,failed,0,CA,0.0


In [17]:
# No issues with decoding errors
ks18a = pd.read_csv("/dbfs/FileStore/tables/ks_projects_201801-a566d.csv")
# Column names have a space, rename column names to have no spaces 
ks18a.columns = ['ID', 'name', 'category', 'main_category', 'currency', 'deadline', \
       'goal', 'launched', 'pledged', 'state', 'backers', 'country', \
       'usd_pledged', 'usd_pledged_real', 'usd_goal_real']

ks18 = spark.createDataFrame(ks18a)

In [18]:
#Reference code: Chapter 5 Big Data Analysis
from pyspark.sql.functions import isnan, when, count, col

ks18.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks18.columns]).show()



In [19]:
# Drop all null rows
ks18 = ks18.dropna()

# Drop extra columns
ks18 = ks18.drop('usd_pledged_real','usd_goal_real')


In [20]:
# Verify there are no nulls
ks18.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks18.columns]).show()

In [21]:
# Convert date/time to date

ks18 = ks18.withColumn('deadline',to_date(unix_timestamp('deadline','yyyy-MM-dd').cast("timestamp"))) 
ks18 = ks18.withColumn('launched',to_date(unix_timestamp('launched','yyyy-MM-dd').cast("timestamp")))

In [22]:
display(ks18).take(5)

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11,0.0,failed,0,GB,0.0
1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02,2421.0,failed,15,US,100.0
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12,220.0,failed,3,US,220.0
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17,1.0,failed,1,US,1.0
1000011046,Community Film Project: The Art of Neighborhood Filmmaking,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04,1283.0,canceled,14,US,1283.0
1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26,52375.0,successful,224,US,52375.0
1000023410,Support Solar Roasted Coffee & Green Energy! SolarCoffee.co,Food,Food,USD,2014-12-21,1000.0,2014-12-01,1205.0,successful,16,US,1205.0
1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01,453.0,failed,40,US,453.0
1000034518,SPIN - Premium Retractable In-Ear Headphones with Mic,Product Design,Design,USD,2014-05-29,125000.0,2014-04-24,8233.0,canceled,58,US,8233.0
100004195,STUDIO IN THE SKY - A Documentary Feature Film (Canceled),Documentary,Film & Video,USD,2014-08-10,65000.0,2014-07-11,6240.57,canceled,43,US,6240.57


In [23]:
ks = ks18.union(ks16)
# Add the length of the project
ks = ks.withColumn('duration',datediff(ks.deadline,ks.launched))

In [24]:
display(ks).take(5)

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,duration
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11,0.0,failed,0,GB,0.0,59
1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02,2421.0,failed,15,US,100.0,60
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12,220.0,failed,3,US,220.0,45
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17,1.0,failed,1,US,1.0,30
1000011046,Community Film Project: The Art of Neighborhood Filmmaking,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04,1283.0,canceled,14,US,1283.0,56
1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26,52375.0,successful,224,US,52375.0,35
1000023410,Support Solar Roasted Coffee & Green Energy! SolarCoffee.co,Food,Food,USD,2014-12-21,1000.0,2014-12-01,1205.0,successful,16,US,1205.0,20
1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01,453.0,failed,40,US,453.0,45
1000034518,SPIN - Premium Retractable In-Ear Headphones with Mic,Product Design,Design,USD,2014-05-29,125000.0,2014-04-24,8233.0,canceled,58,US,8233.0,35
100004195,STUDIO IN THE SKY - A Documentary Feature Film (Canceled),Documentary,Film & Video,USD,2014-08-10,65000.0,2014-07-11,6240.57,canceled,43,US,6240.57,30


In [25]:
# Importing libraries to visualize the data
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker

ksp=ks.toPandas()


In [26]:
ksp.dtypes


In [27]:
ksp.info()

In [28]:
# Format Pandas to display float with 2 decimals
pd.options.display.float_format = '{:,.2f}'.format

In [29]:
# Descriptive statistics on kickstarter data
ksp.describe()

Unnamed: 0,ID,goal,pledged,backers,usd_pledged,duration
count,694188.0,694188.0,694188.0,694188.0,694188.0,694188.0
mean,1074728555.73,48660.43,9296.76,104.92,7409.81,34.63
std,619246061.86,1169751.31,93286.87,925.01,81477.01,68.75
min,5971.0,0.0,0.0,0.0,0.0,1.0
25%,537722510.0,2000.0,30.0,2.0,20.26,30.0
50%,1075550449.5,5250.0,615.0,12.0,456.68,30.0
75%,1610536778.25,16000.0,4022.07,56.0,3262.08,38.0
max,2147476221.0,100000000.0,20338986.27,219382.0,20338986.27,16739.0


In [30]:
# Distinct values for each column
for i in ks.columns:
    print(i, ks.select(i).distinct().count())

In [31]:
# Explore the total amount pledged for different countries  
ks_usd=ks.select('country','usd_pledged').groupby('country').sum('usd_pledged').sort('country',ascending=True).toPandas()
ks_usd.columns=['country','total pledged']
# Explore the number of projects for each states of the project 
ks_state=ks.groupby('state').count().sort('count',ascending=True).toPandas()

In [32]:
sns.barplot(x='state', y='count', data=ks_state).set_title("Project state distribution")
plt.show();

In [33]:
fig,ax =plt.subplots(1,2,figsize=(25,10))
# X axis which is sum(usd_pledged) is in bilions and we can see that US has the highest amount pledged around 4.4 billions followed by GB which is just 0.3 billion
sns.barplot(x='country', y='total pledged', data=ks_usd, ax=ax[0]).set_title("Country and total pledged in USD")
sns.barplot(x='state', y='count', data=ks_state, ax=ax[1]).set_title("Project state distribution")
plt.subplots_adjust(wspace=0.5)
display(fig)


In [34]:
# Explore the numbers of backers by varios main category
ks_backers=ks.select('main_category','backers').groupby('main_category').agg(count('backers')).sort('count(backers)',ascending=False).toPandas()
ks_duration=ks.select('main_category','duration').groupby('main_category').agg(avg('duration')).sort('avg(duration)',ascending=False).toPandas()

In [35]:
fig,ax =plt.subplots(1,2,figsize=(25,10))
sns.barplot(x='count(backers)', y='main_category', data=ks_backers,ax=ax[0]).set_title("Project category distribution by no. of backers ")
sns.barplot(x='avg(duration)', y='main_category', data=ks_duration,ax=ax[1]).set_title("Project category distribution by sum of duration ")
plt.subplots_adjust(wspace=0.5)
display(fig)

In [36]:
sns.barplot(x='count(backers)', y='main_category', data=ks_backers,ax=ax[0]).set_title("Project category distribution by no. of backers ")
plt.show();

In [37]:
# Explore the total amount pledged for each year
ksyear = ks.withColumn("launched", ks['launched'].cast('string'))
ksyear = ks.withColumn("launched", ks['launched'].substr(0,4))
# Select usd_pledge and launched and convert to Pandas dataframe
ks_year=ksyear.select('usd_pledged','launched').groupby('launched').sum('usd_pledged').sort('launched').toPandas()
#Remove 1970 data sum(usd_pledged) is zero
ks_year = ks_year.iloc[1:, :]

In [38]:
fig,ax=plt.subplots(figsize=(25,10))
sns.lineplot(x='launched', y='sum(usd_pledged)', data=ks_year).set_title("USD distribution as per the year")
display(fig)

In [39]:
# Filter successful projects and failed projects
failed = ksp.loc[ksp.state=='failed']
successful = ksp.loc[ksp.state=='successful']
 
x = successful.main_category.value_counts().index
y = successful.main_category.value_counts().values
data = {'Category': x,'Count':y}

# Create DataFrame on successful projects and main category
success = pd.DataFrame(data)

x = failed.main_category.value_counts().index
y = failed.main_category.value_counts().values
data = {'Category': x,'Count':y}

# Create DataFrame on failed projects and main category
fail = pd.DataFrame(data)

In [40]:
fig,ax =plt.subplots(1,2,figsize=(25,10))
sns.barplot(y='Category', x='Count', data=success, ax=ax[0] ).set_title("Successful Projects by Main Category")
sns.barplot(y='Category', x='Count', data=fail, ax=ax[1]).set_title("Failed Projects by Main Category")
plt.subplots_adjust(wspace=0.5)
display(fig)

In [41]:
x = successful.category.value_counts()[:10].index
y = successful.category.value_counts()[:10].values
data = {'Category': x,'Count':y}

# Create DataFrame on successful projects and main category
success_cat = pd.DataFrame(data)

x = failed.category.value_counts()[:10].index
y = failed.category.value_counts()[:10].values
data = {'Category': x,'Count':y}

# Create DataFrame on failed projects and main category
fail_cat = pd.DataFrame(data)

In [42]:
fig,ax =plt.subplots(1,2,figsize=(25,10))
sns.barplot(y='Category', x='Count', data=success_cat, ax=ax[0]).set_title("Top 10 Successful Projects")
sns.barplot(y='Category', x='Count', data=fail_cat, ax=ax[1]).set_title("Top 10 Failed Projects")
plt.subplots_adjust(wspace=0.5)
# control x and y limits
#plt.xlim(0, 20000)
#plt.ylim(0, 20000)
display(fig)

In [43]:
mean_success_cat = successful.groupby(['main_category']).mean()

x = mean_success_cat.index
y = mean_success_cat.duration.value_counts().index
data = {'Category': x,'Ave':y}

# Create DataFrame on successful projects and main category
mean_success_dur = pd.DataFrame(data)

x = mean_success_cat.index
y = mean_success_cat.backers.value_counts().index
data = {'Category': x,'Ave':y}

# Create DataFrame on successful projects and main category
mean_success_bac = pd.DataFrame(data)

In [44]:
fig,ax =plt.subplots(1,2,figsize=(25,10))
sns.barplot(y='Category', x='Ave', data=mean_success_dur, ax=ax[0] ).set_title("Average Project Length")
sns.barplot(y='Category', x='Ave', data=mean_success_bac, ax=ax[1]).set_title("Average Backers")
plt.subplots_adjust(wspace=0.2)
#plt.xticks(y_pos, mean_success_bac.Category, rotation=60)
display(fig)