In [1]:
# Import libraries and functions
import pandas as pd
import numpy as np
import chardet
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.sql.functions import *


In [2]:
# Reading error with UTF-8 unable to decode the file
ks16a = pd.read_csv("/dbfs/FileStore/tables/ks_projects_201612-284ce.csv")

In [3]:
# Code based on https://www.kaggle.com/rtatman/data-cleaning-challenge-character-encodings
# Use chardet to detect character encoding: chardet shows Windows-1252 encoding

with open("/dbfs/FileStore/tables/ks_projects_201612-284ce.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))
    
print(result)


In [4]:
# Read CSV file with encolding Windows-1252
ks16a = pd.read_csv("/dbfs/FileStore/tables/ks_projects_201612-284ce.csv", encoding='Windows-1252')



In [5]:
ks16a.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,


In [6]:
ks16a.columns

In [7]:
# Column names have a space, rename column names to have no spaces 
ks16a.columns = ['ID', 'name', 'category', 'main_category', 'currency', 'deadline', \
       'goal', 'launched', 'pledged', 'state', 'backers', 'country', \
       'usd_pledged', 'c_13', 'c_14', 'c_15', 'c_16']
    

In [8]:
# Cast all columns to string
ks16a["ID"] = ks16a["ID"].astype(str)
ks16a["name"] = ks16a["name"].astype(str)
ks16a["category"] = ks16a["category"].astype(str)
ks16a["main_category"] = ks16a["main_category"].astype(str)
ks16a["currency"] = ks16a["currency"].astype(str)
ks16a["deadline"] = ks16a["deadline"].astype(str)
ks16a["goal"] = ks16a["goal"].astype(str)
ks16a["launched"] = ks16a["launched"].astype(str)
ks16a["pledged"] = ks16a["pledged"].astype(str)
ks16a["state"] = ks16a["state"].astype(str)
ks16a["backers"] = ks16a["backers"].astype(str) 
ks16a["country"] = ks16a["country"].astype(str)                                              
ks16a["usd_pledged"] = ks16a["usd_pledged"].astype(str)
ks16a["c_13"] = ks16a["c_13"].astype(str)
ks16a["c_14"] = ks16a["c_14"].astype(str)
ks16a["c_15"] = ks16a["c_15"].astype(str)
ks16a["c_16"] = ks16a["c_16"].astype(str)

In [9]:
# Create a dataframe in Spark
ks16 = spark.createDataFrame(ks16a)

In [10]:
# Import functions/datatypes for timestamp, integer, and double
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Drop extra columns
ks16 = ks16.drop('c_13','c_14','c_15','c_16')


# Cast from string to integer and double
ks16 = ks16.withColumn("ID", ks16['ID'].cast(IntegerType()))
ks16 = ks16.withColumn("goal", ks16['goal'].cast(IntegerType()))
ks16 = ks16.withColumn("pledged", ks16['pledged'].cast(DoubleType()))
ks16 = ks16.withColumn("backers", ks16['backers'].cast(IntegerType()))
ks16 = ks16.withColumn("usd_pledged", ks16['usd_pledged'].cast(DoubleType()))


In [11]:
from pyspark.sql.functions import isnan, when, count, col
ks16.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks16.columns]).show()

In [12]:
# Drop all nulls from the data frame
ks16 = ks16.dropna()

In [13]:
import time
import datetime

# Convert from string to date
ks16 = ks16.withColumn("deadline", to_date(unix_timestamp("deadline", "yyyy-MM-dd").cast("timestamp")))
ks16 = ks16.withColumn("launched", to_date(unix_timestamp("launched", "yyyy-MM-dd").cast("timestamp")))

In [14]:
display(ks16.take(5))

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000,2015-08-11,0.0,failed,0,GB,0.0
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000,2013-01-12,220.0,failed,3,US,220.0
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000,2012-03-17,1.0,failed,1,US,1.0
1000011046,Community Film Project: The Art of Neighborhood Filmmaking,Film & Video,Film & Video,USD,2015-08-29,19500,2015-07-04,1283.0,canceled,14,US,1283.0
1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000,2016-02-26,52375.0,successful,224,US,52375.0


In [15]:
# No issues with decoding errors
ks18a = pd.read_csv("/dbfs/FileStore/tables/ks_projects_201801-a566d.csv")
ks18 = spark.createDataFrame(ks18a)

In [16]:
#Reference code: Chapter 5 Big Data Analysis
from pyspark.sql.functions import isnan, when, count, col

ks18.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks18.columns]).show()



In [17]:
# Drop all null rows
ks18 = ks18.dropna()

# Drop extra columns
ks18 = ks18.drop('usd_pledged_real','usd_goal_real')


In [18]:
# Verify there are no nulls
ks18.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks18.columns]).show()

In [19]:
# Convert from string to date
ks18 = ks18.withColumn("deadline", to_date(unix_timestamp("deadline", "yyyy-MM-dd").cast("timestamp")))
ks18 = ks18.withColumn("launched", to_date(unix_timestamp("launched", "yyyy-MM-dd").cast("timestamp")))

In [20]:
display(ks18.take(5))

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11,0.0,failed,0,GB,0.0
1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02,2421.0,failed,15,US,100.0
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12,220.0,failed,3,US,220.0
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17,1.0,failed,1,US,1.0
1000011046,Community Film Project: The Art of Neighborhood Filmmaking,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04,1283.0,canceled,14,US,1283.0


In [21]:
ks = ks16.union(ks18)

# Add the length of the project
ks = ks.withColumn('duration',datediff(ks.deadline,ks.launched))

# Create a view or table
temp_table_name = "ks_projects"
ks.createOrReplaceTempView(temp_table_name)

In [22]:
display(ks.take(5))

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,duration
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11,0.0,failed,0,GB,0.0,59
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12,220.0,failed,3,US,220.0,45
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17,1.0,failed,1,US,1.0,30
1000011046,Community Film Project: The Art of Neighborhood Filmmaking,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04,1283.0,canceled,14,US,1283.0,56
1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26,52375.0,successful,224,US,52375.0,35


In [23]:
ks.schema

In [24]:
%sql

/* Query the created temp table in a SQL cell */

select * from ks_projects limit 5

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,duration
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11,0.0,failed,0,GB,0.0,59
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12,220.0,failed,3,US,220.0,45
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17,1.0,failed,1,US,1.0,30
1000011046,Community Film Project: The Art of Neighborhood Filmmaking,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04,1283.0,canceled,14,US,1283.0,56
1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26,52375.0,successful,224,US,52375.0,35


In [25]:
pd.options.display.float_format = '{:,.2f}'.format

In [26]:
# Use a different dataframe to transform during analysis (create restore point)
ks1 = ks

In [27]:
ks1 = ks1.select('main_category', 'goal', 'state', 'backers', 'country', 'usd_pledged','duration')

In [28]:
display(ks1.take(10))

main_category,goal,state,backers,country,usd_pledged,duration
Publishing,1000.0,failed,0,GB,0.0,59
Film & Video,45000.0,failed,3,US,220.0,45
Music,5000.0,failed,1,US,1.0,30
Food,50000.0,successful,224,US,52375.0,35
Food,1000.0,successful,16,US,1205.0,20
Food,25000.0,failed,40,US,453.0,45
Publishing,2500.0,failed,0,CA,0.0,30
Music,12500.0,successful,100,US,12700.0,30
Crafts,5000.0,failed,0,US,0.0,30
Games,200000.0,failed,0,US,0.0,45


In [29]:
ks1 = ks1.filter("state == 'successful' or state == 'failed'")

In [30]:
from pyspark.ml.classification import  RandomForestClassifier
from pyspark.ml.feature import CountVectorizer,StringIndexer, OneHotEncoderEstimator, VectorAssembler, VectorSlicer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit


In [31]:
train_data,test_data=ks1.randomSplit([0.7,0.3], seed=123)
display(train_data.take(5))


main_category,goal,state,backers,country,usd_pledged,duration
Art,1.0,successful,1,US,1.0,33
Art,3.0,successful,7,US,79.0,30
Art,4.0,successful,12,US,205.0,30
Art,5.0,failed,2,US,2.0,30
Art,5.0,failed,2,US,2.0,30


In [32]:
display(test_data.take(5))

main_category,goal,state,backers,country,usd_pledged,duration
Art,1.0,successful,17,GB,228.26419409,30
Art,1.0,successful,37,US,796.0,30
Art,1.0,successful,78,US,834.0,18
Art,5.0,successful,3,US,31.0,30
Art,5.0,successful,50,US,121.0,3


In [33]:
# Convert the categorical columns to hold numerical data
category_indexer = StringIndexer(inputCol='main_category',outputCol='category_index',handleInvalid='keep')
state_indexer = StringIndexer(inputCol='state',outputCol='state_index',handleInvalid='keep')
country_indexer = StringIndexer(inputCol='country',outputCol='country_index',handleInvalid='keep')

# Vector assembler is used to create a vector of input features
vector_assembler = VectorAssembler(inputCols=['category_index','country_index','goal','backers','duration','usd_pledged'],
                            outputCol="features")

In [34]:
rf_model = RandomForestClassifier(labelCol="state_index", featuresCol="features", seed = 321,
                            numTrees=100, cacheNodeIds = True, subsamplingRate = 0.7)

In [35]:
pipe = Pipeline(stages=[category_indexer,state_indexer,country_indexer,vector_assembler, rf_model])
fitted_pipe=pipe.fit(train_data)
rf1_results=fitted_pipe.transform(test_data)

In [36]:
rf1_results.select(['state_index','rawPrediction','prediction','probability']).show()

In [37]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [38]:
fit_model = rf_model.fit(rf1_results.select(['features','state_index']))

ACC_evaluator = MulticlassClassificationEvaluator(
    labelCol="state_index", predictionCol="prediction", metricName="accuracy")

In [39]:
# Print the model
rfModel = fitted_pipe.stages[4]
print(rfModel) 
# Print the accuracy and the test data error
accuracy = ACC_evaluator.evaluate(rf1_results)*100
print("\nAccuracy = %g" % (accuracy)," %\n")
print("Test Error = %g" % (100 - accuracy)," %\n")

In [40]:
fitted_pipe.stages[-1].featureImportances

In [41]:
# function from https://www.timlrx.com/2018/06/19/feature-selection-using-feature-importance-score-creating-a-pyspark-estimator/
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

In [42]:
# Use defined function to extract the important features in the data 
ExtractFeatureImp(fitted_pipe.stages[-1].featureImportances, rf1_results , "features")

Unnamed: 0,idx,name,vals,score
1,3,backers,,0.52
3,5,usd_pledged,,0.29
0,2,goal,,0.17
4,0,category_index,"[Film & Video, Music, Publishing, Games, Techn...",0.01
2,4,duration,,0.0
5,1,country_index,"[US, GB, CA, AU, DE, NL, FR, IT, ES, SE, NZ, D...",0.0
