In [1]:
import pandas as pd
import numpy as np
import chardet
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
# Reading error with UTF-8 unable to decode the file
ks16a = pd.read_csv("/dbfs/FileStore/tables/ks_projects_201612-284ce.csv")

In [3]:
# Code based on https://www.kaggle.com/rtatman/data-cleaning-challenge-character-encodings
# Use chardet to detect character encoding: chardet shows Windows-1252 encoding

with open("/dbfs/FileStore/tables/ks_projects_201612-284ce.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))
    
print(result)


In [4]:
# Read CSV file with encolding Windows-1252
ks16a = pd.read_csv("/dbfs/FileStore/tables/ks_projects_201612-284ce.csv", encoding='Windows-1252')
ks16a.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,


In [5]:
ks16a.columns

In [6]:
# Column names have a space, rename column names to have no spaces 
ks16a.columns = ['ID', 'name', 'category', 'main_category', 'currency', 'deadline', \
       'goal', 'launched', 'pledged', 'state', 'backers', 'country', \
       'usd_pledged', 'c_13', 'c_14', 'c_15', 'c_16']
    

In [7]:
# Cast all columns to string
ks16a["ID"] = ks16a["ID"].astype(str)
ks16a["name"] = ks16a["name"].astype(str)
ks16a["category"] = ks16a["category"].astype(str)
ks16a["main_category"] = ks16a["main_category"].astype(str)
ks16a["currency"] = ks16a["currency"].astype(str)
ks16a["deadline"] = ks16a["deadline"].astype(str)
ks16a["goal"] = ks16a["goal"].astype(str)
ks16a["launched"] = ks16a["launched"].astype(str)
ks16a["pledged"] = ks16a["pledged"].astype(str)
ks16a["state"] = ks16a["state"].astype(str)
ks16a["backers"] = ks16a["backers"].astype(str) 
ks16a["country"] = ks16a["country"].astype(str)                                              
ks16a["usd_pledged"] = ks16a["usd_pledged"].astype(str)
ks16a["c_13"] = ks16a["c_13"].astype(str)
ks16a["c_14"] = ks16a["c_14"].astype(str)
ks16a["c_15"] = ks16a["c_15"].astype(str)
ks16a["c_16"] = ks16a["c_16"].astype(str)

In [8]:
# Create a dataframe in Spark
ks16 = spark.createDataFrame(ks16a)

In [9]:
#Import PySpark libraries 
import pyspark
from pyspark import SparkContext, SparkConf
# Import functions/datatypes for timestamp, integer, and double
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import *


In [10]:
# Drop extra columns
ks16 = ks16.drop('c_13','c_14','c_15','c_16')


# Cast from string to integer and double
ks16 = ks16.withColumn("ID", ks16['ID'].cast(IntegerType()))
ks16 = ks16.withColumn("goal", ks16['goal'].cast(IntegerType()))
ks16 = ks16.withColumn("pledged", ks16['pledged'].cast(DoubleType()))
ks16 = ks16.withColumn("backers", ks16['backers'].cast(IntegerType()))
ks16 = ks16.withColumn("usd_pledged", ks16['usd_pledged'].cast(DoubleType()))


In [11]:
#Reference code: Chapter 5 Big Data Analysis
from pyspark.sql.functions import isnan, when, count, col
ks16.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks16.columns]).show()

In [12]:
# Drop all nulls from the data frame
ks16 = ks16.dropna()

In [13]:
import time
import datetime
# Convert from date/time to just date
ks16 = ks16.withColumn("deadline", to_date(unix_timestamp("deadline", "yyyy-MM-dd").cast("timestamp")))
ks16 = ks16.withColumn("launched", to_date(unix_timestamp("launched", "yyyy-MM-dd").cast("timestamp")))

In [14]:
ks16.show(20)

In [15]:
# No issues with decoding errors
ks18a = pd.read_csv("/dbfs/FileStore/tables/ks_projects_201801-a566d.csv")
ks18a.head()


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [16]:
ks18a.columns

In [17]:
ks18a.columns = ['ID', 'name', 'category', 'main_category', 'currency', 'deadline', \
       'goal', 'launched', 'pledged', 'state', 'backers', 'country', \
       'usd_pledged', 'usd_pledged_real','usd_goal_real']
ks18 = spark.createDataFrame(ks18a)

In [18]:
#Reference code: Chapter 5 Big Data Analysis
from pyspark.sql.functions import isnan, when, count, col

ks18.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks18.columns]).show()



In [19]:
# Drop all null rows
ks18 = ks18.dropna()

# Drop extra columns
ks18 = ks18.drop('usd_pledged_real','usd_goal_real')


In [20]:
# Verify there are no nulls
ks18.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks18.columns]).show()

In [21]:
# # Convert from date/time to just date
ks18 = ks18.withColumn("deadline", to_date(unix_timestamp("deadline", "yyyy-MM-dd").cast("timestamp")))
ks18 = ks18.withColumn("launched", to_date(unix_timestamp("launched", "yyyy-MM-dd").cast("timestamp")))

In [22]:
ks18.show(20)

In [23]:
ks = ks16.union(ks18)

# Add the length of the project
ks = ks.withColumn('duration',datediff(ks.deadline,ks.launched))

In [24]:
display(ks.take(5))

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,duration
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11,0.0,failed,0,GB,0.0,59
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12,220.0,failed,3,US,220.0,45
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17,1.0,failed,1,US,1.0,30
1000011046,Community Film Project: The Art of Neighborhood Filmmaking,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04,1283.0,canceled,14,US,1283.0,56
1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26,52375.0,successful,224,US,52375.0,35


In [25]:
# Create a view or table

temp_table_name = "ks_projects"

ks.createOrReplaceTempView(temp_table_name)

In [26]:
ks.schema

In [27]:
display(ks.select('goal','pledged').describe())

summary,goal,pledged
count,694188.0,694188.0
mean,48660.433803191634,9296.75753340595
stddev,1169751.3133712732,93286.87225251716
min,0.0,0.0
max,100000000.0,20338986.27


In [28]:
# Add the length of the project
ks1 = ks.withColumn('duration',datediff(ks.deadline,ks.launched))

# Create a view or table
temp_table_name = "ks1"
ks1.createOrReplaceTempView(temp_table_name)


In [29]:
ks1 = ks1.filter("state == 'successful' or state == 'failed'")
display(ks1.take(5))

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,duration
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11,0.0,failed,0,GB,0.0,59
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12,220.0,failed,3,US,220.0,45
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17,1.0,failed,1,US,1.0,30
1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26,52375.0,successful,224,US,52375.0,35
1000023410,Support Solar Roasted Coffee & Green Energy! SolarCoffee.co,Food,Food,USD,2014-12-21,1000.0,2014-12-01,1205.0,successful,16,US,1205.0,20


In [30]:
ks1 = ks1.select('main_category', 'goal', 'state', 'backers', 'country', 'usd_pledged','duration')

display(ks1.take(5))

main_category,goal,state,backers,country,usd_pledged,duration
Publishing,1000.0,failed,0,GB,0.0,59
Film & Video,45000.0,failed,3,US,220.0,45
Music,5000.0,failed,1,US,1.0,30
Food,50000.0,successful,224,US,52375.0,35
Food,1000.0,successful,16,US,1205.0,20


In [31]:
#c = ks1.count()
#ks1_sample = ks1.sample(False, 0.1, seed=1221).limit(c)
#display(ks1_sample.take(20))

In [32]:
# Import the required libraries

from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import CountVectorizer,VectorAssembler,StringIndexer,OneHotEncoderEstimator
from pyspark.ml import Pipeline


In [33]:
# Create a 70-30 train test split

train_data,test_data=ks1.randomSplit([0.7,0.3],seed=123)
display(train_data.take(5))


main_category,goal,state,backers,country,usd_pledged,duration
Art,1.0,successful,1,US,1.0,33
Art,3.0,successful,7,US,79.0,30
Art,4.0,successful,12,US,205.0,30
Art,5.0,failed,2,US,2.0,30
Art,5.0,failed,2,US,2.0,30


In [34]:
display(test_data.take(5))

main_category,goal,state,backers,country,usd_pledged,duration
Art,1.0,successful,17,GB,228.26419409,30
Art,1.0,successful,37,US,796.0,30
Art,1.0,successful,78,US,834.0,18
Art,5.0,successful,3,US,31.0,30
Art,5.0,successful,50,US,121.0,3


In [35]:
# Convert the categorical columns to hold numerical data
category_indexer = StringIndexer(inputCol='main_category',outputCol='category_index',handleInvalid='keep')
state_indexer = StringIndexer(inputCol='state',outputCol='state_index',handleInvalid='keep')
country_indexer = StringIndexer(inputCol='country',outputCol='country_index',handleInvalid='keep')

# OneHotEncoderEstimator converts the indexed data into a vector which will be effectively handled by Logistic Regression model
onehot_encoder = OneHotEncoderEstimator(inputCols=['category_index','state_index','country_index'],
                                      outputCols=['category_vec','state_vec','country_vec'],
                                      handleInvalid='keep')

# Vector assembler is used to create a vector of input features
vector_assembler = VectorAssembler(inputCols=['category_index','state_index','country_index','goal','backers','duration'],
                            outputCol="features")

In [36]:
display(train_data.filter(train_data['state'] == 'successful').take(10))

main_category,goal,state,backers,country,usd_pledged,duration
Art,1.0,successful,1,US,1.0,33
Art,3.0,successful,7,US,79.0,30
Art,4.0,successful,12,US,205.0,30
Art,5.0,successful,5,US,7.0,30
Art,10.0,successful,12,US,120.0,30
Art,10.0,successful,109,US,2645.0,15
Art,15.0,successful,7,NL,84.33524022,30
Art,20.0,successful,1,US,20.0,1
Art,20.0,successful,4,GB,48.55171744,60
Art,20.0,successful,4,US,22.0,10


In [37]:
pipe = Pipeline(stages=[category_indexer,state_indexer,country_indexer,onehot_encoder,vector_assembler])
fitted_pipe=pipe.fit(train_data)
lr1_train=fitted_pipe.transform(train_data)
LinearReg_model = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, featuresCol="features",labelCol='usd_pledged', predictionCol="prediction")


In [38]:
lr1_model = LinearReg_model.fit(lr1_train.select(['features','usd_pledged']))

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lr1_model.coefficients))
print("Intercept: %s" % str(lr1_model.intercept))
print("numIterations: %d" % lr1_model.summary.totalIterations)
print("objectiveHistory: %s" % str(lr1_model.summary.objectiveHistory))

In [39]:
# Based on code from https://databricks.com/blog/2015/06/02/statistical-and-mathematical-functions-with-dataframes-in-spark.html
# Summarize the model over the training set and print out some metrics
print("RMSE: %f" % lr1_model.summary.rootMeanSquaredError)
print("r2: %f" % lr1_model.summary.r2)
#print(lr1_model.tValues)
#print(lr1_model.pValues)


In [40]:
display(lr1_model.summary.residuals.take(5))

residuals
-3483.599459225172
-3555.29762066498
-3717.328287493661
-108.61430751981447
-108.61430751981447


In [41]:
py_residuals = lr1_model.summary.residuals.toPandas()

# Residuals centered around zero, close to a normal curve
fig,ax=plt.subplots()
sns.distplot(py_residuals['residuals'])
#sns.
#plt.ylim(0, 3000000)
plt.xlim(-750000, 750000)
display(fig)

In [42]:
lr1_test=fitted_pipe.transform(test_data)
results = lr1_model.transform(lr1_test)
display(results.select(['usd_pledged','prediction']).take(10))


usd_pledged,prediction
228.26419409,4055.046931401564
796.0,5362.479242962387
834.0,6940.572766327721
31.0,3403.873919738172
121.0,4347.909503386345
6.0,45.30283531746
10.0,1851.7781633835516
17.0,2654.6053188949427
22.0,2750.748791012933
220.0,4940.170725999563


In [43]:
py_results = results.select(['usd_pledged','prediction']).toPandas()

# Residuals centered around zero, close to a normal curve
fig,ax=plt.subplots(figsize=(25,10))
#sns.distplot(py_residuals['residuals'])
sns.lineplot(x='usd_pledged', y='prediction', data=py_results).set_title("Predicted Pledge Amount vs. Actual Pledged Amount")
#plt.ylim(0, 3000000)
#plt.xlim(-750000, 750000)
display(fig)

In [44]:
test_results = lr1_model.evaluate(lr1_test)
print("RMSE: %f" % test_results.rootMeanSquaredError)
print("r2: %f" % test_results.r2)

In [45]:
ptest = test_data.select('backers','usd_pledged').toPandas()
dtest = test_data.select('duration','usd_pledged').toPandas()
gtest = test_data.select('goal','usd_pledged').toPandas()

In [46]:
fig,ax=plt.subplots(figsize=(20,5))
sns.regplot(x='goal', y='usd_pledged', data=gtest).set_title("Goal vs Pledged $USD")
#sns.regplot(x='duration' y='usd_pledged', data=dtest, s = s[1]).set_title("Project Length vs Pledged $USD")
plt.subplots_adjust(left=0.2, wspace=1.0)
#plt.ylim(0, 1000000)
#plt.xlim(0, 20000)
display(fig)

In [47]:
fig,s=plt.subplots(figsize=(20,5))
sns.regplot(x='backers', y='usd_pledged', data=ptest).set_title("G vs Pledged $USD")
#sns.regplot(x='duration' y='usd_pledged', data=dtest, s = s[1]).set_title("Project Length vs Pledged $USD")
plt.subplots_adjust(left=0.2, wspace=1.0)
plt.ylim(0, 3000000)
plt.xlim(0, 20000)
display(fig)

In [48]:
fig,s=plt.subplots(figsize=(20,5))
sns.regplot(x='duration', y='usd_pledged', data=dtest).set_title("Project Length vs Pledged $USD")
plt.subplots_adjust(left=0.2, wspace=1.0)
#plt.ylim(0, 500000)
#plt.xlim(0, 20000)
display(fig)

In [49]:
train2_data = train_data.drop('main_category')
test2_data = test_data.drop('main_category')
display(train2_data.take(5))

goal,state,backers,country,usd_pledged,duration
1.0,successful,17,GB,228.26419409,30
1.0,successful,37,US,796.0,30
1.0,successful,78,US,834.0,18
4.0,successful,12,US,205.0,30
5.0,successful,50,US,121.0,3


In [50]:
display(test2_data.take(5))

goal,state,backers,country,usd_pledged,duration
1.0,successful,1,US,1.0,33
3.0,successful,7,US,79.0,30
5.0,failed,2,US,2.0,30
5.0,failed,2,US,2.0,30
5.0,successful,3,US,31.0,30


In [51]:
state_indexer = StringIndexer(inputCol='state',outputCol='state_index',handleInvalid='keep')
country_indexer = StringIndexer(inputCol='country',outputCol='country_index',handleInvalid='keep')

# OneHotEncoderEstimator converts the indexed data into a vector which will be effectively handled by Logistic Regression model
onehot_encoder = OneHotEncoderEstimator(inputCols=['state_index','country_index'],
                                      outputCols=['state_vec','country_vec'],
                                      handleInvalid='keep')


In [52]:
# Vector assembler is used to create a vector of input features
vector_assembler = VectorAssembler(inputCols=['state_index','country_index','goal','backers','duration'],
                            outputCol="features") 
pipe = Pipeline(stages=[state_indexer,country_indexer,onehot_encoder,vector_assembler])
fitted_pipe=pipe.fit(train2_data)
lr2_train=fitted_pipe.transform(train2_data)
LinearReg_model = LinearRegression(maxIter=10, regParam=0.3,elasticNetParam=0.8, featuresCol="features", labelCol='usd_pledged',predictionCol="prediction")

In [53]:
lr2_model = LinearReg_model.fit(lr2_train.select(['features','usd_pledged']))

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lr2_model.coefficients))
print("Intercept: %s" % str(lr2_model.intercept))
print("numIterations: %d" % lr2_model.summary.totalIterations)
print("objectiveHistory: %s" % str(lr2_model.summary.objectiveHistory))

In [54]:
# Summarize the model over the training set and print out some metrics
print("RMSE: %f" % fit_model.summary.rootMeanSquaredError)
print("r2: %f" % fit_model.summary.r2)


In [55]:
display(fit_model.summary.residuals.take(5))

residuals
2644.474637623269
2605.915570412906
2480.3833449881054
2632.3283377848043
2389.883037993049


In [56]:
lr2_test=fitted_pipe.transform(test2_data)
results = lr2_model.transform(lr2_test)
display(results.select(['usd_pledged','prediction']).take(10))

usd_pledged,prediction
1.0,2193.546286016608
79.0,2393.0809218586046
2.0,72.09014314077785
2.0,72.09014314077785
31.0,2142.4603056527103
7.0,2267.770884479787
0.0,-53.21908206565104
2645.0,7901.93412667095
84.33524022,1744.3651275009786
1.0,9.438914589182332


In [57]:
test_results = lr2_model.evaluate(lr2_test)
print("RMSE: %f" % test_results.rootMeanSquaredError)
print("r2: %f" % test_results.r2)

In [58]:
train3_data = train2_data.filter(col('state')== 'successful')
test3_data = test2_data.filter(col('state')== 'successful')

In [59]:
state_indexer = StringIndexer(inputCol='state',outputCol='state_index',handleInvalid='keep')
country_indexer = StringIndexer(inputCol='country',outputCol='country_index',handleInvalid='keep')

# OneHotEncoderEstimator converts the indexed data into a vector which will be effectively handled by Logistic Regression model
onehot_encoder = OneHotEncoderEstimator(inputCols=['state_index','country_index'],
                                      outputCols=['state_vec','country_vec'],
                                      handleInvalid='keep')

In [60]:
# Vector assembler is used to create a vector of input features
vector_assembler = VectorAssembler(inputCols=['state_index','country_index','goal','backers','duration'],
                            outputCol="features") 
pipe = Pipeline(stages=[state_indexer,country_indexer,onehot_encoder,vector_assembler])
fitted_pipe=pipe.fit(train3_data)
lr3_train=fitted_pipe.transform(train3_data)
LinearReg_model = LinearRegression(maxIter=10, regParam=0.3,elasticNetParam=0.8, featuresCol="features", labelCol='usd_pledged',predictionCol="prediction")

In [61]:
lr3_model = LinearReg_model.fit(lr3_train.select(['features','usd_pledged']))

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lr3_model.coefficients))
print("Intercept: %s" % str(lr3_model.intercept))
print("numIterations: %d" % lr3_model.summary.totalIterations)
print("objectiveHistory: %s" % str(lr3_model.summary.objectiveHistory))

In [62]:
# Summarize the model over the training set and print out some metrics
print("RMSE: %f" % lr3_model.summary.rootMeanSquaredError)
print("r2: %f" % lr3_model.summary.r2)


In [63]:
display(lr3_model.summary.residuals.take(5))

residuals
4385.369186009958
2089.0076822862447
570.8225475175077
2888.18595365406
2323.869291418981


In [64]:
lr3_test=fitted_pipe.transform(test_data)
results = lr3_model.transform(lr3_test)
display(results.select(['usd_pledged','prediction']).take(10))

usd_pledged,prediction
1.0,-3116.412413104455
79.0,-2962.5030720659665
2.0,-3239.417445218478
2.0,-3239.417445218478
31.0,-3183.71420455339
7.0,-3072.307723223213
0.0,-3346.81935111633
2645.0,1815.2742157534449
84.33524022,-13453.087069020055
1.0,-3283.106959586591


In [65]:
test_results = lr3_model.evaluate(lr3_test)
print("RMSE: %f" % test_results.rootMeanSquaredError)
print("r2: %f" % test_results.r2)

In [66]:
train4_data = train_data.drop('country')
test4_data = test_data.drop('country')
display(train2_data.take(5))

In [67]:
state_indexer = StringIndexer(inputCol='state',outputCol='state_index',handleInvalid='keep')
category_indexer = StringIndexer(inputCol='main_category',outputCol='category_index',handleInvalid='keep')


# OneHotEncoderEstimator converts the indexed data into a vector which will be effectively handled by Logistic Regression model
onehot_encoder = OneHotEncoderEstimator(inputCols=['state_index','category_index'],
                                      outputCols=['state_vec','category_vec'],
                                      handleInvalid='keep')

In [68]:
# Vector assembler is used to create a vector of input features
vector_assembler = VectorAssembler(inputCols=['state_index','category_index','goal','backers','duration'],
                            outputCol="features") 
pipe = Pipeline(stages=[state_indexer,category_indexer,onehot_encoder,vector_assembler])
fitted_pipe=pipe.fit(train4_data)
lr4_train=fitted_pipe.transform(train3_data)
LinearReg_model = LinearRegression(maxIter=10, regParam=0.3,elasticNetParam=0.8, featuresCol="features", labelCol='usd_pledged',predictionCol="prediction")

In [69]:
lr4_model = LinearReg_model.fit(lr4_train.select(['features','usd_pledged']))

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lr4_model.coefficients))
print("Intercept: %s" % str(lr4_model.intercept))
print("numIterations: %d" % lr4_model.summary.totalIterations)
print("objectiveHistory: %s" % str(lr4_model.summary.objectiveHistory))

In [70]:
# Summarize the model over the training set and print out some metrics
print("RMSE: %f" % lr4_model.summary.rootMeanSquaredError)
print("r2: %f" % lr4_model.summary.r2)

In [71]:
display(lr4_model.summary.residuals.take(5))

In [72]:
lr4_test=fitted_pipe.transform(test_data)
results = lr4_model.transform(lr4_test)
display(results.select(['usd_pledged','prediction']).take(10))

In [73]:
test_results = lr4_model.evaluate(lr3_test)
print("RMSE: %f" % test_results.rootMeanSquaredError)
print("r2: %f" % test_results.r2)