In [1]:
import pandas as pd
import numpy as np
import chardet
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
#Import PySpark libraries 
import pyspark
from pyspark import SparkContext, SparkConf
# Import functions/datatypes for timestamp, integer, and double
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
import datetime
# Code based on https://www.kaggle.com/rtatman/data-cleaning-challenge-character-encodings
# Use chardet to detect character encoding: chardet shows Windows-1252 encoding
with open("/dbfs/FileStore/tables/ks_projects_201612-284ce.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))
    
print(result)

In [2]:
# Read CSV file with encolding Windows-1252
ks16a = pd.read_csv("/dbfs/FileStore/tables/ks_projects_201612-284ce.csv", encoding='Windows-1252')
# Column names have a space, rename column names to have no spaces 
ks16a.columns = ['ID', 'name', 'category', 'main_category', 'currency', 'deadline', \
       'goal', 'launched', 'pledged', 'state', 'backers', 'country', \
       'usd_pledged', 'c_13', 'c_14', 'c_15', 'c_16']    
# Cast all columns to string
ks16a["ID"] = ks16a["ID"].astype(str)
ks16a["name"] = ks16a["name"].astype(str)
ks16a["category"] = ks16a["category"].astype(str)
ks16a["main_category"] = ks16a["main_category"].astype(str)
ks16a["currency"] = ks16a["currency"].astype(str)
ks16a["deadline"] = ks16a["deadline"].astype(str)
ks16a["goal"] = ks16a["goal"].astype(str)
ks16a["launched"] = ks16a["launched"].astype(str)
ks16a["pledged"] = ks16a["pledged"].astype(str)
ks16a["state"] = ks16a["state"].astype(str)
ks16a["backers"] = ks16a["backers"].astype(str) 
ks16a["country"] = ks16a["country"].astype(str)                                              
ks16a["usd_pledged"] = ks16a["usd_pledged"].astype(str)
ks16a["c_13"] = ks16a["c_13"].astype(str)
ks16a["c_14"] = ks16a["c_14"].astype(str)
ks16a["c_15"] = ks16a["c_15"].astype(str)
ks16a["c_16"] = ks16a["c_16"].astype(str)
# Create a dataframe in Spark
ks16 = spark.createDataFrame(ks16a)
# Drop extra columns
ks16 = ks16.drop('c_13','c_14','c_15','c_16')
# Cast from string to integer and double
ks16 = ks16.withColumn("ID", ks16['ID'].cast(IntegerType()))
ks16 = ks16.withColumn("goal", ks16['goal'].cast(IntegerType()))
ks16 = ks16.withColumn("pledged", ks16['pledged'].cast(DoubleType()))
ks16 = ks16.withColumn("backers", ks16['backers'].cast(IntegerType()))
ks16 = ks16.withColumn("usd_pledged", ks16['usd_pledged'].cast(DoubleType()))

In [3]:
#Reference code: Chapter 5 Big Data Analysis
from pyspark.sql.functions import isnan, when, count, col
display(ks16.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks16.columns]))

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged
0,0,0,0,0,0,632,0,624,0,623,0,4413


In [4]:
# Drop all nulls from the data frame
ks16 = ks16.dropna()

In [5]:
#Reference code: Chapter 5 Big Data Analysis
from pyspark.sql.functions import isnan, when, count, col
display(ks16.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks16.columns]))

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged
0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Convert from date/time to just date
ks16 = ks16.withColumn("deadline", to_date(unix_timestamp("deadline", "yyyy-MM-dd").cast("timestamp")))
ks16 = ks16.withColumn("launched", to_date(unix_timestamp("launched", "yyyy-MM-dd").cast("timestamp")))
# No issues with decoding errors
ks18a = pd.read_csv("/dbfs/FileStore/tables/ks_projects_201801-a566d.csv")
ks18a.columns = ['ID', 'name', 'category', 'main_category', 'currency', 'deadline', \
       'goal', 'launched', 'pledged', 'state', 'backers', 'country', \
       'usd_pledged', 'usd_pledged_real','usd_goal_real']
ks18 = spark.createDataFrame(ks18a)

In [7]:
#Reference code: Chapter 5 Big Data Analysis
from pyspark.sql.functions import isnan, when, count, col
ks18.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks18.columns]).show()

In [8]:
# Drop all null rows
ks18 = ks18.dropna()
# Drop extra columns
ks18 = ks18.drop('usd_pledged_real','usd_goal_real')

In [9]:
# Verify there are no nulls
ks18.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in ks18.columns]).show()

In [10]:
# Convert from date/time to just date
ks18 = ks18.withColumn("deadline", to_date(unix_timestamp("deadline", "yyyy-MM-dd").cast("timestamp")))
ks18 = ks18.withColumn("launched", to_date(unix_timestamp("launched", "yyyy-MM-dd").cast("timestamp")))
ks = ks16.union(ks18)
# Add the length of the project
ks = ks.withColumn('duration',datediff(ks.deadline,ks.launched))

In [11]:
ks1 = ks.filter("state == 'successful' or state == 'failed'")
display(ks1.take(5))

ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,duration
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11,0.0,failed,0,GB,0.0,59
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12,220.0,failed,3,US,220.0,45
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17,1.0,failed,1,US,1.0,30
1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26,52375.0,successful,224,US,52375.0,35
1000023410,Support Solar Roasted Coffee & Green Energy! SolarCoffee.co,Food,Food,USD,2014-12-21,1000.0,2014-12-01,1205.0,successful,16,US,1205.0,20


In [12]:
ks1 = ks1.select('main_category', 'goal', 'state', 'backers', 'country', 'usd_pledged','duration')

display(ks1.describe())

summary,main_category,goal,state,backers,country,usd_pledged,duration
count,612554,612554.0,612554,612554.0,612554,612554.0,612554.0
mean,,43156.51625628107,,114.44249486575876,,8098.233586188493,34.06963794212429
stddev,,1085174.8589868369,,980.4828413222216,,86037.88362806017,12.79377076639144
min,Art,0.0,failed,0.0,AT,0.0,1.0
max,Theater,100000000.0,successful,219382.0,US,20338986.27,92.0


In [13]:
# Import the required libraries

from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import CountVectorizer,VectorAssembler,StringIndexer,OneHotEncoderEstimator
from pyspark.ml import Pipeline


In [14]:
# Create a 70-30 train test split

train_data,test_data=ks1.randomSplit([0.7,0.3],seed=123)
display(train_data.take(5))


main_category,goal,state,backers,country,usd_pledged,duration
Art,1.0,successful,1,US,1.0,33
Art,3.0,successful,7,US,79.0,30
Art,4.0,successful,12,US,205.0,30
Art,5.0,failed,2,US,2.0,30
Art,5.0,failed,2,US,2.0,30


In [15]:
display(test_data.take(5))

main_category,goal,state,backers,country,usd_pledged,duration
Art,1.0,successful,17,GB,228.26419409,30
Art,1.0,successful,37,US,796.0,30
Art,1.0,successful,78,US,834.0,18
Art,5.0,successful,3,US,31.0,30
Art,5.0,successful,50,US,121.0,3


In [16]:
# Convert the categorical columns to hold numerical data
category_indexer = StringIndexer(inputCol='main_category',outputCol='category_index',handleInvalid='keep')
state_indexer = StringIndexer(inputCol='state',outputCol='state_index',handleInvalid='keep')
country_indexer = StringIndexer(inputCol='country',outputCol='country_index',handleInvalid='keep')

# Vector assembler is used to create a vector of input features
vector_assembler = VectorAssembler(inputCols=['category_index','state_index','country_index','goal','backers','duration'],
                            outputCol="features")

In [17]:
display(train_data.filter(train_data['state'] == 'successful').take(10))

main_category,goal,state,backers,country,usd_pledged,duration
Art,1.0,successful,1,US,1.0,33
Art,3.0,successful,7,US,79.0,30
Art,4.0,successful,12,US,205.0,30
Art,5.0,successful,5,US,7.0,30
Art,10.0,successful,12,US,120.0,30
Art,10.0,successful,109,US,2645.0,15
Art,15.0,successful,7,NL,84.33524022,30
Art,20.0,successful,1,US,20.0,1
Art,20.0,successful,4,GB,48.55171744,60
Art,20.0,successful,4,US,22.0,10


In [18]:
pipe = Pipeline(stages=[category_indexer,state_indexer,country_indexer,vector_assembler])
fitted_pipe=pipe.fit(train_data)
lr1_train=fitted_pipe.transform(train_data)
LinearReg_model = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, featuresCol="features",labelCol='usd_pledged', predictionCol="prediction")


In [19]:
display(lr1_train.sort(["category_index"],ascending=True))

main_category,goal,state,backers,country,usd_pledged,duration,category_index,state_index,country_index,features
Film & Video,1.0,successful,12,US,352.0,60,0.0,1.0,0.0,"List(1, 6, List(), List(0.0, 1.0, 0.0, 1.0, 12.0, 60.0))"
Film & Video,2000.0,successful,14,US,2681.0,32,0.0,1.0,0.0,"List(1, 6, List(), List(0.0, 1.0, 0.0, 2000.0, 14.0, 32.0))"
Film & Video,2.0,successful,28,US,306.0,30,0.0,1.0,0.0,"List(1, 6, List(), List(0.0, 1.0, 0.0, 2.0, 28.0, 30.0))"
Film & Video,5.0,successful,10,US,185.0,16,0.0,1.0,0.0,"List(1, 6, List(), List(0.0, 1.0, 0.0, 5.0, 10.0, 16.0))"
Film & Video,10.0,failed,0,CA,0.0,30,0.0,0.0,2.0,"List(1, 6, List(), List(0.0, 0.0, 2.0, 10.0, 0.0, 30.0))"
Film & Video,10.0,successful,3,US,16.0,15,0.0,1.0,0.0,"List(1, 6, List(), List(0.0, 1.0, 0.0, 10.0, 3.0, 15.0))"
Film & Video,10.0,successful,3,US,85.0,56,0.0,1.0,0.0,"List(1, 6, List(), List(0.0, 1.0, 0.0, 10.0, 3.0, 56.0))"
Film & Video,15.0,successful,5,US,84.99,30,0.0,1.0,0.0,"List(1, 6, List(), List(0.0, 1.0, 0.0, 15.0, 5.0, 30.0))"
Film & Video,20.0,failed,0,US,0.0,31,0.0,0.0,0.0,"List(0, 6, List(3, 5), List(20.0, 31.0))"
Film & Video,20.0,failed,1,US,1.0,30,0.0,0.0,0.0,"List(1, 6, List(), List(0.0, 0.0, 0.0, 20.0, 1.0, 30.0))"


In [20]:
lr1_model = LinearReg_model.fit(lr1_train.select(['features','usd_pledged']))

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lr1_model.coefficients))
print("Intercept: %s" % str(lr1_model.intercept))
print("numIterations: %d" % lr1_model.summary.totalIterations)
print("objectiveHistory: %s" % str(lr1_model.summary.objectiveHistory))

##### Coefficients Insights: 
Country: US is base --> On average, pledged price decreases by $155.31 if country is not US while keeping other variables constant.
Goal: On average, price pledged increases $2.97 if goal is increased by $10K while keeping other variables constant.
Backers: On average, price pledged increases $57.61 if the number of backers is increased by 1 while keeping other variables constant.
Duration: On average, price pledged increases $65.31 if the number of days to raise money is extented by 1 day while keeping other variables constant.

[-4.064579821372248,3235.6535383194882,-155.31083358344037,0.0002973343341350778,57.60607389886918,65.31295887402514]

[ category          , state           , country           ,goal                 ,backers           ,duration ]

In [22]:
# Based on code from https://databricks.com/blog/2015/06/02/statistical-and-mathematical-functions-with-dataframes-in-spark.html
# Summarize the model over the training set and print out some metrics
print("RMSE: %f" % lr1_model.summary.rootMeanSquaredError)
print("r2: %f" % lr1_model.summary.r2)
#print(lr1_model.tValues)
#print(lr1_model.pValues)


In [23]:
display(lr1_model.summary.residuals.take(5))

residuals
-3483.599459225172
-3555.29762066498
-3717.328287493661
-108.61430751981491
-108.61430751981491


In [24]:
# Code Reference from https://stackoverflow.com/questions/52214404/how-to-get-the-correlation-matrix-of-a-pyspark-data-frame

# Show correlation matrix of the variables 
from pyspark.ml.stat import Correlation

df = lr1_train.drop('main_category','state','country','features')
# convert to vector column first
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=df.columns, outputCol=vector_col)
df_vector = assembler.transform(df).select(vector_col)

# get correlation matrix
matrix = Correlation.corr(df_vector, vector_col)
corr_mat = matrix.collect()[0]["pearson({})".format(vector_col)].values


In [25]:
# Convert Numpy array to Pandas dataframe
py_corr = pd.DataFrame({'goal':[corr_mat[0],corr_mat[7],corr_mat[14], corr_mat[21],corr_mat[28],corr_mat[35], corr_mat[42]],
        'backers':[corr_mat[1],corr_mat[8],corr_mat[15], corr_mat[22],corr_mat[29],corr_mat[36], corr_mat[43]],                
        'usd_pledged':[corr_mat[2],corr_mat[9],corr_mat[16], corr_mat[23],corr_mat[30],corr_mat[37], corr_mat[44]],
        'duration':[corr_mat[3],corr_mat[10],corr_mat[17], corr_mat[24],corr_mat[31],corr_mat[38], corr_mat[45]],
        'category_index':[corr_mat[4],corr_mat[11],corr_mat[18], corr_mat[25],corr_mat[32],corr_mat[39], corr_mat[46]],
        'state_index':[corr_mat[5],corr_mat[12],corr_mat[19], corr_mat[26],corr_mat[33],corr_mat[40], corr_mat[47]],
        'country_index':[corr_mat[6],corr_mat[13],corr_mat[20], corr_mat[27],corr_mat[34],corr_mat[41], corr_mat[48]]}) 
py_corr.index = ['goal','backers','usd_pledged','duration','catgegory_index','state_index','country_index']
# Little or no multicollinearity
py_corr

Unnamed: 0,goal,backers,usd_pledged,duration,category_index,state_index,country_index
goal,1.0,0.005903,0.007313,0.022977,-0.007489,-0.025117,0.015376
backers,0.005903,1.0,0.702597,0.000104,-0.002865,0.113975,-0.009048
usd_pledged,0.007313,0.702597,1.0,0.007689,-0.003393,0.097082,-0.011372
duration,0.022977,0.000104,0.007689,1.0,-0.049202,-0.112746,-0.004579
catgegory_index,-0.007489,-0.002865,-0.003393,-0.049202,1.0,-0.031054,0.038975
state_index,-0.025117,0.113975,0.097082,-0.112746,-0.031054,1.0,-0.061164
country_index,0.015376,-0.009048,-0.011372,-0.004579,0.038975,-0.061164,1.0


In [26]:
fig,ax=plt.subplots(1, 1,figsize=(10,10))
sns.heatmap(py_corr,annot=True,cmap="YlGnBu")
#plt.ylim(0, 3000000)
#plt.xlim(0, 175)
display(fig)

In [27]:
py_residuals = lr1_model.summary.residuals.toPandas()

# Residuals centered around zero, close to a normal curve
fig,ax=plt.subplots()
sns.distplot(py_residuals['residuals'])
#plt.ylim(0, 3000000)
plt.xlim(-750000, 750000)
display(fig)

In [28]:
lr1_test=fitted_pipe.transform(test_data)
results = lr1_model.transform(lr1_test)
display(results.select(['usd_pledged','prediction']).take(10))


usd_pledged,prediction
228.26419409,4055.046931401564
796.0,5362.479242962387
834.0,6940.572766327721
31.0,3403.873919738172
121.0,4347.909503386344
6.0,45.30283531746045
10.0,1851.778163383551
17.0,2654.605318894942
22.0,2750.748791012933
220.0,4940.170725999562


In [29]:
pred1_case = pd.DataFrame({'main_category':['Publishing', 'Publishing', 'Publishing', 'Technology','Technology','Technology'],
    'goal':[1000, 5000, 20000, 1000, 5000, 20000],
    'state':['successful','successful','successful','successful','successful','successful'],
    'backers':[ 100, 500, 2000, 100, 500, 2000],
    'country':['US','US','US','US','US','US'],                      
    'usd_pledged':[0,0,0,0,0,0],
    'duration':[15, 30, 45, 15, 30, 45]}) 
pred1_data=spark.createDataFrame(pred1_case)
display(pred1_data)
                   

main_category,goal,state,backers,country,usd_pledged,duration
Publishing,1000,successful,100,US,0,15
Publishing,5000,successful,500,US,0,30
Publishing,20000,successful,2000,US,0,45
Technology,1000,successful,100,US,0,15
Technology,5000,successful,500,US,0,30
Technology,20000,successful,2000,US,0,45


In [30]:
pred1_test=fitted_pipe.transform(pred1_data)
pred1_results = lr1_model.transform(pred1_test)
display(pred1_results)

main_category,goal,state,backers,country,usd_pledged,duration,category_index,state_index,country_index,features,prediction
Publishing,1000,successful,100,US,0,15,2.0,1.0,0.0,"List(1, 6, List(), List(2.0, 1.0, 0.0, 1000.0, 100.0, 15.0))",8024.458291944686
Publishing,5000,successful,500,US,0,30,2.0,1.0,0.0,"List(1, 6, List(), List(2.0, 1.0, 0.0, 5000.0, 500.0, 30.0))",32047.77157193928
Publishing,20000,successful,2000,US,0,45,2.0,1.0,0.0,"List(1, 6, List(), List(2.0, 1.0, 0.0, 20000.0, 2000.0, 45.0))",119441.03681836544
Technology,1000,successful,100,US,0,15,4.0,1.0,0.0,"List(1, 6, List(), List(4.0, 1.0, 0.0, 1000.0, 100.0, 15.0))",8016.329132301942
Technology,5000,successful,500,US,0,30,4.0,1.0,0.0,"List(1, 6, List(), List(4.0, 1.0, 0.0, 5000.0, 500.0, 30.0))",32039.64241229653
Technology,20000,successful,2000,US,0,45,4.0,1.0,0.0,"List(1, 6, List(), List(4.0, 1.0, 0.0, 20000.0, 2000.0, 45.0))",119432.90765872267


In [31]:
Elasticity_pgoal =  0.0002973343341350778 * (32047.77/5000)
print("Elasticity of price wrt goal for $5000 publishing project: ", Elasticity_pgoal)

Elasticity_tgoal =   0.0002973343341350778 * (32039.64/5000) # For technology $5000 goa
print("Elasticity of price wrt goal for $5000 technology project: ", Elasticity_tgoal)

Elasticity_pback =  57.60607389886918 * (32047.77/500) 
print("\nElasticity of price wrt backers for $5000 technology project: ", Elasticity_pback)

Elasticity_tback =   57.60607389886918 * (32039.64/500) 
print("Elasticity of price wrt backers for $5000 publishing project: ", Elasticity_tback)


In [32]:
py_results = results.select(['usd_pledged','prediction']).toPandas()

# Residuals centered around zero, close to a normal curve
fig,ax=plt.subplots(1, 1,figsize=(25,10))
#sns.distplot(py_residuals['residuals'])
ax.legend()
sns.lineplot(y=py_results.usd_pledged.value_counts().index, x=py_results.usd_pledged.value_counts().values, data=py_results, color="blue", label="Actual" ).set_title("Predicted Pledge Amount vs. Actual Pledged Amount")
ax2 = ax.twinx()
sns.lineplot(y=py_results.prediction.value_counts().index, x=py_results.prediction.value_counts().values, data=py_results, color='red',label="Predicted").set_title("Predicted Pledge Amount vs. Actual Pledged Amount")
#plt.ylim(0, 3000000)
plt.xlim(0, 175)
display(fig)

In [33]:
test_results = lr1_model.evaluate(lr1_test)
print("RMSE: %f" % test_results.rootMeanSquaredError)
print("r2: %f" % test_results.r2)

In [34]:
ptest = test_data.select('backers','usd_pledged').toPandas()
dtest = test_data.select('duration','usd_pledged').toPandas()
gtest = test_data.select('goal','usd_pledged').toPandas()

In [35]:
fig,ax=plt.subplots(figsize=(20,5))
sns.regplot(x='goal', y='usd_pledged', data=gtest).set_title("Goal vs Pledged $USD")
#sns.regplot(x='duration' y='usd_pledged', data=dtest, s = s[1]).set_title("Project Length vs Pledged $USD")
plt.subplots_adjust(left=0.2, wspace=1.0)
#plt.ylim(0, 1000000)
#plt.xlim(0, 20000)
display(fig)

In [36]:
fig,s=plt.subplots(figsize=(20,5))
sns.regplot(x='backers', y='usd_pledged', data=ptest).set_title("G vs Pledged $USD")
#sns.regplot(x='duration' y='usd_pledged', data=dtest, s = s[1]).set_title("Project Length vs Pledged $USD")
plt.subplots_adjust(left=0.2, wspace=1.0)
plt.ylim(0, 3000000)
plt.xlim(0, 20000)
display(fig)

In [37]:
fig,s=plt.subplots(figsize=(20,5))
sns.regplot(x='duration', y='usd_pledged', data=dtest).set_title("Project Length vs Pledged $USD")
plt.subplots_adjust(left=0.2, wspace=1.0)
#plt.ylim(0, 500000)
#plt.xlim(0, 20000)
display(fig)

In [38]:
train2_data = train_data.drop('main_category')
test2_data = test_data.drop('main_category')
display(train2_data.take(5))

goal,state,backers,country,usd_pledged,duration
1.0,successful,1,US,1.0,33
3.0,successful,7,US,79.0,30
4.0,successful,12,US,205.0,30
5.0,failed,2,US,2.0,30
5.0,failed,2,US,2.0,30


In [39]:
display(test2_data.take(5))

goal,state,backers,country,usd_pledged,duration
1.0,successful,17,GB,228.26419409,30
1.0,successful,37,US,796.0,30
1.0,successful,78,US,834.0,18
5.0,successful,3,US,31.0,30
5.0,successful,50,US,121.0,3


In [40]:
state_indexer = StringIndexer(inputCol='state',outputCol='state_index',handleInvalid='keep')
country_indexer = StringIndexer(inputCol='country',outputCol='country_index',handleInvalid='keep')

# Vector assembler is used to create a vector of input features
vector_assembler = VectorAssembler(inputCols=['state_index','country_index','goal','backers','duration'],
                            outputCol="features") 

In [41]:
pipe = Pipeline(stages=[state_indexer,country_indexer,vector_assembler])
fitted_pipe=pipe.fit(train2_data)
lr2_train=fitted_pipe.transform(train2_data)
LinearReg_model = LinearRegression(maxIter=10, regParam=0.3,elasticNetParam=0.8, featuresCol="features", labelCol='usd_pledged',predictionCol="prediction")

In [42]:
lr2_model = LinearReg_model.fit(lr2_train.select(['features','usd_pledged']))

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lr2_model.coefficients))
print("Intercept: %s" % str(lr2_model.intercept))
print("numIterations: %d" % lr2_model.summary.totalIterations)
print("objectiveHistory: %s" % str(lr2_model.summary.objectiveHistory))

In [43]:
# Summarize the model over the training set and print out some metrics
print("RMSE: %f" % lr2_model.summary.rootMeanSquaredError)
print("r2: %f" % lr2_model.summary.r2)


In [44]:
display(lr2_model.summary.residuals.take(5))

residuals
-3487.6467368576778
-3559.161051372077
-3721.191614028081
-111.4140581162476
-111.4140581162476


In [45]:
lr2_test=fitted_pipe.transform(test2_data)
results = lr2_model.transform(lr2_test)
display(results.select(['usd_pledged','prediction']).take(10))

usd_pledged,prediction
228.26419409,4058.6623948069055
796.0,5366.342047731438
834.0,6943.699827922702
31.0,3407.7374340991064
121.0,4350.118539388715
6.0,48.04134580817208
10.0,1854.2944252941568
17.0,2657.550140349297
22.0,2753.9999003704443
220.0,4944.340007400274


In [46]:
test_results = lr2_model.evaluate(lr2_test)
print("RMSE: %f" % test_results.rootMeanSquaredError)
print("r2: %f" % test_results.r2)

In [47]:
train3_data = train2_data.filter(col('state')== 'successful')
test3_data = test2_data.filter(col('state')== 'successful')

In [48]:
state_indexer = StringIndexer(inputCol='state',outputCol='state_index',handleInvalid='keep')
country_indexer = StringIndexer(inputCol='country',outputCol='country_index',handleInvalid='keep')

# Vector assembler is used to create a vector of input features
vector_assembler = VectorAssembler(inputCols=['state_index','country_index','goal','backers','duration'],
                            outputCol="features") 

In [49]:
pipe = Pipeline(stages=[state_indexer,country_indexer,vector_assembler])
fitted_pipe=pipe.fit(train3_data)
lr3_train=fitted_pipe.transform(train3_data)
LinearReg_model = LinearRegression(maxIter=10, regParam=0.3,elasticNetParam=0.8, featuresCol="features", labelCol='usd_pledged',predictionCol="prediction")

In [50]:
lr3_model = LinearReg_model.fit(lr3_train.select(['features','usd_pledged']))

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lr3_model.coefficients))
print("Intercept: %s" % str(lr3_model.intercept))
print("numIterations: %d" % lr3_model.summary.totalIterations)
print("objectiveHistory: %s" % str(lr3_model.summary.objectiveHistory))

In [51]:
# Summarize the model over the training set and print out some metrics
print("RMSE: %f" % lr3_model.summary.rootMeanSquaredError)
print("r2: %f" % lr3_model.summary.r2)


In [52]:
display(lr3_model.summary.residuals.take(5))

residuals
2644.474637623269
2605.915570412906
2480.3833449881054
2632.3283377848043
2389.883037993049


In [53]:
lr3_test=fitted_pipe.transform(test3_data)
results = lr3_model.transform(lr3_test)
display(results.select(['usd_pledged','prediction']).take(10))

usd_pledged,prediction
228.26419409,-4126.5628549982775
796.0,-1025.055960524176
834.0,285.9392914709915
31.0,-2725.574540821721
121.0,-2043.905567542832
10.0,-4185.332323509065
17.0,-3450.563433412369
22.0,-3331.867031885432
220.0,-1349.2170121707795
175.16632629,-4346.009956285249


In [54]:
test_results = lr3_model.evaluate(lr3_test)
print("RMSE: %f" % test_results.rootMeanSquaredError)
print("r2: %f" % test_results.r2)

In [55]:
train4_data = train_data.drop('main_category','country')
test4_data = test_data.drop('main_category','country')
display(train4_data.take(5))

goal,state,backers,usd_pledged,duration
1.0,successful,1,1.0,33
3.0,successful,7,79.0,30
4.0,successful,12,205.0,30
5.0,failed,2,2.0,30
5.0,failed,2,2.0,30


In [56]:
display(test_data.take(5))

main_category,goal,state,backers,country,usd_pledged,duration
Art,1.0,successful,17,GB,228.26419409,30
Art,1.0,successful,37,US,796.0,30
Art,1.0,successful,78,US,834.0,18
Art,5.0,successful,3,US,31.0,30
Art,5.0,successful,50,US,121.0,3


In [57]:
state_indexer = StringIndexer(inputCol='state',outputCol='state_index',handleInvalid='keep')
#category_indexer = StringIndexer(inputCol='main_category',outputCol='category_index',handleInvalid='keep')

# Vector assembler is used to create a vector of input features
vector_assembler = VectorAssembler(inputCols=['state_index','goal','backers','duration'],
                            outputCol="features") 

In [58]:

pipe = Pipeline(stages=[state_indexer,vector_assembler])
fitted_pipe=pipe.fit(train4_data)
lr4_train=fitted_pipe.transform(train4_data)
LinearReg_model = LinearRegression(maxIter=10, regParam=0.3,elasticNetParam=0.8, featuresCol="features", labelCol='usd_pledged',predictionCol="prediction")

In [59]:
lr4_model = LinearReg_model.fit(lr4_train.select(['features','usd_pledged']))

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lr4_model.coefficients))
print("Intercept: %s" % str(lr4_model.intercept))
print("numIterations: %d" % lr4_model.summary.totalIterations)
print("objectiveHistory: %s" % str(lr4_model.summary.objectiveHistory))

In [60]:
# Summarize the model over the training set and print out some metrics
print("RMSE: %f" % lr4_model.summary.rootMeanSquaredError)
print("r2: %f" % lr4_model.summary.r2)

In [61]:
display(lr4_model.summary.residuals.take(5))

residuals
-3405.38058560758
-3475.942328810612
-3637.97623998616
15.152838350733418
15.152838350733418


In [62]:
lr4_test=fitted_pipe.transform(test4_data)
results = lr4_model.transform(lr4_test)
display(results.select(['usd_pledged','prediction']).take(10))

usd_pledged,prediction
228.26419409,4131.008980299922
796.0,5283.143454140327
834.0,6856.702387993231
31.0,3324.5160194734253
121.0,4258.319373578545
6.0,-78.84443623349466
10.0,1764.0566835467544
17.0,2569.548455918832
22.0,2667.589795605185
220.0,4862.726915037516


In [63]:
test_results = lr4_model.evaluate(lr4_test)
print("RMSE: %f" % test_results.rootMeanSquaredError)
print("r2: %f" % test_results.r2)