In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('Code').getOrCreate()

In [81]:
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import corr

In [4]:
data=spark.read.csv('C:\\Users\\User\\Downloads\\Python-and-Spark-for-Big-Data-master\\Python-and-Spark-for-Big-Data-master\\Spark_for_Machine_Learning\\Linear_Regression\\Ecommerce_Customers.csv',inferSchema=True,header=True)

In [5]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [11]:
data.head(1)[0]

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [12]:
for item in data.head(1)[0]:
    print(item)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [13]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [14]:
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [16]:
assembler=VectorAssembler(inputCols=['Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership'],outputCol='features')

In [17]:
output=assembler.transform(data)

In [20]:
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [21]:
output.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [22]:
final_data=output.select('features','Yearly Amount Spent')

In [23]:
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [24]:
train_data,test_data=final_data.randomSplit([0.7,0.3])

In [26]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                151|
|   mean|  502.4430913387791|
| stddev|   86.5089120619131|
|    min| 256.67058229005585|
|    max|  744.2218671047146|
+-------+-------------------+



In [27]:
lr=LinearRegression(labelCol='Yearly Amount Spent')

In [28]:
lr_model=lr.fit(train_data)

In [29]:
test_results=lr_model.evaluate(test_data)

In [30]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|  -4.31916520507724|
|-3.5779080783403856|
|  7.321063480834255|
|  5.151733440338035|
|-21.191450884900064|
| 19.692811685766458|
| 4.1980674431594025|
|  4.262665908170561|
| 1.2188304110106856|
|  3.281535594867762|
|-13.663610584695562|
|-1.1978487959739823|
|  18.34003849480314|
| -2.193621588698022|
| -17.86049642375275|
|   8.03155438142511|
| -9.145677044464207|
|-2.0569025468068958|
|  -8.79883759828948|
|  8.807918353864011|
+-------------------+
only showing top 20 rows



In [31]:
test_results.rootMeanSquaredError

10.944917051229636

In [32]:
unl_dt=test_data.select('features')

In [33]:
predict=lr_model.transform(unl_dt)

In [34]:
predict.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.8364326747734...|471.82106563206685|
|[30.8794843441274...|493.78450806319506|
|[30.9716756438877...|487.31754627605847|
|[31.0472221394875...|387.34566574868336|
|[31.1239743499119...|508.13850472466584|
|[31.3123495994443...|443.89860634217416|
|[31.3584771924370...|  490.977883006316|
|[31.3662121671876...|426.32621664831436|
|[31.3895854806643...| 408.8507806489722|
|[31.4459724827577...| 481.5954293402608|
|[31.5741380228732...| 558.0728827452824|
|[31.5761319713222...| 542.4244327853023|
|[31.6005122003032...| 460.8328129962938|
|[31.7216523605090...|349.97054822057066|
|[31.8164283341993...| 518.9829879274091|
|[31.8209982016720...|416.64372663178824|
|[31.8279790554652...| 449.1484245914057|
|[31.8530748017465...| 461.3420260091589|
|[31.8854062999117...|  398.902110570765|
|[31.9549038566348...|431.18996158606296|
+--------------------+------------

In [35]:
df=spark.read.csv('C:\\Users\\User\\Downloads\\Python-and-Spark-for-Big-Data-master\\Python-and-Spark-for-Big-Data-master\\Spark_for_Machine_Learning\\Linear_Regression\\cruise_ship_info.csv',inferSchema=True,header=True)

In [36]:
df.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [53]:
from pyspark.ml.feature import StringIndexer

In [54]:
s_assm=StringIndexer(inputCol='Cruise_line',outputCol='Cruise_cat')
Indexer=s_assm.fit(df).transform(df)

In [55]:
Indexer.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_cat|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|       1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|       1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|       1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|       1.0|
|    Elation|   Carnival| 15

In [37]:
df.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [56]:
assm=VectorAssembler(inputCols=['Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density','Cruise_cat'],outputCol='features')

In [58]:
op=assm.transform(Indexer)

In [69]:
final_data = op.select("features", "crew")

In [70]:
train_data,test_data=final_data.randomSplit([0.7,0.3])

In [82]:
df.select(corr('crew','cabins')).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+



In [71]:
lrm=LinearRegression(labelCol='crew')

In [72]:
l_m=lrm.fit(train_data)

In [73]:
result_test=l_m.evaluate(test_data)

In [74]:
result_test.rootMeanSquaredError

0.703465453073099

In [75]:
result_test.r2

0.9650609363513445

In [76]:
result_test.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| -0.3853975578463569|
|-0.08346469828342862|
|-0.47983146426847156|
| -1.3135264387145114|
|  0.8532566773289059|
| -0.7287873897400434|
|-0.24114136576560696|
| -0.6278238338252073|
| -0.7488658342034196|
|  0.8989712617611314|
|-0.28256606365975756|
| 0.14818289212119318|
|-0.22525132879641419|
| -0.7381949639713064|
| -0.7074456492758134|
|  1.2831036948060266|
| -0.2755644509898687|
| -1.3055487647017774|
|  0.7294920298647636|
| 0.04093276103934684|
+--------------------+
only showing top 20 rows



In [77]:
ulb_data=test_data.select('features')

In [78]:
pred=l_m.transform(ulb_data)

In [79]:
pred.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[4.0,220.0,54.0,1...|21.385397557846357|
|[5.0,133.5,39.59,...| 13.21346469828343|
|[6.0,158.0,43.7,1...|14.079831464268471|
|[7.0,89.6,25.5,9....| 11.18352643871451|
|[8.0,91.0,22.44,9...|10.146743322671094|
|[8.0,110.0,29.74,...|12.328787389740043|
|[9.0,59.058,17.0,...| 7.641141365765607|
|[9.0,90.09,25.01,...| 9.317823833825207|
|[9.0,105.0,27.2,8...| 11.42886583420342|
|[9.0,113.0,26.74,...| 11.48102873823887|
|[9.0,116.0,26.0,9...|11.282566063659758|
|[10.0,77.0,20.16,...| 8.851817107878807|
|[10.0,91.62700000...| 9.225251328796414|
|[10.0,105.0,27.2,...|11.418194963971306|
|[10.0,110.0,29.74...|12.307445649275813|
|[10.0,151.4,26.2,...|11.246896305193973|
|[11.0,86.0,21.24,...|  9.57556445098987|
|[11.0,138.0,31.14...|13.155548764701777|
|[12.0,77.104,20.0...| 8.860507970135236|
|[12.0,108.865,27....|10.959067238960653|
+--------------------+------------

In [83]:
churn=spark.read.csv('C:\\Users\\User\\Downloads\\Python-and-Spark-for-Big-Data-master\\Python-and-Spark-for-Big-Data-master\\Spark_for_Machine_Learning\\Logistic_Regression\\customer_churn.csv',inferSchema=True,header=True)

In [84]:
churn.show()

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|     

In [85]:
churn.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [99]:
#churn.groupBy('Company').count().show()

In [98]:
churn.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [100]:
assembler=VectorAssembler(inputCols=['Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites'],outputCol='features')

In [101]:
output_c=assembler.transform(churn)

In [103]:
output_c.select('features').show()

+--------------------+
|            features|
+--------------------+
|[42.0,11066.8,0.0...|
|[41.0,11916.22,0....|
|[38.0,12884.75,0....|
|[42.0,8010.76,0.0...|
|[37.0,9191.58,0.0...|
|[48.0,10356.02,0....|
|[44.0,11331.58,1....|
|[32.0,9885.12,1.0...|
|[43.0,14062.6,1.0...|
|[40.0,8066.94,1.0...|
|[30.0,11575.37,1....|
|[45.0,8771.02,1.0...|
|[45.0,8988.67,1.0...|
|[40.0,8283.32,1.0...|
|[41.0,6569.87,1.0...|
|[38.0,10494.82,1....|
|[45.0,8213.41,1.0...|
|[43.0,11226.88,0....|
|[53.0,5515.09,0.0...|
|[46.0,8046.4,1.0,...|
+--------------------+
only showing top 20 rows



In [104]:
final_dat=output_c.select('features','churn')

In [105]:
train_data,test_data=final_dat.randomSplit([0.7,0.3])

In [106]:
from pyspark.ml.classification import LogisticRegression

In [107]:
lo_reg=LogisticRegression(labelCol='churn')

In [117]:
lo_fit=lo_reg.fit(train_data)

In [118]:
lo_fit.summary.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[25.0,9672.03,0.0...|  0.0|[4.72788523511001...|[0.99123239414596...|       0.0|
|[26.0,8787.39,1.0...|  1.0|[0.90846296558983...|[0.71268553502687...|       0.0|
|[26.0,8939.61,0.0...|  0.0|[6.32293195565026...|[0.99820854120368...|       0.0|
|[27.0,8628.8,1.0,...|  0.0|[5.41524136677862...|[0.99557143540453...|       0.0|
|[28.0,8670.98,0.0...|  0.0|[7.60721680010137...|[0.99950339396359...|       0.0|
|[28.0,9090.43,1.0...|  0.0|[1.70081557685651...|[0.84564122392543...|       0.0|
|[28.0,11204.23,0....|  0.0|[2.01774566485242...|[0.88264770478580...|       0.0|
|[28.0,11245.38,0....|  0.0|[3.77076794172064...|[0.97748426819790...|       0.0|
|[29.0,5900.78,1.0...|  0.0|[4.13163698955419...|[0.98419716649240...|       0.0|
|[29.0,8688.17,1

In [119]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

In [120]:
pred_label=lo_fit.evaluate(test_data)

In [121]:
pred=pred_label.predictions

In [122]:
biny=BinaryClassificationEvaluator(labelCol='churn',rawPredictionCol='prediction')

In [123]:
auc=biny.evaluate(pred)

In [124]:
auc

0.7761622435535478

In [125]:
final_lo_model=lo_reg.fit(final_dat)

In [126]:
new_customers = spark.read.csv('C:\\Users\\User\\Downloads\\Python-and-Spark-for-Big-Data-master\\Python-and-Spark-for-Big-Data-master\\Spark_for_Machine_Learning\\Logistic_Regression\\new_customers.csv',inferSchema=True,
                              header=True)

In [127]:
new_customers.show()

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:13|085 Austin Views ...|Barron-Robertson|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|2016-10-28 05:32:13|922 Wright Branch...|   Sexton-Golden|
|  Taylor Young|32.0|      13147.71|              1| 10.0|      8.0|2012-03-20 00:36:46|Unit 0789 Box 073...|  

In [129]:
final_outputs=assembler.transform(new_customers)

In [130]:
final_outputs.show()

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|            features|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|[37.0,9935.53,1.0...|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|[23.0,7526.94,1.0...|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:13|085 Austin Views ...|Barron-Robertson|[65.0,100.0,1.0,1...|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|2016-10-28 05:32:13|922 Wright Branch...|   

In [132]:
data_final=final_lo_model.transform(final_outputs)

In [134]:
data_final.select('features','prediction').show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[37.0,9935.53,1.0...|       0.0|
|[23.0,7526.94,1.0...|       1.0|
|[65.0,100.0,1.0,1...|       1.0|
|[32.0,6487.5,0.0,...|       1.0|
|[32.0,13147.71,1....|       0.0|
|[22.0,8445.26,1.0...|       1.0|
+--------------------+----------+

