# Logistic Regression Project

In [22]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Seed handed to the sampling step so the train/test split repeats across runs.
SEED = 1738

# Builder chain: name the application, then obtain the session object.
spark = (
    SparkSession.builder
    .appName("Logistic Regression Project")
    .getOrCreate()
)

In [78]:
# Both CSVs sit in the same course-materials folder. Keeping the folder and the
# reader options in one place means a relocated dataset needs a single edit.
DATA_DIR = "../course_materials/Spark_for_Machine_Learning/Logistic_Regression"
CSV_OPTIONS = {"header": True, "inferSchema": True}

# Labeled historical customers (includes the Churn column).
df = spark.read.csv(f"{DATA_DIR}/customer_churn.csv", **CSV_OPTIONS)

# New customers to score at the end of the notebook (no Churn column).
df_final_test = spark.read.csv(f"{DATA_DIR}/new_customers.csv", **CSV_OPTIONS)

In [4]:
# First look at the labeled data: size, inferred column types, then a row preview.
row_count = df.count()
print(f"Number of rows: {row_count}")

df.printSchema()

# n and truncate are spelled out at their default values.
df.show(n = 20, truncate = True)

Number of rows: 900
root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1

In [75]:
# Preview the new-customer data; n and truncate are spelled out at their defaults.
df_final_test.show(n = 20, truncate = True)

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:13|085 Austin Views ...|Barron-Robertson|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|2016-10-28 05:32:13|922 Wright Branch...|   Sexton-Golden|
|  Taylor Young|32.0|      13147.71|              1| 10.0|      8.0|2012-03-20 00:36:46|Unit 0789 Box 073...|  

<br>

---

<br>

In [16]:
# Print the first 20 Location values untruncated (show() cuts long strings off).
for record in df.take(20):
    print(record["Location"])

10265 Elizabeth Mission Barkerburgh, AK 89518
6157 Frank Gardens Suite 019 Carloshaven, RI 17756
1331 Keith Court Alyssahaven, DE 90114
13120 Daniel Mount Angelabury, WY 30645-4695
765 Tricia Row Karenshire, MH 71730
6187 Olson Mountains East Vincentborough, PR 74359
4846 Savannah Road West Justin, IA 87713-3460
25271 Roy Expressway Suite 147 Brownport, FM 59852-6150
3725 Caroline Stravenue South Christineview, MA 82059
363 Sandra Lodge Suite 144 South Ann, WI 51655-7561
Unit 8120 Box 9160 DPO AA 43432
Unit 1895 Box 0949 DPO AA 40249
897 Kelley Overpass Suite 349 West Rebekahport, AZ 44793
11488 Weaver Cape Hernandezberg, WI 63417-8544
1774 Peter Row Apt. 712 New Autumn, MT 18782
45408 David Path East Kimberlyshire, HI 54903-6698
28216 Wright Mount Apt. 356 Alichester, DE 40999-2369
Unit 4948 Box 4814 DPO AP 42669
69203 Crosby Divide Apt. 878 Parkerview, CO 87064
9569 Caldwell Crescent Tanyaborough, RI 30637


In [13]:
# Cardinality of the two free-text columns, computed in a single aggregation.
distinct_counts = [
    F.countDistinct("Location").alias("num_locations"),
    F.countDistinct("Company").alias("num_companies"),
]
df.agg(*distinct_counts).show()

+-------------+-------------+
|num_locations|num_companies|
+-------------+-------------+
|          900|          873|
+-------------+-------------+



In [28]:
# One summary row per Churn value: class size plus the mean of each numeric
# column. The mean of the 0/1 Account_Manager flag is the share of customers
# that have an account manager.
churn_profile = [
    F.count("Churn").alias("count"),
    F.mean("Age").alias("mean_age"),
    F.mean("Total_Purchase").alias("mean_purchase"),
    F.mean("Account_Manager").alias("prop_with_acct_manager"),
    F.mean("Years").alias("mean_years"),
    F.mean("Num_Sites").alias("mean_num_sites"),
    F.mean(F.year("Onboard_date")).alias("mean_onboard_year"),
]
df.groupBy("Churn").agg(*churn_profile).show()

+-----+-----+-----------------+------------------+----------------------+------------------+-----------------+------------------+
|Churn|count|         mean_age|     mean_purchase|prop_with_acct_manager|        mean_years|   mean_num_sites| mean_onboard_year|
+-----+-----+-----------------+------------------+----------------------+------------------+-----------------+------------------+
|    1|  150|42.99333333333333|10192.179933333337|                  0.56|5.8835999999999995|            10.66|2010.5133333333333|
|    0|  750|41.58133333333333|10036.952853333332|    0.4653333333333333|5.1510666666666625|8.173333333333334|2010.8586666666667|
+-----+-----+-----------------+------------------+----------------------+------------------+-----------------+------------------+



<br>

---

<br>

In [30]:
# Numeric predictors, in the order they are packed into the feature vector.
feature_cols = ["Age", "Total_Purchase", "Account_Manager", "Years", "Num_Sites"]

# Combines the columns above into a single vector column named "features".
assembler = VectorAssembler(inputCols = feature_cols, outputCol = "features")

# Original columns are kept; "features" is appended.
df_model = assembler.transform(df)
df_model.show(n = 20, truncate = True)

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|            features|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|[42.0,11066.8,0.0...|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|[41.0,11916.22,0....|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|[38.0,12884.75,0....|
|      Phillip White|4

In [31]:
# Hold-out split: sample about 70% of each Churn class for training, seeded so
# the same rows are picked on every run.
df_train = df_model.sampleBy("Churn", fractions = {0: 0.7, 1: 0.7}, seed = SEED)

# Everything not sampled becomes the test set. exceptAll (SQL EXCEPT ALL) removes
# one matching row per training row and keeps remaining duplicates, so
# train + test always adds up to df_model. subtract() is EXCEPT DISTINCT and
# would collapse or drop fully identical rows from the test set.
df_test = df_model.exceptAll(df_train)

# Class balance of each split.
df_train.groupBy("Churn").agg(
    F.count("Churn").alias("count")
).show()

df_test.groupBy("Churn").agg(
    F.count("Churn").alias("count")
).show()

+-----+-----+
|Churn|count|
+-----+-----+
|    1|  110|
|    0|  541|
+-----+-----+

+-----+-----+
|Churn|count|
+-----+-----+
|    1|   40|
|    0|  209|
+-----+-----+



In [63]:
# Estimator configuration only; nothing is trained until fit() is called.
logreg_estimator = LogisticRegression(featuresCol = "features", labelCol = "Churn")

# Train on the training split, then evaluate on the held-out split.
logreg_model = logreg_estimator.fit(df_train)
logreg_results = logreg_model.evaluate(df_test)

# Predictions produced by the evaluation, kept for the AUC evaluator.
df_results = logreg_results.predictions

In [66]:
# Overall accuracy on the held-out split. Read it against the class balance:
# the test counts printed earlier are 209 non-churn vs 40 churn, so always
# predicting "no churn" would already score about 0.84.
logreg_results.accuracy

0.891566265060241

In [71]:
# Per-label metrics from the evaluation summary. Each list lines up
# position-by-position with the printed "Labels" list.
per_label_report = [
    ("Labels", logreg_results.labels),
    ("False Positive Rate", logreg_results.falsePositiveRateByLabel),
    ("True Positive Rate", logreg_results.truePositiveRateByLabel),
    ("Recall", logreg_results.recallByLabel),
]
for metric_name, metric_values in per_label_report:
    print(f"{metric_name}: {metric_values}")

Labels: [0.0, 1.0]
False Positive Rate: [0.4, 0.05263157894736842]
True Positive Rate: [0.9473684210526315, 0.6]
Recall: [0.9473684210526315, 0.6]


In [73]:
# Area under the ROC curve, computed from the held-out predictions against the
# Churn label.
logreg_eval = BinaryClassificationEvaluator(labelCol = "Churn", metricName = "areaUnderROC")

test_auc = logreg_eval.evaluate(df_results)
print(f"Area Under ROC Curve: {test_auc}")

Area Under ROC Curve: 0.8827751196172232


<br>

---

<br>

In [80]:
# Refit the same estimator on every labeled row (df_model holds the full
# dataset, not just the training split) before scoring the new customers.
final_model = logreg_estimator.fit(df_model)

In [81]:
# Build the "features" vector for the new customers with the same assembler
# that prepared the training data, so the column order matches.
df_model_final_test = assembler.transform(df_final_test)

# Score the new customers; the result carries the model's output columns
# (the "probability" and "prediction" columns are displayed next).
final_results = final_model.transform(df_model_final_test)

In [85]:
# Columns for the final report: who the customer is, the model inputs, and the
# model's probability vector and predicted class.
report_cols = [
    "Names",
    "Age",
    "Total_Purchase",
    "Account_Manager",
    "Years",
    "Num_Sites",
    "Company",
    "probability",
    "prediction",
]
final_results.select(report_cols).show()

+--------------+----+--------------+---------------+-----+---------+----------------+--------------------+----------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|         Company|         probability|prediction|
+--------------+----+--------------+---------------+-----+---------+----------------+--------------------+----------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|        King Ltd|[0.90218015921764...|       0.0|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|   Cannon-Benson|[0.00198380259784...|       1.0|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|Barron-Robertson|[0.02255113312433...|       1.0|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|   Sexton-Golden|[0.00608622076714...|       1.0|
|  Taylor Young|32.0|      13147.71|              1| 10.0|      8.0|        Wood LLC|[0.75115056144900...|       0.0|
| Jessica Drake|22.0|       8445.26|              1| 3.4