# Dataset Overview

In [1]:
import pandas as pd

# Quick pandas load of the public dataset for a first look.
# Note: the PySpark cell below rebinds `df` to a Spark DataFrame.
df = pd.read_csv(filepath_or_buffer='public.csv')

# Use PySpark to view the dataset

In [7]:
# Locate the local Spark installation so that `pyspark` can be imported.
# SPARK_HOME (if set) takes precedence; otherwise fall back to the path this
# notebook was originally written against. Edit the fallback to match your
# own machine (Windows / Ubuntu / ...).
# ----
import os

import findspark

findspark.init(os.environ.get('SPARK_HOME', '/home/austin/spark-2.1.0-bin-hadoop2.7'))
import pyspark  # imported only after findspark.init() has located Spark
# ----
from pyspark.sql import SparkSession

# Start (or fetch) the Spark session used by every cell below.
spark = SparkSession.builder.appName("Churn_Modelling").getOrCreate()

# Load the public dataset as a Spark DataFrame; this rebinds `df`.
df = spark.read.csv('public.csv', header=True, inferSchema=True)
df.printSchema()


ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Churn_Modelling, master=local[*]) created by getOrCreate at <ipython-input-3-cf3af1eb4e06>:12 

# Do your work here

In [None]:
# Remember the original column names; a later cell reads `cols` when it
# selects the model-ready dataset.
cols = df.columns

# Row count per target class, then the per-class means.
by_exited = df.groupby("Exited")
by_exited.count().show()
by_exited.mean().show()

In [None]:
from imblearn.over_sampling import SMOTE


In [None]:
from pyspark.ml.feature import Normalizer, VectorAssembler

# Normalizer operates on Vector columns, so wrap the scalar CreditScore
# column in a one-element vector first.
creditScoreAssembler = VectorAssembler(inputCols=["CreditScore"], outputCol="CreditScoreVec")
creditScoreVecDF = creditScoreAssembler.transform(df)

# Normalize each Vector using $L^1$ norm.
# NOTE(review): Normalizer rescales every row's vector to unit norm, so a
# one-element vector always ends up as +/-1.0 (or 0.0). If the goal is to
# rescale CreditScore across rows, StandardScaler or MinMaxScaler is probably
# what is wanted -- confirm the intent.
normalizer = Normalizer(inputCol="CreditScoreVec", outputCol="normCreditScore", p=1.0)
l1NormData = normalizer.transform(creditScoreVecDF)
l1NormData.select("CreditScore", "normCreditScore").show()
# Normalize each Vector using $L^\infty$ norm.
#lInfNormData = normalizer.transform(creditScoreVecDF, {normalizer.p: float("inf")})
#print(l1NormData)


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

categoricalColumns = ["Geography", "Gender"]

# Collect the preprocessing stages for the categorical columns: each column
# gets a StringIndexer ("<col>Index") followed by a OneHotEncoder
# ("<col>classVec"). Nothing is fitted here; the stages run later as part of
# the Pipeline.
stages = []
for columnName in categoricalColumns:
    indexedName = columnName + "Index"
    indexer = StringIndexer(inputCol=columnName, outputCol=indexedName)
    # dropLast=False keeps a vector slot for every category.
    oneHot = OneHotEncoder(inputCol=indexedName, outputCol=columnName + "classVec", dropLast=False)
    stages.extend([indexer, oneHot])

# Manual check of the Gender stages (disabled):
#m = stages[2].fit(df)
#dft = m.transform(df)
#stages[3].transform(dft)["GenderclassVec","GenderIndex"].show()

In [None]:
# Numeric input columns that go into the feature vector as-is.
numericCols = ["Age","Tenure", "Balance", "NumOfProducts", "HasCrCard", "IsActiveMember","EstimatedSalary"]
# Feature vector = one-hot encoded categorical columns + numeric columns.
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
# Re-running this cell must not stack a second assembler onto `stages`
# (two stages would both try to write the "features" column), so drop any
# assembler left from a previous run before appending the new one.
stages = [stage for stage in stages if not isinstance(stage, VectorAssembler)] + [assembler]

In [None]:
from pyspark.ml.classification import LogisticRegression

# Fit the preprocessing stages collected in `stages` and apply them, which
# adds the "features" vector column to the data.
partialPipeline = Pipeline().setStages(stages)
pipelineModel = partialPipeline.fit(df)
preppedDataDF = pipelineModel.transform(df)
#preppedDataDF.show()

# No model is fitted in this cell: training on the full, unsplit data would
# only cost time and the result would be replaced by the model trained on
# `trainingData` in the next cell.

# Keep relevant columns: the assembled feature vector plus every original
# column (taken straight from `df` so this cell does not depend on `cols`).
selectedcols = ["features"] + df.columns
dataset = preppedDataDF.select(selectedcols)
display(dataset)

### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())



In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Logistic-regression baseline: configure the estimator, train it on the
# training split, then score the held-out test split.
lr = LogisticRegression(maxIter=10, featuresCol="features", labelCol="Exited")
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Label, predicted class and class probabilities next to two of the input
# columns, handy for eyeballing individual predictions.
selected = predictions.select("Exited", "prediction", "probability", "EstimatedSalary", "AGE")
#selected.show(100)

# Evaluate the test-set predictions with the evaluator's default metric
# (areaUnderROC per the PySpark docs).
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="Exited")
evaluator.evaluate(predictions)
#testData.groupby("Exited").count().show()
#predictions.groupby("prediction").count().show()

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create an initial RandomForest model. The seed is pinned (same value as the
# train/test split) so the forest, and therefore the reported scores, can be
# reproduced from a fresh kernel.
rf = RandomForestClassifier(labelCol="Exited", featuresCol="features", numTrees=200, seed=100)

# Train model with Training Data
rfModel = rf.fit(trainingData)

# Make predictions on test data using the Transformer.transform() method.
# This rebinds `predictions`, which the F1 cell at the end of the notebook reads.
predictions = rfModel.transform(testData)
#predictions.printSchema()

# Evaluate model
evaluator = BinaryClassificationEvaluator(labelCol="Exited")
evaluator.evaluate(predictions)

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# NOTE: everything between the triple quotes below is a plain string literal,
# not executed code -- the cross-validation experiment is disabled. Before
# re-enabling it, be aware that it references `sqlContext`, which is not
# defined in the visible cells (only `spark` is created), and that it reuses
# `lr`, `evaluator`, `trainingData` and `testData` from earlier cells.
'''
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations
cvModel = cv.fit(trainingData)
# this will likely take a fair amount of time because of the amount of models that we're creating and testing

## Use test set to measure the accuracy of our model on new data
predictions = cvModel.transform(testData)

## cvModel uses the best model found from the Cross Validation
# Evaluate best model
evaluator.evaluate(predictions)

print('Model Intercept: ', cvModel.bestModel.intercept)
weights = cvModel.bestModel.coefficients
weights = [(float(w),) for w in weights]  # convert numpy type to float, and to tuple
weightsDF = sqlContext.createDataFrame(weights, ["Feature Weight"])
display(weightsDF)

# View best model's predictions and probabilities of each prediction class
selected = predictions.select("Exited", "prediction", "probability", "EstimatedSalary", "AGE")
selected.show()
'''

# Evaluation Part

## Load the private dataset (same structure as the public dataset)

In [None]:
# TA takes public dataset as example
df_private = spark.read.csv(
    path='public.csv',
    header=True,
    inferSchema=True,
)

## Do prediction with your PySpark model here

## Print your result in the following format

In [None]:
# Show the first five rows of the expected result layout.
df_private.select(['CustomerId', 'Exited']).show(n=5)

## The TA will use the following code to score your predictions (F1 score)

In [None]:
from sklearn import metrics
import numpy as np

# Collect the true label and the prediction from the SAME DataFrame in a
# single pass so every label stays paired with its own prediction; two
# separate collect() calls on different DataFrames are not guaranteed to
# return rows in the same order. `predictions` carries the original columns
# (including "Exited") alongside the model output.
scored_rows = predictions.select('Exited', 'prediction').collect()

# 1-D arrays of labels / predictions, the shape f1_score expects.
data_array = np.array([row['Exited'] for row in scored_rows])
prediction_array = np.array([row['prediction'] for row in scored_rows])
metrics.f1_score(data_array, prediction_array)

In [None]:
# Preview up to 100 predicted labels, then up to 100 true labels, each next
# to its CustomerId.
predictions.select(['CustomerId', 'prediction']).show(n=100)
testData.select(['CustomerId', 'Exited']).show(n=100)