In [1]:
from pyspark.sql import SparkSession

In [2]:
from pyspark import SparkConf, SparkContext

In [4]:
from pyspark.sql import SQLContext
# Reuse the already-running SparkContext when there is one, so that re-running
# this cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
# SQL entry point built on top of the context; used below to read the CSV.
sqlContext = SQLContext(sc)

In [6]:
# Load the transaction log CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
d = (
    sqlContext.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load('mod11/PS_20174392719_1491204439457_log.csv')
)

In [7]:
# Register the DataFrame as the SQL temp view "Table1". The "OrReplace" variant
# keeps this cell re-runnable: plain createTempView raises if the view exists.
d.createOrReplaceTempView("Table1")

In [9]:
# Print the inferred schema (column name, type, nullability) to check how the CSV was parsed.
d.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



In [11]:
# List the DataFrame's column names.
d.columns

['step',
 'type',
 'amount',
 'nameOrig',
 'oldbalanceOrg',
 'newbalanceOrig',
 'nameDest',
 'oldbalanceDest',
 'newbalanceDest',
 'isFraud',
 'isFlaggedFraud']

In [12]:
# Preview the first row as a quick sanity check of the loaded values.
d.show(1)

+----+-------+-------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|   type| amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+-------+-------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1|PAYMENT|9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
+----+-------+-------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
only showing top 1 row



In [16]:
# Project the DataFrame onto the columns listed here (every loaded column
# except `step`).
my_cols = d.select(
    'type',
    'amount',
    'nameOrig',
    'oldbalanceOrg',
    'newbalanceOrig',
    'nameDest',
    'oldbalanceDest',
    'newbalanceDest',
    'isFraud',
    'isFlaggedFraud',
)

In [17]:
# Discard every row that contains a null in any of the selected columns.
my_final_data = my_cols.dropna()

In [18]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer)

In [19]:
# Two-step encoding of the categorical `type` column:
#   1. map each distinct string to a numeric index (`typeIndex`),
#   2. one-hot encode that index into a vector column (`typeVec`).
type_indexer = StringIndexer(
    inputCol="type",
    outputCol="typeIndex",
)
type_encoder = OneHotEncoder(
    inputCol="typeIndex",
    outputCol="typeVec",
)

In [21]:
# Index the `isFraud` column into `fraudIndex`, then one-hot encode that index
# into `fraudVec`.
# NOTE(review): in the visible cells the classifier uses `isFraud` directly as
# its label and the assembler does not read `fraudIndex`/`fraudVec` -- confirm
# whether these two stages are still needed.
fraud_indexer = StringIndexer(
    inputCol="isFraud",
    outputCol="fraudIndex",
)
fraud_encoder = OneHotEncoder(
    inputCol="fraudIndex",
    outputCol="fraudVec",
)


In [35]:
# Combine the model inputs into the single `features` vector column.
# `typeVec` (the one-hot encoded transaction type produced by `type_encoder`)
# is included so that the encoded category actually reaches the classifier;
# previously it was computed by the pipeline but never assembled.
assembler = VectorAssembler(
    inputCols=[
        'typeVec',
        'amount',
        'oldbalanceOrg',
        'newbalanceOrig',
        'oldbalanceDest',
        'newbalanceDest',
    ],
    outputCol='features',
)

In [36]:
from pyspark.ml.classification import LogisticRegression

In [37]:
from pyspark.ml import Pipeline

In [42]:
# Logistic-regression classifier: reads the assembled `features` vector and
# uses the `isFraud` column as the label.
log_reg_data = LogisticRegression(
    featuresCol="features",
    labelCol="isFraud",
)

In [43]:
# Chain preprocessing and the classifier into one pipeline. Stage order
# matters: each indexer runs before the encoder that consumes its output, and
# the assembler runs before the classifier that reads `features`.
pipeline = Pipeline(
    stages=[
        type_indexer,
        fraud_indexer,
        type_encoder,
        fraud_encoder,
        assembler,
        log_reg_data,
    ]
)

In [44]:
# 70/30 train/test split. A fixed seed makes the split (and therefore every
# downstream metric) reproducible across kernel restarts.
train_data, test_data = my_final_data.randomSplit([0.70, 0.30], seed=42)

In [45]:
# Fit the pipeline on the training split; the result is the fitted model used for scoring.
fit_model = pipeline.fit(train_data)

In [46]:
# Run the fitted pipeline over the held-out test split.
results = fit_model.transform(test_data)

In [47]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [49]:
# Evaluate against the classifier's raw scores rather than the thresholded 0/1
# `prediction` column: with hard predictions the ROC curve collapses to a
# single operating point, which makes the reported AUC misleading.
# `rawPrediction` is the default raw-score output column of LogisticRegression.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='isFraud',
)