<a href="https://colab.research.google.com/github/jcestevezc/Machine-Learning-Big-Data/blob/master/Machine%20Learning%20on%20Spark/Introduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning on Spark

## 0. Methodology CRISP-DM and ASUM-DM

![CRISP](https://github.com/jcestevezc/Machine-Learning-Big-Data/blob/master/Machine%20Learning%20on%20Spark/CRISP.png?raw=true)


![ASUM](https://github.com/jcestevezc/Machine-Learning-Big-Data/blob/master/Machine%20Learning%20on%20Spark/ASUM.jpg?raw=true)



### 1.1. Creating the Spark session and context

In [1]:
# ! pip install pyspark



In [2]:
# Import the PySpark module
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [3]:
# Create the SparkSession object first. The master and application name of an
# already-running SparkContext cannot be changed by the builder, so the
# session must be built before any context exists for these settings to apply.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

# Reuse the SparkContext owned by the session instead of creating a separate one
sc = spark.sparkContext

In [4]:
# Print the version string reported by the active SparkSession
print(spark.version)

3.0.0


### Read data from CSV file (inferring schema) [Indice](#Indice)

In [7]:
# Letting Spark infer the schema is not the best choice for large data sets
flights = (
    spark.read
    .options(sep=',', header=True, inferSchema=True, nullValue='NA')
    .csv('/content/sample_data/flights_small.csv')
)

### Read data from CSV file (defining schema)

In [None]:
from pyspark import SparkFiles
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Specify column names and types
schema = StructType([
    StructField("id", IntegerType()),
    StructField("text", StringType()),
    StructField("label", IntegerType())
])

# raw.githubusercontent.com paths have no "blob/" segment (that segment only
# appears in github.com page URLs).
# NOTE(review): confirm the folder name in this path matches the repository layout.
sms_url = 'https://raw.githubusercontent.com/jcestevezc/Machine-Learning-Big-Data/master/Machine%20Learning%20with%20PySpark/sms.csv'

# The CSV reader expects a filesystem path rather than a web URL, so let Spark
# download the file first and then read the local copy.
spark.sparkContext.addFile(sms_url)

# Load data from a delimited file
sms = spark.read.csv('file://' + SparkFiles.get('sms.csv'), sep=';', header=False, schema=schema)

# Print schema of DataFrame
sms.printSchema()

### Exploring data

In [None]:
# Count the rows once, then report the total
n_records = flights.count()
print("The data contain %d records." % n_records)

In [None]:
# Display the first five rows of the DataFrame
flights.show(n=5)

In [None]:
# Summary statistics, transposed so that each DataFrame column becomes a row
flights.describe().toPandas().T

In [None]:
# Check column data types: lists every column name together with its Spark type
flights.dtypes

### Selecting columns

In [None]:
# Build a new DataFrame without the columns that are not needed
unwanted_cols = ['distance', 'hour']
flights_drop = flights.drop(*unwanted_cols)

In [None]:
# Keep only the listed columns
flights_select = flights.select(['year', 'month'])

### Filtering out missing data

In [None]:
# Number of rows whose arrival delay is missing
flights.where('arr_delay is null').count()

In [None]:
# Keep only the rows that have an arrival delay, i.e. discard the rows
# where 'arr_delay' is missing
flights_no_miss = flights.filter('arr_delay is not null')

In [None]:
# Discard every row that contains a missing value in any column
flights_no_miss_2 = flights.na.drop()

In [None]:
# The session is intentionally NOT stopped at this point: the cells below still
# use `spark` and the `flights` DataFrame, and stopping the session here makes
# all of them fail when the notebook is run from top to bottom.
# Call `spark.stop()` only after the last cell of the notebook has finished.

### Column manipulation

In [None]:
# Import the functions module under an alias so that Spark's `round`
# does not shadow Python's built-in `round`
from pyspark.sql import functions as F

# NOTE(review): earlier cells reference 'distance' and 'arr_delay', while this
# cell uses 'mile' and 'delay' — confirm the column names against the loaded CSV.
flights_km = (
    flights
    # Convert 'mile' to 'km' (rounded to 0 decimals) and drop 'mile' column
    .withColumn('km', F.round(F.col('mile') * 1.60934, 0))
    .drop('mile')
    # Create 'label' column indicating whether flight delayed (1) or not (0)
    .withColumn('label', (F.col('delay') >= 15).cast('integer'))
)

# Check first five records
flights_km.show(5)

### Indexing categorical data

In [None]:
from pyspark.ml.feature import StringIndexer

# The indexers work on `flights_km` (the output of the column-manipulation
# cell) rather than on the raw `flights` frame, so that the derived 'km' and
# 'label' columns are carried forward to the later stages.

# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights_km)

# Indexer creates a new column with numeric index values
flights_carrier_indexed = indexer_model.transform(flights_km)

# Repeat the process for the other categorical feature
flights_indexed = (
    StringIndexer(inputCol='org', outputCol='org_idx')
    .fit(flights_carrier_indexed)
    .transform(flights_carrier_indexed)
)

### Assembling columns

In [None]:
from pyspark.ml.feature import VectorAssembler

# Create an assembler object that packs the predictor columns into a single
# vector column named 'features'
assembler = VectorAssembler(
    inputCols=[
        'mon',
        'dom',
        'dow',
        'carrier_idx',
        'org_idx',
        'km',
        'depart',
        'duration',
    ],
    outputCol='features',
)

# Consolidate predictor columns. The indexed frame is used because
# 'carrier_idx' and 'org_idx' exist only there, not on the raw `flights` frame.
# NOTE(review): 'km' is created on `flights_km`; confirm it is present in
# `flights_indexed` before running this cell.
flights_assembled = assembler.transform(flights_indexed)

### Train/test split

In [None]:
# Split into training and testing sets in a 80:20 ratio.
# The assembled frame is split (not the raw `flights` frame) because it is the
# one that carries the 'features' column produced by the assembler.
flights_train, flights_test = flights_assembled.randomSplit([0.8, 0.2], seed=17)

# Check that training set has around 80% of records
training_ratio = flights_train.count() / flights_assembled.count()
print(training_ratio)

### Build a Decision Tree

In [None]:
# Bring in the decision-tree estimator
from pyspark.ml.classification import DecisionTreeClassifier

# Build the classifier (column names spelled out; they equal the defaults)
# and train it on the training split
tree = DecisionTreeClassifier(featuresCol='features', labelCol='label')
tree_model = tree.fit(flights_train)

# Score the testing split and inspect a few untruncated rows of the result
prediction = tree_model.transform(flights_test)
prediction.select(['label', 'prediction', 'probability']).show(n=5, truncate=False)

### Evaluate the Decision Tree

In [None]:
# Tabulate label/prediction combinations (the confusion matrix)
prediction.groupBy('label', 'prediction').count().show()

# Count each cell of the confusion matrix separately
TN = prediction.where('prediction = 0 AND label = prediction').count()
TP = prediction.where('prediction = 1 AND label = prediction').count()
FN = prediction.where('prediction = 0 AND label != prediction').count()
FP = prediction.where('prediction = 1 AND label != prediction').count()

# Accuracy is the share of correct predictions among all counted predictions
n_correct = TN + TP
n_total = TN + TP + FN + FP
accuracy = n_correct / n_total
print(accuracy)