In [1]:
import pandas as pd  
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
%matplotlib inline

In [2]:
# Read the first worksheet of the bank workbook and discard the column that is
# literally named '.'; the result is bound to `dataset` in a single expression.
dataset = (
    pd.read_excel('bank.xlsx', sheet_name='Sheet1')
    .drop(columns='.')
)
dataset.head()

Unnamed: 0,Account No,DATE,TRANSACTION DETAILS,CHQ.NO.,VALUE DATE,WITHDRAWAL AMT,DEPOSIT AMT,BALANCE AMT
0,409000611074',2017-06-29,TRF FROM Indiaforensic SERVICES,,2017-06-29,,1000000.0,1000000.0
1,409000611074',2017-07-05,TRF FROM Indiaforensic SERVICES,,2017-07-05,,1000000.0,2000000.0
2,409000611074',2017-07-18,FDRL/INTERNAL FUND TRANSFE,,2017-07-18,,500000.0,2500000.0
3,409000611074',2017-08-01,TRF FRM Indiaforensic SERVICES,,2017-08-01,,3000000.0,5500000.0
4,409000611074',2017-08-16,FDRL/INTERNAL FUND TRANSFE,,2017-08-16,,500000.0,6000000.0


In [3]:
# Relabel the columns positionally with identifier-friendly names, then persist
# the frame as CSV (header row kept, index column omitted).
dataset.columns = [
    'Account_No',
    'DATE',
    'TRANSACTION_DETAILS',
    'CHQ_NO',
    'VALUE_DATE',
    'WITHDRAWAL_AMT',
    'DEPOSIT_AMT',
    'BALANCE_AMT',
]
dataset.to_csv('bank.csv', header=True, index=False)

In [4]:
from pyspark import SparkConf, SparkContext

# Only one SparkContext can be active at a time, so constructing one directly
# raises an error when this cell is executed a second time in the same kernel.
# getOrCreate() returns the already-running context if there is one and only
# builds a new context (using the configuration below) otherwise, which keeps
# the cell safe to re-run.
spark_conf = SparkConf().setMaster('local[*]').setAppName('Spark_App')
sc = SparkContext.getOrCreate(spark_conf)

## A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.

### Creating RDDs

There are two ways to create an RDD in PySpark. The first is to parallelize an in-memory Python collection, such as a list:

In [5]:
# Distribute a small list of (name, number) tuples as an RDD.
# Note that the key 'Amber' appears twice.
data = sc.parallelize([
    ('Amber', 22),
    ('Alfred', 23),
    ('Skye', 4),
    ('Albert', 12),
    ('Amber', 9),
])

In [6]:
# RDD of the integers 0 (inclusive) to 5 (exclusive), collected to the driver.
sc.range(0, 5).collect()

[0, 1, 2, 3, 4]

In [7]:
# Spread 0..5 over three partitions, then run a job that squares the elements
# of every partition and returns the concatenated results.
myRDD = sc.parallelize(range(6), numSlices=3)
sc.runJob(
    myRDD,
    lambda partition: [value * value for value in partition],
)

[0, 1, 4, 9, 16, 25]

In [8]:
# Same squaring job, but restricted to partitions 0 and 2, so the elements
# held by partition 1 are skipped.
myRDD = sc.parallelize(range(6), numSlices=3)
sc.runJob(
    myRDD,
    lambda partition: [value * value for value in partition],
    partitions=[0, 2],
    allowLocal=True,
)

[0, 1, 16, 25]

In [9]:
# count() is an action that returns the number of elements in the RDD.
(
    sc.parallelize([2, 3, 4])
    .count()
)

3

In [10]:
# filter() keeps only the elements for which the predicate is true;
# here that means the even numbers.
rdd = sc.parallelize([1, 2, 3, 4, 5])
(
    rdd
    .filter(lambda value: value % 2 == 0)
    .collect()
)

[2, 4]

In [11]:
# intersection() returns the elements that are present in both RDDs.
rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
(
    rdd1
    .intersection(rdd2)
    .collect()
)

[1, 2, 3]

In [12]:
# Left outer join on the key: every pair from `x` is kept, and keys that have
# no match in `y` are paired with None. The result is sorted for a stable display.
x = sc.parallelize([
    ("a", 1),
    ("b", 4),
])
y = sc.parallelize([
    ("a", 2),
])
sorted(
    x.leftOuterJoin(y).collect()
)

[('a', (1, 2)), ('b', (4, None))]

In [13]:
# map() turns each element into an (element, 1) pair; the collected result is
# sorted so the display order is deterministic.
rdd = sc.parallelize(["b", "a", "c"])
sorted(
    rdd.map(lambda letter: (letter, 1)).collect()
)

[('a', 1), ('b', 1), ('c', 1)]

In [14]:
# Pair RDD whose values are lists of fruit names.
x = sc.parallelize([
    ("a", ["apple", "banana", "lemon"]),
    ("b", ["grapes"]),
])


def f(x):
    """Return the number of items in ``x``."""
    return len(x)


# mapValues() applies `f` to each value and leaves the keys untouched.
x.mapValues(f).collect()

[('a', 3), ('b', 1)]

In [15]:
# mean() is an action that returns the arithmetic mean of the RDD's elements.
(
    sc.parallelize([1, 2, 3])
    .mean()
)

2.0

In [16]:
# union() concatenates the two RDDs; duplicates are kept, so unioning an RDD
# with itself yields every element twice.
rdd = sc.parallelize([1, 1, 2, 3])
(
    rdd
    .union(rdd)
    .collect()
)

[1, 1, 2, 3, 1, 1, 2, 3]

#### Schema

RDDs are *schema-less* data structures.

In [17]:
# A single RDD can hold elements of different types (here a tuple, a dict and
# a list). collect() brings them back to the driver as a plain Python list.
data_heterogenous = sc.parallelize([
    ('Ferrari', 'fast'),
    {'Porsche': 100000},
    ['Spain', 'visited', 4504],
]).collect()
data_heterogenous

[('Ferrari', 'fast'), {'Porsche': 100000}, ['Spain', 'visited', 4504]]

Because `.collect()` returns an ordinary Python list, you can access the data in the object as you would normally do in Python.

In [18]:
# Element 1 of the collected list is the dict; look up its 'Porsche' key.
data_heterogenous[1]['Porsche']

100000