# Averages

### Mean & Median

In [1]:
sc.version

'3.0.0-preview2'

In [None]:
rdd = sc.parallelize([101]+range(100)+[102,103,104],1000,100)

In [None]:
sum = rdd.sum()
n = rdd.count()
mean = sum/n
print(mean)

In [None]:
rdd.sortBy(lambda x : x).zipWithIndex().map(lambda (value,key) : (key,value)).collect()

In [None]:
sortedAndIndexed = rdd.sortBy(lambda x : x).zipWithIndex().map(lambda (value,key) : (key,value))
n = sortedAndIndexed.count()

if (n % 2 == 1):
    index = (n-1)/2
    print(sortedAndIndexed.lookup(index))
else:
    index1 = (n/2)-1
    index2 = n/2
    value1 = sortedAndIndexed.lookup(index1)[0]
    value2 = sortedAndIndexed.lookup(index2)[0]
    print(value1+value2)/2

Note: the MEDIAN value is more outlier resistant. 

### Standard Deviation

In [10]:
# Example 1
lst = (1,2,4,5,34,1,32,4,34,2,1,3)

rdd = sc.parallelize(lst)

# To calculate de Standard Deviation is necessary the MEAN
sum = rdd.sum()
n = rdd.count()
mean = sum/n
print(mean)


# Calculate Standard Deviation (.map() is to apply a function to each entry in a RDD)
# from math import sqrt
sqrt(rdd.map(lambda x : pow(x-mean,2)).sum()/n)

10.25


13.392317449443418

In [8]:
# Example 2 - Mean and SD
rdd1 = sc.parallelize([49]*100)

# Mean
sum = rdd1.sum()
n = rdd1.count()
mean = sum/n
print(mean)

# SD
sqrt(rdd1.map(lambda x : pow(x-mean,2)).sum()/n)

# Note: The SD is 0 because all values are lying on the mean, so they are not spread at all!

49.0


0.0

In [9]:
# Example 3 - Mean and SD
rdd2 = sc.parallelize([49]*100+[100])

# Mean
sum = rdd2.sum()
n = rdd1.count()
mean = sum/n
print(mean)

# SD
sqrt(rdd2.map(lambda x : pow(x-mean,2)).sum()/n)

#Note: The mean stays the same, but SD increases, since on value now is spread far away from the mean!

50.0


5.0990195135927845

### Skewness

In [53]:
# Example

lst = (1,2,4,5,34,1,32,4,34,2,1,3)

rdd4 = sc.parallelize(lst)

# To calculate de Standard Deviation is necessary the MEAN
sum = rdd4.sum()
n = rdd4.count()
mean = sum/n
print(mean)


# Calculate Standard Deviation in put in s variable
# from math import sqrt
sd = sqrt(rdd4.map(lambda x : pow(x-mean,2)).sum()/n)
print(sd)


# Let's cast and to float since we are using it to create a normalization constant. 
# Make sure it doesn't get rounded to an integer value
n = float(n)

# Now let's calculate the SKEW: (formula)
skewness = 1/n*(rdd4.map(lambda x : pow(x-mean,3)/pow(sd,3)).sum())
print(skewness)

10.25
13.392317449443418
1.1299979110691871


### Kurtosis

In [65]:
# Example 1

lst = (1,2,4,5,34,1,32,4,34,2,1,3)

rdd5 = sc.parallelize(lst)

# To calculate de Standard Deviation is necessary the MEAN
sum = rdd5.sum()
n = rdd5.count()
mean = sum/n
print(mean)


# Calculate Standard Deviation in put in s variable
# from math import sqrt
sd = sqrt(rdd5.map(lambda x : pow(x-mean,2)).sum()/n)
print(sd)


# Let's cast and to float since we are using it to create a normalization constant. 
# Make sure it doesn't get rounded to an integer value
n = float(n)

# Now let's calculate the KURTOSIS: (formula)
kurtosis = 1/n*(rdd5.map(lambda x : pow(x-mean,4)/pow(sd,4)).sum())
print(kurtosis)

10.25
13.392317449443418
2.326135570534074


### Covariance & Correlation

In [68]:
# RDDs
rddX = sc.parallelize(range(100))
rddY = sc.parallelize(range(100))

In [72]:
# Mean 
meanX = rddX.sum()/float(rddX.count())
meanY = rddY.sum()/float(rddY.count())
print(meanX)
print(meanY)

49.5
49.5


In [None]:
# Zip them together (XY)
rddXY = rddX.zip(rddY)
covXY = rddXY.map(lambda (x,y) : (x-meanX)*(y-meanY)).sum()/rddXY.count()
covXY

In [None]:
# Standard Deviation
from math import sqrt
n = rddXY.count()
sdX = sqrt(rddX.map(lambda x : pow(x-meanX,2)).sum()/n)
sdY = sqrt(rddY.map(lambda x : pow(x-meanY,2)).sum()/n)
print(sdX)
print(sdY)

In [None]:
# To calculate the Correlation by dividing the covariance by the product of the standard deviation from 
# each column. 
corrXY = covXY / (sdX * sdY)
corrXY

### Correlation Matrix 

In [None]:
import random
from pyspark.mllib.stat import Statistics

column1 = sc.parallelize(range(100))
column2 = sc.parallelize(range(100,200))
column3 = sc.parallelize(list(reversed(100)))
column4 = sc.parallelize(random.sample(range(100),100))
data = column1.zip(column2).zip(column3).zip(column4).map(lambda (((a,b),c),d) : (a,b,c,d)).map(lambda (a,b,c,d):[a,b,c,d]) #this code braks in Py3.x
Statistics.corr(data)

### Using statistics lib

In [55]:
import statistics 

In [59]:
lst1 = range(101)

In [60]:
media = statistics.mean(lst1)
media

50

In [61]:
mediana = statistics.median(lst1)
mediana

50

In [None]:
# Check the lib
help(statistics)

# End