In [2]:
from pyspark import SparkContext
import findspark
import numpy as np

sc = SparkContext("local[*]", "Ejercicio1")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/18 19:13:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/18 19:13:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Ejercicio 1

In [3]:
def readFile (filename): 
    '''Arguments: 
    filename – name of the spam dataset file 
    12 columns: 11 features/dimensions (X) + 1 column with labels (Y) 
    Y -- Train labels (0 if normal traffic, 1 if botnet)  
    m rows: number of examples (m) 
    Returns: 
    An RDD containing the data of filename. Each example (row) of the file 
    corresponds to one RDD record. Each record of the RDD is a tuple (X,y).  
    “X” is an array containing the 11 features (float number) of an example  
    “y” is the 12th column of an example (integer 0/1) '''
    result = sc.textFile(filename)
    map_result = result.map(lambda row: [float(x) for x in row.split(",")])
    rdd_xy = map_result.map(lambda row: (row[:11],row[11]))
    return rdd_xy

In [4]:
# Prueba
data=readFile("./botnet_reduced_10k_l.csv")
data.take(2)

[([3545.3018916840147,
   3198.0139469522546,
   80.00015469454229,
   1.0000019205086303,
   444960086.4643011,
   476.8073064086366,
   12.999999992936175,
   -7.710424743123667e-09,
   87.00000148917339,
   19099430.15956688,
   2468368394.7593513],
  0.0),
 ([-1.21308761436012e-06,
   7.999989079213265,
   65499.99887793755,
   0.9999997452701503,
   61.99988768910407,
   69.99980788311223,
   11.999999995904211,
   -2.35688187855132e-08,
   8.00000498340475,
   2468369617.006896,
   2468368392.7377276],
  1.0)]

In [5]:
rows_rdd = data.map(lambda line: line[0])
print(rows_rdd.take(1))
print()
cols_rdd = rows_rdd.flatMap(lambda row: [(i, (x,x*x, 1)) for i, x in enumerate(row)])
print(cols_rdd.take(5))
print()
group_rdd = cols_rdd.reduceByKey(lambda a,b:(a[0]+b[0],a[1]+b[1],a[2]+b[2]))
print(group_rdd.take(1))
print()
mean_rdd = group_rdd.map(lambda t: (t[0],(t[1][0] / t[1][2], np.sqrt((t[1][1] / t[1][2]) - (t[1][0] / t[1][2])**2))))
print(mean_rdd.collect())
print()
broadcast_var = sc.broadcast(dict(mean_rdd.collect()))

[[3545.3018916840147, 3198.0139469522546, 80.00015469454229, 1.0000019205086303, 444960086.4643011, 476.8073064086366, 12.999999992936175, -7.710424743123667e-09, 87.00000148917339, 19099430.15956688, 2468368394.7593513]]

[(0, (3545.3018916840147, 12569165.503178252, 1)), (1, (3198.0139469522546, 10227293.204901138, 1)), (2, (80.00015469454229, 6400.024751150697, 1)), (3, (1.0000019205086303, 1.000003841020949, 1)), (4, (444960086.4643011, 1.979894785463183e+17, 1))]

[(0, (12803754.219009735, 42093817354.54738, 10000))]

[(0, (1280.3754219009734, np.float64(1603.1283524554865))), (2, (6731.840347965938, np.float64(16143.570758867212))), (4, (121275247.72856577, np.float64(231897601.15814963))), (6, (9.076034067369386, np.float64(5.298563791442084))), (8, (123.35245169147989, np.float64(90.51210108183089))), (10, (2265313681.7355, np.float64(1295737133.3017318))), (1, (21567.839327212547, np.float64(24184.80186045207))), (3, (60997.64689050441, np.float64(132772.42926971006))), (5, (1

In [6]:
def normalize (RDD_Xy): 
    '''Arguments: 
    RDD_Xy is an RDD containing data examples. Each record of the RDD is a tuple 
    (X,y). 
    “X” is an array containing the 11 features (float number) of an example 
    “y” is the label of the example (integer 0/1)  
    Returns: 
    An RDD rescaled to N(0,1) in each column (mean=0, standard deviation=1) '''
    def map_normalize (RDD_Xy): 
        result = []
        x, y = RDD_Xy
        var = broadcast_var.value
        for i, x in enumerate(x):
             mean_aux, std_aux = var[i]
             if(std_aux!=0):
                 result.append((x - mean_aux)/std_aux)
             else:
                 result.append(0.0)
        return result, y
        
    rdd_norm = RDD_Xy.map(map_normalize)
    return rdd_norm

In [7]:
rdd_norm = normalize(data)
print(rdd_norm.take(2))

[([np.float64(1.4128166757913607), np.float64(-0.7595607144625546), np.float64(-0.4120426820452796), np.float64(-0.45940747807420984), np.float64(1.3958093448107236), np.float64(-0.3538621824769221), np.float64(0.7405716114816884), np.float64(-0.8945780973436418), np.float64(-0.4016308291135645), np.float64(-2.9103919941544856), np.float64(0.15670980463949324)], 0.0), ([np.float64(-0.7986730576830795), np.float64(-0.8914623101952646), np.float64(3.640344469496744), np.float64(-0.45940747809059307), np.float64(-0.5229686944711893), np.float64(-0.35387136240303985), np.float64(0.551841224079898), np.float64(-0.8945781049085767), np.float64(-1.2744422605302952), np.float64(0.4762025966160194), np.float64(0.156709803079282)], 1.0)]


In [8]:
# QUITAR: COMPROBACION SI DA 1 PARA CADA INDICE LA SUMA
import math
# comprobar que cada columna queda ~ media 0 y std 1
x_norm = rdd_norm.map(lambda xy: xy[0])

check = (
    x_norm
    .flatMap(lambda row: [(i, (v, v*v, 1)) for i, v in enumerate(row)])
    .reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1], a[2]+b[2]))
    .mapValues(lambda t: (
        t[0]/t[2],  # mean
        math.sqrt(max((t[1]/t[2]) - (t[0]/t[2])**2, 0.0))  # std
    ))
    .collect()
)

print(sorted(check, key=lambda x: x[0]))
# medias ~ 0, std ~ 1 (salvo redondeo numérico)

[(0, (np.float64(1.737507915322567e-14), 1.0000000000000009)), (1, (np.float64(9.445244586459011e-15), 0.999999999999989)), (2, (np.float64(-1.3302070556164836e-14), 0.9999999999999883)), (3, (np.float64(2.3310064989345848e-14), 0.9999999999999803)), (4, (np.float64(-1.3432810419544694e-15), 0.9999999999999976)), (5, (np.float64(-4.6409098786170944e-15), 1.0000000000000069)), (6, (np.float64(1.645616976020392e-15), 0.999999999999988)), (7, (np.float64(-1.1839595970286609e-14), 1.000000000000001)), (8, (np.float64(3.481659405224491e-17), 0.99999999999998)), (9, (np.float64(2.1675106154361856e-14), 0.9999999999998741)), (10, (np.float64(-2.4653701302668196e-14), 1.0000000000000524))]


In [13]:
# PRIMERA PRUEBA TRAIN
import numpy as np
import math

def sigmoid(z):
    if z >= 0:
        ez = math.exp(-z)
        return 1.0 / (1.0 + ez)
    else:
        ez = math.exp(z)
        return ez / (1.0 + ez)

def _sample_grad(xy, w, b):
    X, y = xy
    X = np.asarray(X, dtype=np.float64)
    y = float(y)

    z = float(np.dot(w, X) + b)
    y_hat = sigmoid(z)
    diff = y_hat - y

    grad_w = diff * X      # vector (11,)
    grad_b = diff          # escalar

    loss = -(y * math.log(y_hat)+(1-y)*math.log(1-y_hat))
    return (grad_w, grad_b, loss)

def train(RDD_Xy, iterations, learning_rate, lambda_reg):
    '''Arguments: 
    RDD_Xy --- RDD containing data examples. Each record of the RDD is a tuple 
    (X,y). 
    “X” is an array containing the 11 features (float number) of an example 
    “y” is the label of the example (integer 0/1)  
    iterations -- number of iterations of the optimization loop 
    learning_rate -- learning rate of the gradient descent 
    lambda_reg – regularization rate: l2 es el que vamos a aplicar
    
    Returns: 
    A list or array containing the weights “w” and bias “b”	at the end of the 
    training process'''	

    sc = RDD_Xy.context
    data = RDD_Xy.cache()

    m = data.count()
    if m == 0:
        raise ValueError("RDD_Xy vacío")

    k = len(data.first()[0])  # 11
    #cost=[]
    # inicialización
    rng = np.random.default_rng(42)
    w = rng.normal(0, 0.01, size=k).astype(np.float64)
    b = float(rng.normal(0, 0.01))
    for i in range(iterations):
        bc_w = sc.broadcast(w)
        bc_b = sc.broadcast(b)

        # suma de gradientes por todo el dataset
        sum_grad_w, sum_grad_b, sum_loss = data.map(
            lambda xy: _sample_grad(xy, bc_w.value, bc_b.value)
        ).reduce(
            lambda a, c: (a[0] + c[0],a[1] + c[1], a[2] + c[2])
        )

        bc_w.unpersist()
        bc_b.unpersist()

        # promedio + L2
        grad_w = (sum_grad_w / m) + (lambda_reg / k) * w   # si tu rúbrica usa /k
        grad_b = (sum_grad_b / m)
        
        # update
        w = w - learning_rate * grad_w
        b = b - learning_rate * grad_b
        reg_term = (lambda_reg / (2*k)) * float(np.dot(w,w))
        J = (sum_loss / m ) + reg_term
        #cost[i]=J
        
        print(f"Loss en iteracion {i}: {J}")

    return [w, b]

In [14]:
def accuracy (w, b, RDD_Xy): 
    '''Arguments: 
    w -- weights 
    b -- bias 
    RDD_Xy – RDD containing examples to be predicted  
    Returns: 
    accuracy -- the number of predictions that are correct divided by the number         
    of records (examples) in RDD_xy.  
    Predict function can be used for predicting a single example'''
    pred_ok = RDD_Xy.map(
            lambda xy: 1 if predict(w, b, xy[0]) == int(xy[1]) else 0
        )
    
    correct = pred_ok.reduce(lambda a, c: a + c)
    total = RDD_Xy.count()
    return correct / total if total > 0 else 0.0

In [15]:
def predict (w, b, X): 
    '''Arguments: 
    w -- weights 
    b -- bias 
    X – Example to be predicted  
     
    Returns: 
    Y_pred – a value (0/1) corresponding to the prediction of X '''
    threshold=0.5
    z = float(np.dot(np.asarray(w, dtype=float), np.asarray(X, dtype=float)) + float(b))
    p = sigmoid(z)
    return 1 if p >= threshold else 0

In [16]:
# read data
data=readFile("./botnet_tot_syn_l_3.csv")
# standarize
data = normalize(data)

ws = train(data,20,5,1.5)
print(ws)
w,b = ws
acc = accuracy(w,b,data)
print("acc:",acc)

26/02/18 19:13:57 WARN BlockManager: Task 43 already completed, not releasing lock for rdd_25_0
                                                                                

Loss en iteracion 0: 1.1773034242148737
Loss en iteracion 1: 0.3237438545305561
Loss en iteracion 2: 0.5389194864123462
Loss en iteracion 3: 0.3629788586171003
Loss en iteracion 4: 0.5102610587597608
Loss en iteracion 5: 0.36875847433671377
Loss en iteracion 6: 0.4941290525559061
Loss en iteracion 7: 0.37244209516993365
Loss en iteracion 8: 0.4834929427127869
Loss en iteracion 9: 0.3753502590459937
Loss en iteracion 10: 0.47577420534518844
Loss en iteracion 11: 0.37787589774353636
Loss en iteracion 12: 0.4698760928829802
Loss en iteracion 13: 0.38015241834542696
Loss en iteracion 14: 0.4652373565093517
Loss en iteracion 15: 0.38224327063321406
Loss en iteracion 16: 0.46153221522031307
Loss en iteracion 17: 0.3841895214635521
Loss en iteracion 18: 0.45855503728238484
Loss en iteracion 19: 0.386022742624327
[array([-0.08318148, -0.37121413, -0.16567924, -0.1438316 , -0.61248303,
        0.39552139,  0.09157951, -0.52443077,  0.23814446,  0.31961884,
        0.22016539]), -0.0606372660256

In [17]:
sc.stop()