Installing the dependencies for Google Colab

In [None]:
# Install the headless OpenJDK 8 package; stdout is discarded to keep the cell quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

Be careful with the Spark version: the release downloaded here must match the directory name used for `SPARK_HOME` below.

In [None]:
!wget -q https://dlcdn.apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz
!tar xf spark-3.0.3-bin-hadoop2.7.tgz

In [None]:
!pip install -q findspark

In [None]:
# Environment variables
import os

# Locations of the Java 8 JDK and of the unpacked Spark distribution
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.3-bin-hadoop2.7",
})

Starting a pyspark session

In [None]:
import findspark
# Make the pyspark package importable in this kernel. Called without arguments,
# findspark presumably locates Spark through the SPARK_HOME variable set earlier
# in this notebook -- TODO confirm.
findspark.init()

In [None]:
# Python interpreter passed to Spark through `spark.pyspark.python`
pyspark_python = "/usr/local/bin/python"

from pyspark.sql import SparkSession

# Create (or reuse) the local-mode session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("2-Feature-Preparation")
    .master("local")
    .config("spark.pyspark.python", pyspark_python)
    .getOrCreate()
)

In [None]:
# Display the session object to confirm that it was created
spark

Reading the data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used in this notebook live under it
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory (on the mounted Google Drive) holding the Parquet dataset
dataset_path = "/content/drive/MyDrive/UCSP/Big-Data/Final-project/data-processed"

# Load the Parquet files into a Spark DataFrame
parquet_reader = spark.read.format("parquet")
data = parquet_reader.load(dataset_path)

# Total number of rows; reused afterwards to compute the per-class fractions
events = data.count()
print("There are {} events".format(events))

There are 18045 events


In [None]:
# Human-readable class names, indexed by the integer stored in the `label` column
labels = ['QCD', 'tt', 'W+jets']
# One (label, count) row per class that is actually present in the data
counts = data.groupBy('label').count().collect()

# label value -> number of events. Iterating over the rows that came back,
# instead of assuming there are exactly three, avoids an IndexError when a
# class is missing from the dataset.
events_per_label = {label_id: n_events for label_id, n_events in counts}

print('There are:')
for label_id, n_events in counts:
    print('\t* {} {} events (frac = {:.3f})'
          .format(n_events, labels[label_id], n_events / events))

# Per-class totals used later for the undersampling; 0 when the class is absent
qcd_events = events_per_label.get(0, 0)
tt_events = events_per_label.get(1, 0)
wjets_events = events_per_label.get(2, 0)

There are:
	* 4101 tt events (frac = 0.227)
	* 13647 W+jets events (frac = 0.756)
	* 297 QCD events (frac = 0.016)


Feature preparation

In [None]:
# Inspect the column names and types of the loaded DataFrame
data.printSchema()

root
 |-- hfeatures: vector (nullable = true)
 |-- lfeatures: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)
 |-- label: integer (nullable = true)



In [None]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

def _to_dense(vec):
    """Return the content of `vec` as a dense ML vector."""
    return Vectors.dense(vec)


# UDF wrapper so the conversion can be applied to a DataFrame column
vector_dense_udf = udf(_to_dense, VectorUDT())

# Add a dense copy of the `hfeatures` column
data = data.withColumn('hfeatures_dense', vector_dense_udf('hfeatures'))

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import MinMaxScaler

## One-Hot-Encode
# Use OneHotEncoderEstimator for Spark 2.x and OneHotEncoder for Spark 3.x
encoder = OneHotEncoder(inputCols=["label"],
                        outputCols=["encoded_label"],
                        dropLast=False)

## Scale feature vector
scaler = MinMaxScaler(inputCol="hfeatures_dense",
                      outputCol="HLF_input")

pipeline = Pipeline(stages=[encoder, scaler])

%time fitted_pipeline = pipeline.fit(data)

CPU times: user 68.4 ms, sys: 10.4 ms, total: 78.8 ms
Wall time: 11.3 s


In [None]:
# Apply the fitted pipeline to data: this adds the `encoded_label` and
# `HLF_input` columns.
# NOTE(review): `data` is rebound to the transformed DataFrame, so re-running
# this cell on its own will presumably fail because the output columns already
# exist -- re-run from the cell that loads the data instead.
data = fitted_pipeline.transform(data)

In [None]:
# Check that the pipeline output columns are now part of the schema
data.printSchema()

root
 |-- hfeatures: vector (nullable = true)
 |-- lfeatures: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)
 |-- label: integer (nullable = true)
 |-- hfeatures_dense: vector (nullable = true)
 |-- encoded_label: vector (nullable = true)
 |-- HLF_input: vector (nullable = true)



In [None]:
import math

class lepAngularCoordinates():
    """
    Holds the (eta, phi) coordinates of the lepton and measures the
    angular distance DeltaR between it and any other particle.
    """
    def __init__(self, eta, phi):
        self.Eta, self.Phi = eta, phi

    def DeltaR(self, eta, phi):
        """
        Return sqrt(deta^2 + dphi^2) between the stored lepton and the
        point (eta, phi), with dphi wrapped into [-pi, pi].
        """
        two_pi = 2*math.pi
        delta_eta = self.Eta - eta
        delta_phi = self.Phi - phi

        # Bring the azimuthal difference back into [-pi, pi]
        while delta_phi > math.pi:
            delta_phi -= two_pi
        while delta_phi < -math.pi:
            delta_phi += two_pi

        return math.sqrt(delta_eta*delta_eta + delta_phi*delta_phi)

In [None]:
from pyspark.sql.types import ArrayType, DoubleType
from sklearn.preprocessing import StandardScaler

@udf(returnType=ArrayType(ArrayType(DoubleType())))
def transform(particles):
    """
    Order the particles of one event by their distance from the isolated
    lepton and standardize their features.

    Parameters
    ----------
    particles : list of list of float, or None
        Features of the event, one inner list per particle. The isolated
        lepton is expected to be the first particle; entries 5 and 6 of each
        particle are passed as (eta, phi) to lepAngularCoordinates.

    Returns
    -------
    list of list of float, or None
        The particles sorted by decreasing DeltaR from the isolated lepton,
        with every feature column standardized by a StandardScaler fitted on
        this event only. A null or empty input is returned unchanged.
    """
    ## The input column is nullable: pass null/empty events through instead
    ## of failing the whole job on particles[0]
    if not particles:
        return particles

    ## The isolated lepton is the first particle in the list
    ISOlep = lepAngularCoordinates(particles[0][5], particles[0][6])

    ## Sort the particles based on the distance from the isolated lepton,
    ## farthest first (a sorted copy is used so the input list is not mutated)
    ordered = sorted(particles,
                     key=lambda part: ISOlep.DeltaR(part[5], part[6]),
                     reverse=True)

    ## Standardize
    return StandardScaler().fit_transform(ordered).tolist()

In [None]:
# Apply the `transform` UDF to each event's `lfeatures` list and store the
# result in a new `GRU_input` column
data = data.withColumn('GRU_input', transform('lfeatures'))

In [None]:
# Check that `GRU_input` was added with the expected nested-array type
data.printSchema()

root
 |-- hfeatures: vector (nullable = true)
 |-- lfeatures: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)
 |-- label: integer (nullable = true)
 |-- hfeatures_dense: vector (nullable = true)
 |-- encoded_label: vector (nullable = true)
 |-- HLF_input: vector (nullable = true)
 |-- GRU_input: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)



Undersample the dataset

In [None]:
# One DataFrame per class, selected by the integer value of `label`
qcd = data.where('label=0')      # QCD
tt = data.where('label=1')       # tt
wjets = data.where('label=2')    # W+jets

In [None]:
# Create the undersampled dataframes: the two larger classes are sampled down
# to roughly the size of the QCD class (the fraction is a per-row probability,
# so the resulting counts are approximate).
# False means to sample without repetition.
# A fixed seed makes the selection repeatable across runs; it is the same
# value already used for the train/test split and the shuffle.
tt = tt.sample(False, qcd_events*1.0/tt_events, seed=42)
wjets = wjets.sample(False, qcd_events*1.0/wjets_events, seed=42)

# Stack the three classes back into a single DataFrame
dataUndersampled = qcd.union(tt).union(wjets)

In [None]:
# Check that the three classes now have comparable sizes
dataUndersampled.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1|  292|
|    2|  328|
|    0|  297|
+-----+-----+



Split into train and test sets, then shuffle the training set

In [None]:
from pyspark.sql.functions import rand 
# 80/20 train/test split with a fixed seed
trainUndersampled, testUndersampled = dataUndersampled.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

# Shuffle the training rows by sorting on a seeded random column
shuffle_key = rand(seed=42)
trainUndersampled = trainUndersampled.orderBy(shuffle_key)

Save the dataset as Apache Parquet files

In [None]:
PATH = "/content/drive/MyDrive/UCSP/Big-Data/Final-project/data-for-training/"

numTestPartitions = 1

%time testUndersampled.coalesce(numTestPartitions).write.parquet(PATH + 'testUndersampled.parquet')

CPU times: user 2.93 s, sys: 338 ms, total: 3.27 s
Wall time: 10min 43s


In [None]:
numTrainPartitions = 1

%time trainUndersampled.coalesce(numTrainPartitions).write.parquet(PATH + 'trainUndersampled.parquet')

CPU times: user 3.67 s, sys: 464 ms, total: 4.14 s
Wall time: 13min 28s


In [None]:
# Stop the Spark session at the end of the notebook
spark.stop()