# Processing Big Data - Deequ Analysis
A library for measuring the data quality of large datasets. 

## Import dependencies

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import pydeequ
from pydeequ.analyzers import *
from pydeequ.profiles import *
from pydeequ.suggestions import *
from pydeequ.checks import *
from pydeequ.verification import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DecimalType, DoubleType, IntegerType, DateType, NumericType, StructType, StringType, StructField

Please set env variable SPARK_VERSION


In [2]:
spark = (SparkSession
    .builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate())

## Read data into spark dataframe

In [18]:
#read parquet file
#file = 'train'

#df = spark.read.csv('train.csv')

df_pandas = pd.read_csv('train.csv')
df = spark.createDataFrame(df_pandas)

## **Run tests on the dataset**

## Test 1 - Null values ⛔️
Checking the data for completeness.

In [19]:
df.printSchema()

root
 |-- doi: string (nullable = true)
 |-- text_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- sdg: long (nullable = true)
 |-- labels_negative: long (nullable = true)
 |-- labels_positive: long (nullable = true)
 |-- agreement: double (nullable = true)
 |-- id: long (nullable = true)



In [20]:
#Checking for NULL values
checkResult = VerificationSuite(spark) \
                    .onData(df) \
                    .addCheck(
                    Check(spark,CheckLevel.Warning, "missing values")\
                    .isComplete('doi')\
                    .isComplete('text_id')\
                    .isComplete('text')\
                    .isComplete('sdg')\
                    .isComplete('labels_negative')\
                    .isComplete('labels_positive')\
                    .isComplete('agreement')\
                    .isComplete('id')\
                    .areComplete(df.columns))\
                    .run()

checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.toPandas()


Unnamed: 0,check,check_level,check_status,constraint,constraint_status,constraint_message
0,missing values,Warning,Success,"CompletenessConstraint(Completeness(doi,None))",Success,
1,missing values,Warning,Success,"CompletenessConstraint(Completeness(text_id,No...",Success,
2,missing values,Warning,Success,"CompletenessConstraint(Completeness(text,None))",Success,
3,missing values,Warning,Success,"CompletenessConstraint(Completeness(sdg,None))",Success,
4,missing values,Warning,Success,CompletenessConstraint(Completeness(labels_neg...,Success,
5,missing values,Warning,Success,CompletenessConstraint(Completeness(labels_pos...,Success,
6,missing values,Warning,Success,"CompletenessConstraint(Completeness(agreement,...",Success,
7,missing values,Warning,Success,"CompletenessConstraint(Completeness(id,None))",Success,
8,missing values,Warning,Success,ComplianceConstraint(Compliance(Combined Compl...,Success,


## Test 2 - Zero Values 🅾️

Checking for zero values within the dataset.

In [21]:
#TODO: Write your code here
checkResult = VerificationSuite(spark) \
                    .onData(df) \
                    .addCheck(
                    Check(spark,CheckLevel.Warning, "Non Zero Values")\
                    .satisfies("sdg == 0", "Zero values", lambda x: x==0)\
                    .satisfies("labels_negative == 0", "Zero values", lambda x: x==0)\
                    .satisfies("labels_positive == 0", "Zero values", lambda x: x==0)\
                    .satisfies("agreement == 0", "Zero values", lambda x: x==0)\
                    .satisfies("id == 0", "Zero values", lambda x: x==0)\
                    )\
                    .run()

checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.toPandas()

Unnamed: 0,check,check_level,check_status,constraint,constraint_status,constraint_message
0,Non Zero Values,Warning,Warning,"ComplianceConstraint(Compliance(Zero values,sd...",Success,
1,Non Zero Values,Warning,Warning,"ComplianceConstraint(Compliance(Zero values,la...",Failure,Value: 0.3996685168054271 does not meet the co...
2,Non Zero Values,Warning,Warning,"ComplianceConstraint(Compliance(Zero values,la...",Failure,Value: 0.024668516805427074 does not meet the ...
3,Non Zero Values,Warning,Warning,"ComplianceConstraint(Compliance(Zero values,ag...",Failure,Value: 0.023936170212765957 does not meet the ...
4,Non Zero Values,Warning,Warning,"ComplianceConstraint(Compliance(Zero values,id...",Success,


## Test 3 - Negative values ➖️
Checking that all values in the data are positive.


In [22]:
#TODO: Write your code here
#Check for Negative values
checkResult = VerificationSuite(spark) \
                    .onData(df) \
                    .addCheck(
                    Check(spark,CheckLevel.Warning, "Non Negative Values")\
                    .isNonNegative('sdg')\
                    .isNonNegative('labels_negative')\
                    .isNonNegative('labels_positive')\
                    .isNonNegative('agreement')\
                    .isNonNegative('id')\
                    )\
                    .run()

checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.toPandas()


Unnamed: 0,check,check_level,check_status,constraint,constraint_status,constraint_message
0,Non Negative Values,Warning,Success,ComplianceConstraint(Compliance(sdg is non-neg...,Success,
1,Non Negative Values,Warning,Success,ComplianceConstraint(Compliance(labels_negativ...,Success,
2,Non Negative Values,Warning,Success,ComplianceConstraint(Compliance(labels_positiv...,Success,
3,Non Negative Values,Warning,Success,ComplianceConstraint(Compliance(agreement is n...,Success,
4,Non Negative Values,Warning,Success,ComplianceConstraint(Compliance(id is non-nega...,Success,


## Test 4 - Determine Maximum Values ⚠️
We want to find the maximum values in the dataset for the numerical fields. Extremum values can often be used to define an upper bound for the column values so we can define them as the threshold values.

In [23]:
#TODO: Write your code here
num_cols = ['sdg','labels_negative','labels_positive','agreement','id']

result = ColumnProfilerRunner(spark) \
    .onData(df) \
    .run()

for col, profile in result.profiles.items():
    #print(col,profile)
  if col in num_cols:
       print(f'Column: \'{col}\'')
       print('\t',f'Maximum Value: {profile.maximum}')

Column: 'sdg'
	 Maximum Value: 15.0
Column: 'agreement'
	 Maximum Value: 1.0
Column: 'labels_positive'
	 Maximum Value: 925.0
Column: 'id'
	 Maximum Value: 25944.0
Column: 'labels_negative'
	 Maximum Value: 837.0


## Test 5 - Duplication 👥️
Lastly, we want to determine the uniqueness of the items found in the dataframe. 

The first thing to check will be if the primary key values within the dataset are unique - in our case, that will be a combination of the **text_id** and the **id**. Secondly, we want to check if the entries are all unique, which is done by checking for duplicates across that whole dataset.

In [24]:
#TODO: Write your code here
checkResult = VerificationSuite(spark) \
                    .onData(df) \
                    .addCheck(
                    Check(spark,CheckLevel.Error, "Unique Values")\
                    .hasUniqueness(("text_id","id"), lambda x: x == 1)\
                    )\
                    .run()

checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.toPandas()


Unnamed: 0,check,check_level,check_status,constraint,constraint_status,constraint_message
0,Unique Values,Error,Success,UniquenessConstraint(Uniqueness(Stream(text_id...,Success,
