In [1]:
import pydeequ
from pyspark.sql import SparkSession, Row

# Build (or reuse) the Spark session used by every later cell.
# pydeequ supplies the Maven coordinates passed to both settings: the package
# to load via spark.jars.packages and the artifact to exclude via
# spark.jars.excludes. `pydeequ` must be imported here, otherwise this cell
# fails with NameError on a fresh kernel.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

In [34]:
# Load the CSV, treating its first row as column names. No schema is supplied,
# so every column is read as a string (see the printed schema below).
df = spark.read.option("header", True).csv("train.csv")

In [37]:
# Print the schema tree of df to check which columns were loaded and with what types.
df.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [42]:
# Import only the profiler entry point the notebook uses, instead of a wildcard
# import that hides where names come from.
from pydeequ.profiles import ColumnProfilerRunner

In [39]:
# Profile the columns of df; the per-column profiles are read from
# `result.profiles` in a later cell.
profiler = ColumnProfilerRunner(spark).onData(df)
result = profiler.run()

In [16]:
# Look up the entry stored under 'StandardColumnProfile' in the result's
# columnProfileClasses; the cell output shows which class it refers to.
result.columnProfileClasses['StandardColumnProfile']

pydeequ.profiles.StandardColumnProfile

In [20]:
# Look up the entry stored under 'NumericColumnProfile' in the result's
# columnProfileClasses; the cell output shows which class it refers to.
result.columnProfileClasses['NumericColumnProfile']

pydeequ.profiles.NumericColumnProfile

In [41]:
# Fields printed for every column profile, as (label, attribute name) pairs.
COMMON_FIELDS = (
    ('completeness', 'completeness'),
    ('approximate number of distinct values', 'approximateNumDistinctValues'),
    ('datatype', 'dataType'),
)
# Extra statistics printed only when the profile's data type is numeric.
NUMERIC_FIELDS = (
    ('minimum', 'minimum'),
    ('maximum', 'maximum'),
    ('mean', 'mean'),
    ('standard deviation', 'stdDev'),
)
NUMERIC_TYPES = ('Integral', 'Fractional')

# Print a short, tab-indented report for each profiled column.
for col, profile in result.profiles.items():
    print(f"Column '{col}'")
    for label, attr in COMMON_FIELDS:
        print('\t', f'{label}: {getattr(profile, attr)}')
    if profile.dataType in NUMERIC_TYPES:
        for label, attr in NUMERIC_FIELDS:
            print('\t', f'{label}: {getattr(profile, attr)}')

Column 'PassengerId'
	 completeness: 1.0
	 approximate number of distinct values: 888
	 datatype: Integral
	 minimum: 1.0
	 maximum: 891.0
	 mean: 446.0
	 standard deviation: 257.20938292890224
Column 'Name'
	 completeness: 1.0
	 approximate number of distinct values: 936
	 datatype: String
Column 'Ticket'
	 completeness: 1.0
	 approximate number of distinct values: 710
	 datatype: String
Column 'Pclass'
	 completeness: 1.0
	 approximate number of distinct values: 3
	 datatype: Integral
	 minimum: 1.0
	 maximum: 3.0
	 mean: 2.308641975308642
	 standard deviation: 0.8356019334795166
Column 'Parch'
	 completeness: 1.0
	 approximate number of distinct values: 7
	 datatype: Integral
	 minimum: 0.0
	 maximum: 6.0
	 mean: 0.38159371492704824
	 standard deviation: 0.8056047612452213
Column 'Embarked'
	 completeness: 0.9977553310886644
	 approximate number of distinct values: 3
	 datatype: String
Column 'Age'
	 completeness: 0.8013468013468014
	 approximate number of distinct values: 83
	 data

In [46]:
from pydeequ.analyzers import (
    AnalysisRunner,
    AnalyzerContext,
    Completeness,
    CountDistinct,
    DataType,
    Maximum,
)

# Run the same four analyzers against each column of df in turn and print the
# successfully computed metrics for that column as a pandas DataFrame.
#
# NOTE(review): the captured output for the columns shown contains no Maximum
# metric. Presumably Maximum needs a numeric column and is dropped from the
# success metrics because every column was loaded as a string -- confirm, and
# consider loading the CSV with typed columns if Maximum is wanted.
for column_name in df.columns:
    analysis_result = (
        AnalysisRunner(spark)
        .onData(df)
        .addAnalyzer(Completeness(column_name))
        .addAnalyzer(CountDistinct(column_name))
        .addAnalyzer(Maximum(column_name))
        .addAnalyzer(DataType(column_name))
        .run()
    )
    metrics_pdf = AnalyzerContext.successMetricsAsDataFrame(
        spark, analysis_result, pandas=True
    )
    print(metrics_pdf)

    entity     instance                        name  value
0   Column  PassengerId                Completeness    1.0
1   Column  PassengerId              Histogram.bins    5.0
2   Column  PassengerId       Histogram.abs.Boolean    0.0
3   Column  PassengerId     Histogram.ratio.Boolean    0.0
4   Column  PassengerId    Histogram.abs.Fractional    0.0
5   Column  PassengerId  Histogram.ratio.Fractional    0.0
6   Column  PassengerId      Histogram.abs.Integral  891.0
7   Column  PassengerId    Histogram.ratio.Integral    1.0
8   Column  PassengerId       Histogram.abs.Unknown    0.0
9   Column  PassengerId     Histogram.ratio.Unknown    0.0
10  Column  PassengerId        Histogram.abs.String    0.0
11  Column  PassengerId      Histogram.ratio.String    0.0
12  Column  PassengerId               CountDistinct  891.0
    entity  instance                        name  value
0   Column  Survived                Completeness    1.0
1   Column  Survived              Histogram.bins    5.0
2   Co

    entity  instance                        name       value
0   Column  Embarked                Completeness    0.997755
1   Column  Embarked              Histogram.bins    5.000000
2   Column  Embarked       Histogram.abs.Boolean    0.000000
3   Column  Embarked     Histogram.ratio.Boolean    0.000000
4   Column  Embarked    Histogram.abs.Fractional    0.000000
5   Column  Embarked  Histogram.ratio.Fractional    0.000000
6   Column  Embarked      Histogram.abs.Integral    0.000000
7   Column  Embarked    Histogram.ratio.Integral    0.000000
8   Column  Embarked       Histogram.abs.Unknown    2.000000
9   Column  Embarked     Histogram.ratio.Unknown    0.002245
10  Column  Embarked        Histogram.abs.String  889.000000
11  Column  Embarked      Histogram.ratio.String    0.997755
12  Column  Embarked               CountDistinct    3.000000


In [47]:
# Stop the Spark session created in the first cell now that the analysis is finished.
spark.stop()