In [169]:
from pyspark.sql import SparkSession
import os
import subprocess
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
import pyspark.pandas as ps



In [3]:
# Build (or reuse, if one is already running) a local Spark session named
# "Transform" on master local[4], with the Spark UI port set to 4050.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Transform")
    .config("spark.ui.port", '4050')
    .getOrCreate()
)
# SparkContext backing the session above.
sc = spark.sparkContext

24/12/18 12:09:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [32]:
# Locate the extracted CSV without shelling out. The previous
# `ls extractedDataset | grep ".*.csv"` pipeline matched any file name that
# merely contained "csv", produced a newline-joined (invalid) path when several
# files matched, and raised an opaque CalledProcessError when none matched.
datasetDir = os.path.join(os.getcwd(), "extractedDataset")
csvFiles = sorted(name for name in os.listdir(datasetDir) if name.endswith(".csv"))
if len(csvFiles) != 1:
    # Fail early with a readable message instead of handing Spark a bad path.
    raise FileNotFoundError(
        f"expected exactly one .csv file in {datasetDir}, found {len(csvFiles)}: {csvFiles}"
    )
fileName = csvFiles[0]
path = os.path.join(datasetDir, fileName)
# header=True: the first row holds the column names.
# nullValue='': empty fields are loaded as NULL.
df = spark.read.csv(path, header=True, nullValue='')

In [35]:
# DataFrame.show() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.show()

+-------------+------------+-----------------+--------------------+-----------------+-----------+--------------+-------------+--------+---------------+
|accessionYear|artistGender|artistNationality|             culture|       department|isHighlight|isTimelineWork|objectEndDate|objectID|     objectName|
+-------------+------------+-----------------+--------------------+-----------------+-----------+--------------+-------------+--------+---------------+
|         1979|        NULL|         American|                NULL|The American Wing|      false|         false|         1907|      14|           Coin|
|         1989|        NULL|         American|            American|The American Wing|      false|         false|         1814|     108|        Andiron|
|         1946|        NULL|             NULL|                NULL|The American Wing|      false|         false|         1890|     366|         Basket|
|         1946|        NULL|             NULL|            American|The American Wing|   

In [37]:
# Summary statistics per column. show() prints the table and returns None, so
# the surrounding print() only added a stray "None" line.
df.describe().show()

24/12/18 13:49:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+------------------+------------+-----------------+-----------------+--------------------+-----------+--------------+------------------+------------------+--------------------+
|summary|     accessionYear|artistGender|artistNationality|          culture|          department|isHighlight|isTimelineWork|     objectEndDate|          objectID|          objectName|
+-------+------------------+------------+-----------------+-----------------+--------------------+-----------+--------------+------------------+------------------+--------------------+
|  count|              5028|         128|             2671|             1690|                5048|       5048|          5048|              5048|              5048|                5027|
|   mean|1956.0318915686664|        NULL|             NULL|             NULL|                NULL|       NULL|          NULL|1411.4724643423137| 403327.7834786054|                NULL|
| stddev| 35.73250594529043|        NULL|             NULL|             NUL

In [None]:
# Custom function to convert a gender value to a flag indicating if the artist is female.
# 0=Male, 1=Female
# In the database, they only store a value in the gender field if the artist is female;
# otherwise it is null.
def convertGender(s):
    """Map one artistGender value to an integer flag.

    Args:
        s: Raw artistGender value. None (how a Spark NULL arrives inside a
            Python UDF) and the placeholder string 'male' both mean "not female".

    Returns:
        0 when ``s`` is None or 'male', 1 for any other value.
    """
    # A missing value must count as male; previously None fell through to the
    # female branch and was reported as 1.
    if s is None or s == 'male':
        return 0
    return 1
# Expose convertGender as a Spark UDF whose result column is an integer.
convertGenderUDF = udf(convertGender, IntegerType())

# Take the artistGender column and replace it with an integer isFemale column.
def genderColumn(df):
    """Replace ``artistGender`` with an integer ``isFemale`` flag column.

    A row gets 1 when artistGender holds any value other than the literal
    'male', and 0 when it is NULL or 'male'.

    Args:
        df: Spark DataFrame containing an ``artistGender`` column.

    Returns:
        A new DataFrame with ``isFemale`` appended and ``artistGender`` removed.
    """
    genderValue = col("artistGender")
    # Native column expression instead of na.fill + a Python UDF: NULLs are
    # detected directly with isNotNull(), so no 'male' placeholder is needed
    # and rows are not shipped to a Python worker.
    # FALSE AND NULL evaluates to FALSE in Spark SQL, so a NULL gender yields 0
    # rather than NULL.
    isFemale = (genderValue.isNotNull() & (genderValue != 'male')).cast(IntegerType())
    return df.withColumn("isFemale", isFemale).drop("artistGender")

In [None]:
# Create dummy variables for the department column:
# one new column is added per department, holding 1 if the object is in that
# department and 0 otherwise.
def departmentColumn(df):
    """Add one 0/1 indicator column per distinct non-NULL department.

    The function previously had no body, so its cell could not be executed.

    Args:
        df: Spark DataFrame containing a ``department`` column.

    Returns:
        A new DataFrame with every original column plus one integer column per
        department, named ``department_<name>`` where characters of the name
        that are not alphanumeric are replaced by ``_``. Rows whose department
        is NULL get 0 in every indicator column.
    """
    # NOTE(review): the original department column is kept; drop it at the call
    # site if the dummies are meant to replace it.
    departments = sorted(
        row[0]
        for row in df.select("department").distinct().collect()
        if row[0] is not None
    )
    indicators = []
    for department in departments:
        # Department names contain spaces and dots; a dot in a column name is
        # parsed by Spark as nested-field access, so sanitize the name.
        safeName = "".join(ch if ch.isalnum() else "_" for ch in department)
        # eqNullSafe makes a NULL department compare as False (-> 0), not NULL.
        indicator = col("department").eqNullSafe(department).cast(IntegerType())
        indicators.append(indicator.alias("department_" + safeName))
    return df.select("*", *indicators)


In [None]:
def transformDataset(df):
    """Apply the dataset transformations and return the resulting DataFrame.

    At present the only step is genderColumn, which swaps artistGender for an
    isFemale flag.

    Author's notes on the source data:
      * the gender field only holds a value when the artist is female,
        otherwise it is null;
      * when several artists are attributed to an object, the gender of each
        artist is shown, separated by a '|'.
    """
    # NOTE(review): converting the department column into dummy variables
    # (there are only a small number of distinct departments) is planned but
    # not applied here yet.
    return genderColumn(df)

In [155]:
# Run the transformations on the raw DataFrame and preview the result.
x = transformDataset(df)
x.show()

+-------------+-----------------+--------------------+-----------------+-----------+--------------+-------------+--------+---------------+--------+
|accessionYear|artistNationality|             culture|       department|isHighlight|isTimelineWork|objectEndDate|objectID|     objectName|isFemale|
+-------------+-----------------+--------------------+-----------------+-----------+--------------+-------------+--------+---------------+--------+
|         1979|         American|                NULL|The American Wing|      false|         false|         1907|      14|           Coin|       0|
|         1989|         American|            American|The American Wing|      false|         false|         1814|     108|        Andiron|       0|
|         1946|             NULL|                NULL|The American Wing|      false|         false|         1890|     366|         Basket|       0|
|         1946|             NULL|            American|The American Wing|      false|         false|         1870

In [163]:
# Sanity check: pull the isFemale flags out as an RDD of plain integers and
# add them up to get the number of rows flagged as female.
gender = x.select("isFemale").rdd.map(lambda row: row[0])
gender.reduce(lambda total, flag: total + flag)

128

In [165]:
# List the distinct values of the department column (NULL shows up as its own row).
x.select("department").distinct().show()

+--------------------+
|          department|
+--------------------+
|         Islamic Art|
|           Asian Art|
|      Arms and Armor|
|   Costume Institute|
|Ancient Near East...|
|        Medieval Art|
|   The American Wing|
|         Photographs|
| Greek and Roman Art|
|European Sculptur...|
|Robert Lehman Col...|
|       The Cloisters|
|The Michael C. Ro...|
| Drawings and Prints|
|        Egyptian Art|
| Musical Instruments|
|       The Libraries|
|Modern and Contem...|
|  European Paintings|
|                NULL|
+--------------------+

