In [43]:
from pyspark.sql import SparkSession
import os
import subprocess
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, StringType
import pyspark.pandas as ps
import numpy as np
import pyspark.ml
import pandas as pd

In [2]:
spark = SparkSession.builder.master("local[4]").appName("TransformPandas").config("spark.ui.port", '4050').getOrCreate()
sc = spark.sparkContext

24/12/18 17:50:27 WARN Utils: Your hostname, Jacobs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.104 instead (on interface en0)
24/12/18 17:50:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/18 17:50:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
fileName = subprocess.check_output('ls extractedDataset | grep ".*.csv"', shell=True, text=True).removesuffix('\n')
path = os.getcwd() + "/extractedDataset/" + fileName

In [4]:
df = ps.read_csv(path, index_col=None)



In [5]:
df.head()

Unnamed: 0,accessionYear,artistGender,artistNationality,culture,department,isHighlight,isTimelineWork,objectEndDate,objectID,objectName
0,1979,,American,,The American Wing,False,False,1907,14,Coin
1,1989,,American,American,The American Wing,False,False,1814,108,Andiron
2,1946,,,,The American Wing,False,False,1890,366,Basket
3,1946,,,American,The American Wing,False,False,1870,540,Bitters bottle
4,1937,,,American,The American Wing,False,False,1800,653,Bottle fragment


In [6]:
df.describe()

Unnamed: 0,objectEndDate,objectID
count,5048.0,5048.0
mean,1411.472464,403327.783479
std,982.326369,246533.23984
min,-5000.0,14.0
25%,1596.0,217951.0
50%,1843.0,383850.0
75%,1909.0,591846.0
max,2038.0,920333.0


In [7]:
def genderColumn(df):
    #change nulls to male, just makes it easier to detect if the gender is not female when doing the mapping
    df['artistGender'] = df['artistGender'].fillna(value='male')
    df['isFemale'] = df['artistGender'].map(lambda x: 0 if x=='male' else 1)
    df = df.drop(columns=['artistGender'])
    return df

In [8]:
def makeDepartmentDummies(df: ps.DataFrame):
    dummies = ps.get_dummies(df['department'], prefix="department")
    df = df.drop(columns=['department'])
    df = ps.concat([df, dummies], axis='columns')
    return df

In [9]:
# Function that applies all of the transformations at once
def transformDataset(df):
    # in the database, they only store a value in the gender field if the artist is female
    # otherwise it is null
    # if there are multiple artists attributed to an object, the gender of each artist is shown, separated by a '|'
    # I'm going to simplify this a bit by just having this column equal to 1 if there is at least 1 female artist attributed
    df = genderColumn(df)
    # change the department column to a series of dummy variables
    # since there are a small number of distinct departments (X)
    df = makeDepartmentDummies(df)
    # change isHighlight and isTimelineWork to integers
    df['isTimelineWork'] = df['isTimelineWork'].map(lambda x: int(x))
    df['isHighlight'] = df['isHighlight'].map(lambda x: int(x))
    return df

In [10]:
mod_df = transformDataset(df)
mod_df.head(10)

24/12/18 17:51:05 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Unnamed: 0,accessionYear,artistNationality,culture,isHighlight,isTimelineWork,objectEndDate,objectID,objectName,isFemale,department_Ancient Near Eastern Art,department_Arms and Armor,department_Asian Art,department_Costume Institute,department_Drawings and Prints,department_Egyptian Art,department_European Paintings,department_European Sculpture and Decorative Arts,department_Greek and Roman Art,department_Islamic Art,department_Medieval Art,department_Modern and Contemporary Art,department_Musical Instruments,department_Photographs,department_Robert Lehman Collection,department_The American Wing,department_The Cloisters,department_The Libraries,department_The Michael C. Rockefeller Wing
0,1979,American,,0,0,1907,14,Coin,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1989,American,American,0,0,1814,108,Andiron,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,1946,,,0,0,1890,366,Basket,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,1946,,American,0,0,1870,540,Bitters bottle,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,1937,,American,0,0,1800,653,Bottle fragment,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
5,1946,American,American,0,0,1880,779,Bowl,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
6,1910,,"Chinese, for American market",0,0,1790,856,Bowl,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
7,1910,,"Chinese, for American market",0,0,1790,867,Bowl,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
8,1917,,Mexican,0,0,1845,895,Cup,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
9,1883,,,0,0,1800,906,Box,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [76]:
from pyspark.ml.feature import VectorAssembler
sparkDF2 = spark.createDataFrame(mod_df)
# columns = mod_df.columns.to_list()
# features = columns.remove('isHighlight')
# print(features)
# vec = VectorAssembler(inputCols=features, outputCol='isHighlight')
# vec_x = vec.transform(mod_df)


PySparkTypeError: [CANNOT_INFER_SCHEMA_FOR_TYPE] Can not infer schema for type: `str`.