# Importing Library

In [1]:
import csv

# About Artworks Data

## Museum of Modern Art Collection
The Museum of Modern Art (MoMA) acquired its first artworks in 1929, the year it was established. Today, the Museum’s evolving collection contains almost 200,000 works from around the world spanning the last 150 years. The collection includes an ever-expanding range of visual expression, including painting, sculpture, printmaking, drawing, photography, architecture, design, film, and media and performance art.

MoMA is committed to helping everyone understand, enjoy, and use our collection. The Museum’s website features 72,706 artworks from 20,956 artists. The artworks dataset contains 130,262 records, representing all of the works that have been accessioned into MoMA’s collection and cataloged in our database. It includes basic metadata for each work, including title, artist, date, medium, dimensions, and date acquired by the Museum. Some of these records have incomplete information and are noted as “not curator approved.” 

The follwoing work is down on the artiest dataset which contains 16,730 records, representing all the artists who have work in MoMA's collection and have been cataloged in MOMA's database. The dataset includes information about 'Title', 'Artist', 'Nationality', 'BeginDate', 'EndDate', 'Gender', 'Date' and 'Department'. It has been already enhanced from the dataset shared by MOMA which included basic metadata for each artist, including name, nationality, gender, birth year, and death year.

https://www.kaggle.com/datasets/momanyc/museum-collection?resource=download

# Loading Data

In [2]:
f = open("data.csv", encoding = "utf-8")

data = csv.reader(f)

In [3]:
moma = list(data)
len(moma)

16730

In [4]:
headers = moma[0]
print(headers)

['Title', 'Artist', 'Nationality', 'BeginDate', 'EndDate', 'Gender', 'Date', 'Department']


In [5]:
artworks = moma[1:]

### Getting Departments

In [6]:
depts = []

for row in artworks:
    dept = row[-1]
    depts.append(dept)

In [7]:
len(depts)

16729

In [8]:
unq_depts = list(set(depts))

In [9]:
print(unq_depts)

['Photography', 'Drawings', 'Painting & Sculpture', 'Media and Performance Art', 'Fluxus Collection', 'Architecture & Design', 'Prints & Illustrated Books', 'Film']


In [10]:
len(unq_depts)

8

### Getting Nationality and Gender of Artists

In [11]:
print(artworks[:3])

[['Dress MacLeod from Tartan Sets', 'Sarah Charlesworth', '(American)', '(1947)', '(2013)', '(Female)', '1986', 'Prints & Illustrated Books'], ['Duplicate of plate from folio 11 verso (supplementary suite, plate 4) from ARDICIA', 'Pablo Palazuelo', '(Spanish)', '(1916)', '(2007)', '(Male)', '1978', 'Prints & Illustrated Books'], ['Tailpiece (page 55) from SAGESSE', 'Maurice Denis', '(French)', '(1870)', '(1943)', '(Male)', '1889-1911', 'Prints & Illustrated Books']]


In [12]:
for row in artworks:
    row[2] = row[2].replace("(","").replace(")","")
    row[-3] = row[-3].replace("(","").replace(")","")

In [13]:
print(artworks[:3])

[['Dress MacLeod from Tartan Sets', 'Sarah Charlesworth', 'American', '(1947)', '(2013)', 'Female', '1986', 'Prints & Illustrated Books'], ['Duplicate of plate from folio 11 verso (supplementary suite, plate 4) from ARDICIA', 'Pablo Palazuelo', 'Spanish', '(1916)', '(2007)', 'Male', '1978', 'Prints & Illustrated Books'], ['Tailpiece (page 55) from SAGESSE', 'Maurice Denis', 'French', '(1870)', '(1943)', 'Male', '1889-1911', 'Prints & Illustrated Books']]


In [14]:
# checking for missing nationalities information
counts = 0
for row in artworks:
    
    if row[2] == "":
        counts+=1

print(f"{counts} Nationality records are missing")

494 Nationality records are missing


In [15]:
# checking for missing gender information
counts = 0
for row in artworks:
    
    if row[-3] == "":
        counts+=1
print(f"{counts} Gender records are missing")

794 Gender records are missing


In [16]:
# filling missing values
for row in artworks:
    if row[2] == "":
        row[2] = row[2].replace("", "Nationality Unknown")
    if row[-3] == "":
        row[-3] = row[-3].replace("", "Gender Unknown")

### Standardizing Date

In [17]:
# the function converts years in dataset
def convert_date(date):
    if date != "":
        date = date.replace("(","").replace(")","")
        date = int(date)
    return date

In [18]:
for row in artworks:
    row[3] = convert_date(row[3])
    row[4] = convert_date(row[4])

In [19]:
print(artworks[:3])

[['Dress MacLeod from Tartan Sets', 'Sarah Charlesworth', 'American', 1947, 2013, 'Female', '1986', 'Prints & Illustrated Books'], ['Duplicate of plate from folio 11 verso (supplementary suite, plate 4) from ARDICIA', 'Pablo Palazuelo', 'Spanish', 1916, 2007, 'Male', '1978', 'Prints & Illustrated Books'], ['Tailpiece (page 55) from SAGESSE', 'Maurice Denis', 'French', 1870, 1943, 'Male', '1889-1911', 'Prints & Illustrated Books']]


#### The dataset also contains dates which do not follow conventions

In [20]:
sub_dat = []

for row in artworks:
    sub_dat.append(row[-2])
    
unq_sub = list(set(sub_dat))

In [21]:
unq_sub = list(set(sub_dat))
len(unq_sub)

1154

In [22]:
print(unq_sub)

['1994', 'c. 1964-1965', '1914-1942', '1949-1952', '1900-1944', '1900-1938', '(1952)', '(c. 1972)', '1896', '(1903)', '(1902)', '(2007-2008)', '1878', '1974-1977', '1911-1931', '1996-1997', '1897-1902', '(1913)', '(c .1981)', '1936-1941', '1943-1944', 'c. 1909-1910', '1974-1975', '1970-1973', '1855', '1900-1936', '1945-1946', '1920-1922', '1964', 'c. 1962-1963', 'c. 1911', 'c.1941', 'c. 1855-1860', '1932-1958', 'c. 1979', '1990-1991', 'c. 1907', '1941', '1837-1852', '(c. 1917)', '1914-1916', 'c. 1927-1930', 'c. 1920-1924', '1901', '1877', '1906', '1885', '(1929)', '1930-1931', 'c. 1880', '1919-1923', '1949', '1997-2003', '2000', '1967', 'c. 1940', '(1916)', '1866', '1985-1987', '2002-2003', '1961-1962', '1908-1950', '1895-1900', '(1935)', 'c. 1930', '1974-1984', '(1922-1923)', '1969', '(1974-1975)', '(1971-1973)', '(1986)', '1923-1948', '1972-1973', '1908-1924', '1958-1965', '1948-1950', '1912-1927', '(1917-1918)', '1938-1959', '(1999)', '(1957)', '1947-1953', 'c. 1969-1970', '1998', '

#### Removing unwanted Characters

In [23]:
bad_chr = ["(",")", "c", "C", ".", "'", "s", " "]


def demise(date):
    for c in bad_chr:
        date = date.replace(c,"")
    return date

In [24]:
for row in artworks:
    row[-2] = demise(row[-2])

In [25]:
sub_dat = []

for row in artworks:
    sub_dat.append(row[-2])

In [26]:
unq_sub = list(set(sub_dat))

In [27]:
len(unq_sub)

742

In [28]:
print(unq_sub)

['1994', '1914-1942', '1949-1952', '1900-1944', '1900-1938', '1938-1960', '1910-30', '1896', '1878', '1974-1977', '1911-1931', '1996-1997', '1936-1941', '1943-1944', '1974-1975', '1970-1973', '1855', '1900-1936', '1945-1946', '1920-1922', '1964', '1941', '1837-1852', '1914-1916', '1885', '1877', '1901', '1906', '1930-1931', '1969-1973', '1919-1923', '1949', '1905-1914', '1997-2003', '2000', '1967', '1987-1992', '1866', '1985-1987', '2002-2003', '1961-1962', '1908-1950', '1895-1900', '1974-1984', '1969', '1923-1948', '1972-1973', '1908-1924', '1958-1965', '1948-1950', '1912-1927', '1939-1940', '1938-1959', '2008-2010', '1967-1972', '1947-1953', '1998', '1920-1930', '1927-1952', '1953-1954', '1973-1978', '1933-1935', '1921-1923', '1954-1955', '1947-1958', '1939-1943', '1923-1928', '1952-1954', '1906-1912', '1964-1965', '1952-1953', '1996', '1912-1926', '1943-1981', '1880', '1961', '1940', '2018', '1947-1964', '1911-1914', '1962-1983', '1905', '1934', '1930-1935', '1993-1994', '1965-1968'

#### Taking average of dates in '1932-1966' format

In [29]:
for row in artworks:
    if "-" in row[-2]:
        splt = row[-2].split("-")
        f = int(splt[0])
        s = int(splt[1])
        avg = (f+s)/2
        avg = int(avg)
        row[-2] = avg
    else:
        row[-2] = int(row[-2])

In [30]:
sub_dat = []

for row in artworks:
    sub_dat.append(row[-2])

In [31]:
unq_sub = list(set(sub_dat))

In [32]:
len(unq_sub)

182

In [33]:
print(unq_sub)

[1798, 1805, 970, 1886, 1811, 1818, 1826, 1832, 1836, 1837, 1840, 1844, 1845, 1846, 1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1863, 1864, 1865, 1866, 1868, 1869, 1870, 1871, 1872, 1873, 1874, 1875, 1876, 1877, 1878, 1880, 1881, 1882, 1883, 1884, 1885, 985, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,

# Cleaned data

In [34]:
print(artworks[:3])

[['Dress MacLeod from Tartan Sets', 'Sarah Charlesworth', 'American', 1947, 2013, 'Female', 1986, 'Prints & Illustrated Books'], ['Duplicate of plate from folio 11 verso (supplementary suite, plate 4) from ARDICIA', 'Pablo Palazuelo', 'Spanish', 1916, 2007, 'Male', 1978, 'Prints & Illustrated Books'], ['Tailpiece (page 55) from SAGESSE', 'Maurice Denis', 'French', 1870, 1943, 'Male', 1900, 'Prints & Illustrated Books']]


In [35]:
f = open("artists_updated.csv","w", encoding = "utf-8", newline = "")

In [36]:
writer = csv.writer(f, delimiter = ",")

In [37]:
writer.writerow(headers)

for row in artworks:
    writer.writerow(row)