# Intermediate Python

In [1]:
from csv import reader

file = 'Data/artworks.csv'
# The context manager closes the file even if parsing raises part-way through.
# newline='' leaves line-ending handling inside quoted fields to the csv module.
with open(file, encoding='utf-8', newline='') as open_file:
    read_file = reader(open_file)
    # Materialise every row now, while the file is still open.
    art = list(read_file)

In [2]:
len(art)

16726

In [3]:
# The first row holds the column names; every remaining row is one artwork record.
headers, moma = art[0], art[1:]

In [4]:
headers

['Title',
 'Artist',
 'Nationality',
 'BeginDate',
 'EndDate',
 'Gender',
 'Date',
 'Department']

## Data Description
* Title: The title of the artwork.
* Artist: The name of the artist who created the artwork.
* Nationality: The nationality of the artist.
* BeginDate: The year in which the artist was born.
* EndDate: The year in which the artist died.
* Gender: The gender of the artist.
* Date: The date that the artwork was created.
* Department: The department inside MoMA to which the artwork belongs.

### Using String Replacement
Remove the parentheses `(` and `)` from the Gender and Nationality columns.

In [5]:
moma[0]

['Dress MacLeod from Tartan Sets',
 'Sarah Charlesworth',
 '(American)',
 '(1947)',
 '(2013)',
 '(Female)',
 '1986',
 'Prints & Illustrated Books']

In [6]:
# Drop the wrapping parentheses from the Nationality (index 2) and
# Gender (index 5) values of every record, in place.
for record in moma:
    for col in (2, 5):
        record[col] = record[col].replace('(', '').replace(')', '')

In [7]:
moma[0]

['Dress MacLeod from Tartan Sets',
 'Sarah Charlesworth',
 'American',
 '(1947)',
 '(2013)',
 'Female',
 '1986',
 'Prints & Illustrated Books']

### Data Cleaning

In [8]:
for record in moma:
    # Gender (index 5): title-case it; a blank value becomes a placeholder label.
    record[5] = record[5].title() or 'Gender Unknown/Other'

    # Nationality (index 2): same treatment with its own placeholder.
    record[2] = record[2].title() or 'Nationality Unknown'

In [9]:
moma[0]

['Dress MacLeod from Tartan Sets',
 'Sarah Charlesworth',
 'American',
 '(1947)',
 '(2013)',
 'Female',
 '1986',
 'Prints & Illustrated Books']

### Clean and Convert BeginDate/EndDate (Artist's Birth and Death Years)

In [10]:
def clean_and_convert(date):
    """Convert a parenthesised year string such as "(1947)" to an int.

    Args:
        date: Raw cell value. Normally a string like "(1947)" or "", but
            values that are not strings (for example an int left behind by
            an earlier run of the conversion loop) are accepted as well.

    Returns:
        The year as an int; the empty string unchanged when the year is
        missing; or ``date`` itself, untouched, when it is not a string.

    Raises:
        ValueError: If a non-empty string does not parse as an integer once
            the parentheses have been removed.
    """
    # Re-running the conversion feeds already-converted ints back in;
    # pass them through instead of failing on a missing .replace method.
    if not isinstance(date, str):
        return date
    # Missing years stay as "" so they remain distinguishable from real years.
    if date == "":
        return date
    return int(date.replace("(", "").replace(")", ""))

# Replace BeginDate (index 3) and EndDate (index 4) with their cleaned values.
# Both conversions are evaluated before either column is written back.
for record in moma:
    record[3], record[4] = clean_and_convert(record[3]), clean_and_convert(record[4])

### Cleaning the "Date" Column
Example Data:
* 1912
* 1929
* 1913-1923
* (1951)
* 1994
* 1934
* c. 1915
* 1995
* c. 1912
* (1988)
* 2002
* 1957-1959
* c. 1955.
* c. 1970's
* C. 1990-1999

#### Prototyping

In [11]:
strings = ["good!", "morn?ing", "good?!", "morniZZZZng"]
bad_chars = ["!", "?", "Z"]

In [12]:
def strip_chars(string, chars=None):
    """Return ``string`` with every occurrence of each unwanted substring removed.

    Args:
        string: Text to clean.
        chars: Optional iterable of substrings to remove. When omitted, the
            module-level ``bad_chars`` is looked up at call time, so existing
            one-argument calls behave exactly as before.

    Returns:
        The cleaned string.
    """
    # Passing ``chars`` explicitly avoids depending on whatever ``bad_chars``
    # happens to be bound to when the function is called.
    if chars is None:
        chars = bad_chars
    for char in chars:
        string = string.replace(char, "")
    return string

In [13]:
# Run every prototype string through strip_chars, preserving input order.
cleaned_strings = [strip_chars(string) for string in strings]

In [14]:
cleaned_strings

['good', 'morning', 'good', 'morning']

#### Application for `Date` Column

In [15]:
# Sample raw Date strings covering plain years, parenthesised years,
# "c."-prefixed years, a decade written as "1970's", and hyphenated ranges.
test_data = ["1912", "1929", "1913-1923",
             "(1951)", "1994", "1934",
             "c. 1915", "1995", "c. 1912",
             "(1988)", "2002", "1957-1959",
             "c. 1955.", "c. 1970's", 
             "C. 1990-1999"]

# Substrings that strip_characters removes from a date string.
# NOTE: this rebinds the name `bad_chars` used by the earlier prototype, and
# the letters "c", "C" and "s" are removed wherever they occur in a string,
# not only in a "c." prefix or an "'s" suffix.
bad_chars = ["(",")","c","C",".","s","'", " "]

def strip_characters(string, chars=None):
    """Remove every occurrence of each unwanted substring from a date string.

    Args:
        string: Raw date text, e.g. "c. 1970's".
        chars: Optional iterable of substrings to remove. When omitted, the
            module-level ``bad_chars`` is looked up at call time, so existing
            one-argument calls behave exactly as before.

    Returns:
        The string with all listed substrings deleted.
    """
    # An explicit ``chars`` keeps the result independent of later
    # rebinding of the shared ``bad_chars`` name.
    if chars is None:
        chars = bad_chars
    for char in chars:
        string = string.replace(char, '')
    return string

# Clean every sample date string, keeping the results in input order.
stripped_test_data = [strip_characters(date) for date in test_data]

print(stripped_test_data)

['1912', '1929', '1913-1923', '1951', '1994', '1934', '1915', '1995', '1912', '1988', '2002', '1957-1959', '1955', '1970', '1990-1999']


### Further `Date` Cleaning
There are still some year ranges. Since we're dealing with artists' ages, being exact isn't essential to our analysis. Here's how we'll proceed:
* Where there is a single year, we'll keep it.
* Where there is a year range, we'll average the two years.

In [19]:
def process_date(string):
    """Turn a cleaned date string into a single integer year.

    Args:
        string: Either a single year ("1990") or a hyphenated range
            ("1990-1999"). Only the first two hyphen-separated parts of a
            range are used.

    Returns:
        The year as an int. For a range, the mean of the two years rounded
        with the built-in ``round``, which sends exact .5 ties to the even
        integer (so "1990-1999" gives 1994).

    Raises:
        ValueError: If a part that should be a year is not an integer.
    """
    # Single year: nothing to average.
    if '-' not in string:
        return int(string)

    parts = string.split('-')
    first_year, second_year = int(parts[0]), int(parts[1])
    return round((first_year + second_year) / 2)

In [21]:
test_range = '1990'
process_date(test_range)

1990

In [22]:
# Convert each stripped sample string to its integer year, in input order.
processed_test_data = [process_date(year) for year in stripped_test_data]

In [23]:
len(processed_test_data)

15

In [24]:
# NOTE(review): this prints the stripped strings again, not processed_test_data — confirm which list was meant to be shown.
print(stripped_test_data)

['1912', '1929', '1913-1923', '1951', '1994', '1934', '1915', '1995', '1912', '1988', '2002', '1957-1959', '1955', '1970', '1990-1999']


### Use New Processing Function on MoMA Dataset

In [31]:
print(list(enumerate(headers)))

[(0, 'Title'), (1, 'Artist'), (2, 'Nationality'), (3, 'BeginDate'), (4, 'EndDate'), (5, 'Gender'), (6, 'Date'), (7, 'Department')]


In [33]:
# Date is column index 6: strip the noise characters, then collapse the
# result to a single integer year and write it back in place.
for artwork in moma:
    artwork[6] = process_date(strip_characters(artwork[6]))

In [37]:
# Quick verification
for record in moma[:15]:
    print(record[6])

1986
1978
1900
1934
1903
1957
1924
1980
2001
1941
1950
1963
1910
1934
1997
