In [1]:
# Pandas, numpy and matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regular expressions and string constants (string.punctuation is used for the stop-word list)
import re
import string

# File management
import os
from collections import defaultdict

# NLTK functions for generating the corpus
import nltk
from nltk.corpus import PlaintextCorpusReader

# Custom functions to handle data
from Met_utils import *

In [2]:
# Importing raw data: build the MetData wrapper (presumably supplied by the
# Met_utils star import) from the "MetObjects.csv" file name.
# The loaded table is reached through metData.data in the cells that follow.
metData = MetData("MetObjects.csv")

In [3]:
# Looking at the first few entries of the loaded table to check its columns and values
metData.data.head()

Unnamed: 0,Object Number,Is Highlight,Is Public Domain,Object ID,Department,Object Name,Title,Culture,Period,Dynasty,...,Subregion,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Metadata Date,Repository
0,1979.486.1,False,False,1,American Decorative Arts,Coin,One-dollar Liberty Head Coin,,,,...,,,,,,Metal,,http://www.metmuseum.org/art/collection/search/1,11/26/2018 8:00:04 AM,"Metropolitan Museum of Art, New York, NY"
1,1980.264.5,False,False,2,American Decorative Arts,Coin,Ten-dollar Liberty Head Coin,,,,...,,,,,,Metal,,http://www.metmuseum.org/art/collection/search/2,11/26/2018 8:00:04 AM,"Metropolitan Museum of Art, New York, NY"
2,67.265.9,False,False,3,American Decorative Arts,Coin,Two-and-a-Half Dollar Coin,,,,...,,,,,,Metal,,http://www.metmuseum.org/art/collection/search/3,11/26/2018 8:00:04 AM,"Metropolitan Museum of Art, New York, NY"
3,67.265.10,False,False,4,American Decorative Arts,Coin,Two-and-a-Half Dollar Coin,,,,...,,,,,,Metal,,http://www.metmuseum.org/art/collection/search/4,11/26/2018 8:00:04 AM,"Metropolitan Museum of Art, New York, NY"
4,67.265.11,False,False,5,American Decorative Arts,Coin,Two-and-a-Half Dollar Coin,,,,...,,,,,,Metal,,http://www.metmuseum.org/art/collection/search/5,11/26/2018 8:00:04 AM,"Metropolitan Museum of Art, New York, NY"


In [4]:
# Size of the loaded table, shown as (rows, columns)
metData.data.shape

(472669, 43)

In [5]:
# Preview how 'Department' and 'Title' read once joined by a single space,
# looking only at the entry labelled 0.
metData.data['Department'].add(' ').add(metData.data['Title']).loc[0]

'American Decorative Arts One-dollar Liberty Head Coin'

In [6]:
# First rows returned by filter_dep(5); position 5 in metData.depts is 'Asian Art',
# which matches the Department column shown in the output.
# NOTE(review): presumably filter_dep selects rows by department position — confirm in Met_utils.
metData.filter_dep(5).head()

Unnamed: 0,Object Number,Is Highlight,Is Public Domain,Object ID,Department,Object Name,Title,Culture,Period,Dynasty,...,Subregion,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Metadata Date,Repository
30651,96.14.193,False,True,35966,Asian Art,Piece,,Japan,Edo period (1615–1868),,...,,,,,,Leatherwork,,http://www.metmuseum.org/art/collection/search...,11/26/2018 8:00:04 AM,"Metropolitan Museum of Art, New York, NY"
30652,96.14.1896,False,True,35967,Asian Art,Panel,,China,,,...,,,,,,Leatherwork,,http://www.metmuseum.org/art/collection/search...,11/26/2018 8:00:04 AM,"Metropolitan Museum of Art, New York, NY"
30653,09.3,False,True,35968,Asian Art,Wall hanging,清 佚名 台南地區荷蘭城堡\t|Forts Zeelandia and Provinti...,China,,,...,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,11/26/2018 8:00:04 AM,"Metropolitan Museum of Art, New York, NY"
30654,12.37.135,False,False,35969,Asian Art,Hanging scroll,,China,Qing dynasty (1644–1911),,...,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,11/26/2018 8:00:04 AM,"Metropolitan Museum of Art, New York, NY"
30655,13.100.22,False,True,35970,Asian Art,Hanging scroll,明 丁雲鵬 潯陽送客圖 軸|Song of the Lute,China,late Ming dynasty (1368–1644),,...,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,11/26/2018 8:00:04 AM,"Metropolitan Museum of Art, New York, NY"


In [7]:
# Department names; their list positions are what the department indices below refer to
metData.depts

['American Decorative Arts',
 'European Sculpture and Decorative Arts',
 'Modern and Contemporary Art',
 'Arms and Armor',
 'Medieval Art',
 'Asian Art',
 'Costume Institute',
 'Islamic Art',
 'Arts of Africa, Oceania, and the Americas',
 'Drawings and Prints',
 'Greek and Roman Art',
 'Photographs',
 'Ancient Near Eastern Art',
 'European Paintings',
 'Robert Lehman Collection',
 'The Cloisters',
 'Musical Instruments',
 'Egyptian Art',
 'The Libraries']

In [8]:
# Pick the departments to work with, identified by their position in metData.depts
depts_indices = [5, 6, 11, 16, 17]
depts_list = list(map(lambda pos: metData.depts[pos], depts_indices))
print(depts_list)

['Asian Art', 'Costume Institute', 'Photographs', 'Musical Instruments', 'Egyptian Art']


In [9]:
# Feature selection: the columns whose text is gathered for each object
feature_cols = [
    'Object Name',
    'Title',
    'Artist Display Name',
    'Medium',
    'Classification',
    'Credit Line',
]

In [10]:
# Gather the text of the selected feature columns for every object.
# Judging by the textData.head() output further down, the result carries
# 'Object ID' and 'text' columns — confirm against gather_text in Met_utils.
textData = metData.gather_text(feature_cols)

In [11]:
# Tokens to strip from the corpus: NLTK's English stop words followed by
# each character of string.punctuation as its own entry.
useless_words = [
    *nltk.corpus.stopwords.words("english"),
    *string.punctuation,
]

In [12]:
# Dry run of the tokenisation on the first record: runs of the listed
# punctuation characters become a space, the text is split on whitespace,
# and tokens present in useless_words are dropped.
list(filter(
    lambda token: token not in useless_words,
    re.sub(r'[,.;@#?!&$-]+', ' ', textData['text'].iloc[0]).split(),
))

['Coin',
 'One',
 'dollar',
 'Liberty',
 'Head',
 'Coin',
 'James',
 'Barton',
 'Longacre',
 'Gold',
 'Metal',
 'Gift',
 'Heinz',
 'L',
 'Stoppelmann',
 '1979']

In [13]:
def generate_word_list(text, useless_words=useless_words):
    """Split a text into words and drop the unwanted tokens.

    Runs of the characters ``, . ; @ # ? ! & $ -`` are replaced by a single
    space, the result is split on whitespace, and every token found in
    ``useless_words`` is discarded. Matching is case-sensitive: a token is
    removed only if it appears in ``useless_words`` in exactly that form.

    Parameters
    ----------
    text : str or None
        Raw text to tokenise. Missing values (``None``, ``NaN``, ``pd.NA``)
        are accepted.
    useless_words : iterable of str, optional
        Tokens to remove. Defaults to the module-level ``useless_words`` as it
        was when this function was defined.

    Returns
    -------
    list of str or None
        The remaining tokens in their original order, or ``None`` when
        ``text`` is missing.
    """
    # Identity test for None; pd.isna additionally catches the NaN / pd.NA
    # markers pandas uses for missing text, which would otherwise make
    # re.sub raise a TypeError.
    if text is None or pd.isna(text):
        return None

    # Set membership is O(1) per token instead of a scan of the whole list.
    if isinstance(useless_words, (set, frozenset)):
        excluded = useless_words
    else:
        excluded = set(useless_words)

    word_list = re.sub(r'[,.;@#?!&$-]+', ' ', text).split()
    return [w for w in word_list if w not in excluded]

In [17]:
# Tokenise every record's text and store the word lists in a 'text list' column
textData['text list'] = textData['text'].apply(generate_word_list)

In [18]:
# Check the first rows now that the 'text list' column has been added
textData.head()

Unnamed: 0,Object ID,text,text list
0,1,Coin One-dollar Liberty Head Coin James Barto...,"[Coin, One, dollar, Liberty, Head, Coin, James..."
1,2,Coin Ten-dollar Liberty Head Coin Christian G...,"[Coin, Ten, dollar, Liberty, Head, Coin, Chris..."
2,3,Coin Two-and-a-Half Dollar Coin Gold Metal G...,"[Coin, Two, Half, Dollar, Coin, Gold, Metal, G..."
3,4,Coin Two-and-a-Half Dollar Coin Gold Metal G...,"[Coin, Two, Half, Dollar, Coin, Gold, Metal, G..."
4,5,Coin Two-and-a-Half Dollar Coin Gold Metal G...,"[Coin, Two, Half, Dollar, Coin, Gold, Metal, G..."


In [11]:
# Scratch array holding the integers 1 through 5
test1 = np.arange(1, 6)
test1

array([1, 2, 3, 4, 5])

In [19]:
# Scratch check: put a single integer zero in front of test1
test2 = np.concatenate([np.zeros(1, dtype=int), test1])
print(test2)

[0 1 2 3 4 5]


In [21]:
# Scratch array of five floating-point zeros
test3 = np.zeros((5,), dtype=float)
test3

array([0., 0., 0., 0., 0.])