In [92]:
import pandas as pd
import re

artworks = pd.read_csv('artworks.csv')
artworks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 41 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Title                                            110 non-null    object 
 1   Slug                                             110 non-null    object 
 2   Collection ID                                    110 non-null    object 
 3   Locale ID                                        110 non-null    object 
 4   Item ID                                          110 non-null    object 
 5   Created On                                       110 non-null    object 
 6   Updated On                                       110 non-null    object 
 7   Published On                                     76 non-null     object 
 8   Author                                           110 non-null    object 
 9   Does Not Have a Curated Collecti

In [None]:

# 1. Load the state list and artworks
state_list = pd.read_csv('stateList.csv')  
artworks = pd.read_csv('artworks.csv')

# 2. Define columns to search within the artworks dataframe
text_cols = [
    'Coverage',
    'Artwork Description',
    'Full Location of Artwork Creation (For the Map)',
]

# 3. Create whole-word regex patterns for id and value only
def make_whole_word_pattern(name):
    return re.compile(rf'\b{name}\b|{name}$')

# Precompile regex patterns for id and value only
state_list['id_pattern'] = state_list['id'].apply(lambda x: make_whole_word_pattern(re.escape(str(x))))
state_list['value_pattern'] = state_list['value'].apply(lambda x: make_whole_word_pattern(re.escape(str(x))))

# 4. Match using regex for id/value and substring for abbr
def match_states(row):
    combined_text = ' '.join([str(row[col]) for col in text_cols])
    matches = []
    for _, state in state_list.iterrows():
        abbr = str(state['abbr'])
        if (
            state['id_pattern'].search(combined_text) or
            state['value_pattern'].search(combined_text) or
            (abbr in combined_text)
        ):
            matches.append(state['value'])  # Always append the official full state name
    # Remove duplicates while preserving order
    seen = set()
    return [s for s in matches if not (s in seen or seen.add(s))]

# 5. Apply matching function
artworks['state_matches'] = artworks.apply(match_states, axis=1)

# 6. Expand to columns: state1, state2, ..., using pd.NA for missing
max_states = artworks['state_matches'].apply(len).max()
for i in range(max_states):
    col_name = f'state{i+1}'
    artworks[col_name] = artworks['state_matches'].apply(lambda x: x[i] if i < len(x) else pd.NA)

# Optional: drop the helper column
artworks.drop(columns=['state_matches'], inplace=True)

# 8. Preview
artworks.head()


Unnamed: 0,Title,Slug,Collection ID,Locale ID,Item ID,Created On,Updated On,Published On,Author,Does Not Have a Curated Collection,...,Artist Statement,Submitted Creator Bio,Submitted Title of Collection,Submitted Keywords/ Subject,Tags,Order,I agree to the terms and conditions,Number of pages views,Cover_img,state1
0,A Sacred Stone,wael-darweish-1571b,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69eddbee463c9f212c10,Mon Sep 30 2024 09:05:49 GMT+0000 (Coordinated...,Thu Oct 24 2024 14:15:57 GMT+0000 (Coordinated...,Mon Oct 28 2024 15:15:43 GMT+0000 (Coordinated...,wael-darweish,False,...,,,,"Mediterranean Fire, Mediterranean Art, Egyptia...",mediterranean-art; mediterranean-fire; egyptia...,48.0,False,,https://cdn.prod.website-files.com/660437127c5...,
1,Arab Spring 1,hani-alqam-e27b0,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69eeeb505d82c9d613c5,Mon Sep 30 2024 09:05:50 GMT+0000 (Coordinated...,Thu Oct 24 2024 14:04:38 GMT+0000 (Coordinated...,Mon Oct 28 2024 15:15:43 GMT+0000 (Coordinated...,hani-alqam,False,...,,,,,,41.0,False,,https://cdn.prod.website-files.com/660437127c5...,
2,Arab Spring 2,hani-alqam,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69eedbee463c9f212c68,Mon Sep 30 2024 09:05:50 GMT+0000 (Coordinated...,Thu Oct 24 2024 14:04:53 GMT+0000 (Coordinated...,Mon Oct 28 2024 15:15:43 GMT+0000 (Coordinated...,hani-alqam,False,...,,,,,,42.0,False,,https://cdn.prod.website-files.com/660437127c5...,
3,Art as Resistance: A Story from Immigration De...,sarah-turnbull-first-author-joanne-vincett-sec...,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69eeaea6e43b211b3c52,Mon Sep 30 2024 09:05:50 GMT+0000 (Coordinated...,Fri Nov 08 2024 14:19:32 GMT+0000 (Coordinated...,Fri Nov 08 2024 14:19:42 GMT+0000 (Coordinated...,sarah-turnbull,True,...,,,,,,16.0,False,,https://cdn.prod.website-files.com/660437127c5...,
4,Ashrama - What is your refuge?,aashray-harishankar,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69e3e0497f34159e61a8,Mon Sep 30 2024 09:05:39 GMT+0000 (Coordinated...,Fri Dec 13 2024 04:22:26 GMT+0000 (Coordinated...,Fri Dec 13 2024 04:29:35 GMT+0000 (Coordinated...,aashray-harishankar,True,...,,"Aashray Harishankar is a composer, audio engin...",,"Refuge, Installation, Art, CalArts, Sound Art,...",refuge; installation; art; calarts; sound-art;...,150.0,False,,https://cdn.prod.website-files.com/660437127c5...,California


In [89]:

# Load countries data
alts = pd.read_csv('countryList_alts.csv')         # columns: id, value
official = pd.read_csv('countryList_official.csv') # columns: id, value

# Load US states data
state_list = pd.read_csv('stateList.csv')          # columns: id, value, abbr


### === COUNTRY MATCHING ===

# Normalize column headers in artworks
artworks.columns = artworks.columns.str.strip()

# Define columns to search for country detection
country_text_cols = [
    'Coverage',
    'Artwork Description',
    'Full Location of Artwork Creation (For the Map)',
    'Submitted Keywords/ Subject',
    'Submitted Creator Bio'
]

# Ensure expected columns exist
missing_cols = [col for col in country_text_cols if col not in artworks.columns]
if missing_cols:
    raise KeyError(f"Missing expected columns in artworks.csv: {missing_cols}")

# Preprocess for country matching
for col in country_text_cols:
    artworks[col] = artworks[col].fillna('').str.lower()

alts['value'] = alts['value'].fillna('').str.lower()

# Compile regex for alt country names
def make_country_pattern(name):
    return re.compile(rf'\b{name}\b|{name}$', re.IGNORECASE)

alts['pattern'] = alts['value'].apply(lambda v: make_country_pattern(re.escape(v)))

# Match alt names → collect country IDs
def find_country_ids(row):
    combined_text = ' '.join([row[col] for col in country_text_cols])
    return list({alts.iloc[i]['id'] for i, pat in enumerate(alts['pattern']) if pat.search(combined_text)})

artworks['country_ids'] = artworks.apply(find_country_ids, axis=1)

# Map to official country names
id_to_official = dict(zip(official['id'], official['value']))
artworks['country_names'] = artworks['country_ids'].apply(lambda ids: [id_to_official.get(cid) for cid in ids if cid in id_to_official])

# Expand country columns
max_countries = artworks['country_names'].apply(len).max()
for i in range(max_countries):
    col = f'country{i+1}'
    artworks[col] = artworks['country_names'].apply(lambda x: x[i] if i < len(x) else pd.NA)

# Cleanup
artworks.drop(columns=['country_ids', 'country_names'], inplace=True)
artworks


Unnamed: 0,Title,Slug,Collection ID,Locale ID,Item ID,Created On,Updated On,Published On,Author,Does Not Have a Curated Collection,...,Submitted Keywords/ Subject,Tags,Order,I agree to the terms and conditions,Number of pages views,Cover_img,state1,country1,country2,country3
0,A Sacred Stone,wael-darweish-1571b,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69eddbee463c9f212c10,Mon Sep 30 2024 09:05:49 GMT+0000 (Coordinated...,Thu Oct 24 2024 14:15:57 GMT+0000 (Coordinated...,Mon Oct 28 2024 15:15:43 GMT+0000 (Coordinated...,wael-darweish,False,...,"mediterranean fire, mediterranean art, egyptia...",mediterranean-art; mediterranean-fire; egyptia...,48.0,False,,https://cdn.prod.website-files.com/660437127c5...,,Egypt,Lebanon,
1,Arab Spring 1,hani-alqam-e27b0,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69eeeb505d82c9d613c5,Mon Sep 30 2024 09:05:50 GMT+0000 (Coordinated...,Thu Oct 24 2024 14:04:38 GMT+0000 (Coordinated...,Mon Oct 28 2024 15:15:43 GMT+0000 (Coordinated...,hani-alqam,False,...,,,41.0,False,,https://cdn.prod.website-files.com/660437127c5...,,Lebanon,Jordan,
2,Arab Spring 2,hani-alqam,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69eedbee463c9f212c68,Mon Sep 30 2024 09:05:50 GMT+0000 (Coordinated...,Thu Oct 24 2024 14:04:53 GMT+0000 (Coordinated...,Mon Oct 28 2024 15:15:43 GMT+0000 (Coordinated...,hani-alqam,False,...,,,42.0,False,,https://cdn.prod.website-files.com/660437127c5...,,Lebanon,Jordan,
3,Art as Resistance: A Story from Immigration De...,sarah-turnbull-first-author-joanne-vincett-sec...,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69eeaea6e43b211b3c52,Mon Sep 30 2024 09:05:50 GMT+0000 (Coordinated...,Fri Nov 08 2024 14:19:32 GMT+0000 (Coordinated...,Fri Nov 08 2024 14:19:42 GMT+0000 (Coordinated...,sarah-turnbull,True,...,,,16.0,False,,https://cdn.prod.website-files.com/660437127c5...,,United Kingdom,,
4,Ashrama - What is your refuge?,aashray-harishankar,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69e3e0497f34159e61a8,Mon Sep 30 2024 09:05:39 GMT+0000 (Coordinated...,Fri Dec 13 2024 04:22:26 GMT+0000 (Coordinated...,Fri Dec 13 2024 04:29:35 GMT+0000 (Coordinated...,aashray-harishankar,True,...,"refuge, installation, art, calarts, sound art,...",refuge; installation; art; calarts; sound-art;...,150.0,False,,https://cdn.prod.website-files.com/660437127c5...,California,United States,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,Untitled 2,thameur-mejri-4a337,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69eb1eebf91c37b1ddb4,Mon Sep 30 2024 09:05:47 GMT+0000 (Coordinated...,Thu Oct 24 2024 14:13:15 GMT+0000 (Coordinated...,Mon Oct 28 2024 15:15:43 GMT+0000 (Coordinated...,thameur-mejri,False,...,,,71.0,False,,https://cdn.prod.website-files.com/660437127c5...,,Lebanon,Tunisia,
106,Untitled 3,thameur-mejri-c6f68,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69ebdde36efb259de115,Mon Sep 30 2024 09:05:47 GMT+0000 (Coordinated...,Thu Oct 24 2024 14:13:29 GMT+0000 (Coordinated...,Mon Oct 28 2024 15:15:43 GMT+0000 (Coordinated...,thameur-mejri,False,...,"african art, tunisian art, contemporary art, a...",african-art; tunisian-art; contemporary-art; a...,73.0,False,,https://cdn.prod.website-files.com/660437127c5...,,Iraq,Lebanon,Tunisia
107,Untitled 4,thameur-mejri-b2d8e,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69ea6216b28c63744cc0,Mon Sep 30 2024 09:05:46 GMT+0000 (Coordinated...,Thu Oct 24 2024 14:13:44 GMT+0000 (Coordinated...,Mon Oct 28 2024 15:15:43 GMT+0000 (Coordinated...,thameur-mejri,False,...,,,74.0,False,,https://cdn.prod.website-files.com/660437127c5...,,Lebanon,Tunisia,
108,Untitled 5,thameur-mejri-2d55b,6695bb622f410170af1426cf,660437127c5c859535a81612,66fa69ea3f7f8c3999f49420,Mon Sep 30 2024 09:05:46 GMT+0000 (Coordinated...,Thu Oct 24 2024 14:13:57 GMT+0000 (Coordinated...,Mon Oct 28 2024 15:15:43 GMT+0000 (Coordinated...,thameur-mejri,False,...,,,75.0,False,,https://cdn.prod.website-files.com/660437127c5...,,Lebanon,Tunisia,


In [91]:
artworks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 46 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Title                                            110 non-null    object 
 1   Slug                                             110 non-null    object 
 2   Collection ID                                    110 non-null    object 
 3   Locale ID                                        110 non-null    object 
 4   Item ID                                          110 non-null    object 
 5   Created On                                       110 non-null    object 
 6   Updated On                                       110 non-null    object 
 7   Published On                                     76 non-null     object 
 8   Author                                           110 non-null    object 
 9   Does Not Have a Curated Collecti

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm  # Optional: for progress bar

# Base URL
base_url = 'https://www.theamplificationproject.org/projects/'

# Create a column for image sources
img_sources = []

# Loop over each slug and fetch the image source
for slug in tqdm(artworks['Slug']):
    url = base_url + str(slug)
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise error for bad responses
        soup = BeautifulSoup(response.text, 'html.parser')
        img_tag = soup.find('img', class_='project-cover')
        if img_tag and img_tag.get('src'):
            img_sources.append(img_tag['src'])
        else:
            img_sources.append(None)
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        img_sources.append(None)

# Add to dataframe
artworks['Cover_img'] = img_sources

# Preview result
artworks[['Slug', 'Cover_img']].head()


 21%|██        | 23/110 [00:05<00:18,  4.71it/s]

Failed to fetch https://www.theamplificationproject.org/projects/doe-project: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/doe-project


 23%|██▎       | 25/110 [00:05<00:18,  4.66it/s]

Failed to fetch https://www.theamplificationproject.org/projects/dummy-project: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/dummy-project


 25%|██▌       | 28/110 [00:06<00:18,  4.50it/s]

Failed to fetch https://www.theamplificationproject.org/projects/klaudja-sulaj-008d4: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/klaudja-sulaj-008d4


 44%|████▎     | 48/110 [00:10<00:13,  4.73it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-837fc: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-837fc
Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-be992: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-be992


 45%|████▌     | 50/110 [00:10<00:12,  4.65it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-0614a: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-0614a
Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-8132f: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-8132f


 47%|████▋     | 52/110 [00:11<00:12,  4.83it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller
Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-63759: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-63759


 48%|████▊     | 53/110 [00:11<00:12,  4.71it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-9f002: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-9f002


 49%|████▉     | 54/110 [00:11<00:12,  4.43it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-ab64a: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-ab64a


 51%|█████     | 56/110 [00:12<00:12,  4.35it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-c9ce5: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-c9ce5
Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-f6498: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-f6498


 52%|█████▏    | 57/110 [00:13<00:25,  2.11it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-4e657: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-4e657


 53%|█████▎    | 58/110 [00:13<00:26,  1.95it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-fb9b3: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-fb9b3


 54%|█████▎    | 59/110 [00:14<00:25,  2.03it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-f4bf2: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-f4bf2


 55%|█████▌    | 61/110 [00:15<00:21,  2.31it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-9a899: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-9a899
Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-7a4bc: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-7a4bc


 56%|█████▋    | 62/110 [00:15<00:24,  1.95it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-fd45d: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-fd45d


 57%|█████▋    | 63/110 [00:17<00:38,  1.22it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-073ef: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-073ef


 58%|█████▊    | 64/110 [00:18<00:40,  1.13it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-7559d: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-7559d


 59%|█████▉    | 65/110 [00:18<00:34,  1.32it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-029f9: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-029f9


 60%|██████    | 66/110 [00:19<00:36,  1.22it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-2594b: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-2594b


 62%|██████▏   | 68/110 [00:20<00:25,  1.63it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-4a53c: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-4a53c
Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-794c7: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-794c7


 63%|██████▎   | 69/110 [00:21<00:24,  1.68it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-2cc30: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-2cc30


 64%|██████▎   | 70/110 [00:22<00:29,  1.35it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-57811: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-57811


 65%|██████▍   | 71/110 [00:23<00:27,  1.41it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-8f8a5: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-8f8a5


 65%|██████▌   | 72/110 [00:24<00:29,  1.27it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-a7482: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-a7482


 66%|██████▋   | 73/110 [00:24<00:25,  1.45it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-493bc: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-493bc


 67%|██████▋   | 74/110 [00:25<00:24,  1.45it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lili-muller-1: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lili-muller-1


 69%|██████▉   | 76/110 [00:26<00:20,  1.64it/s]

Failed to fetch https://www.theamplificationproject.org/projects/lilli-muller-16b05: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/lilli-muller-16b05


 85%|████████▌ | 94/110 [00:30<00:03,  4.44it/s]

Failed to fetch https://www.theamplificationproject.org/projects/klaudja-sulaj: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/klaudja-sulaj


 95%|█████████▍| 104/110 [00:34<00:02,  2.32it/s]

Failed to fetch https://www.theamplificationproject.org/projects/klaudja-sulaj-24594: 404 Client Error: Not Found for url: https://www.theamplificationproject.org/projects/klaudja-sulaj-24594


100%|██████████| 110/110 [00:35<00:00,  3.06it/s]


Unnamed: 0,Slug,Cover_img
0,wael-darweish-1571b,https://cdn.prod.website-files.com/660437127c5...
1,hani-alqam-e27b0,https://cdn.prod.website-files.com/660437127c5...
2,hani-alqam,https://cdn.prod.website-files.com/660437127c5...
3,sarah-turnbull-first-author-joanne-vincett-sec...,https://cdn.prod.website-files.com/660437127c5...
4,aashray-harishankar,https://cdn.prod.website-files.com/660437127c5...


In [90]:
artworks.to_csv('cleaned_data_2025-05-07.csv', index=False)
