## Humanities Digital Archives Project, Part 1b Data Prep
Shruti Gupta, Lisa Over, L. Sooter

In [135]:
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
import warnings;
warnings.filterwarnings('ignore');

In [136]:
# Import libraries
import pandas as pd
import numpy as np
import nltk
import re
import copy
import contractions
import spacy
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

### Load for_prep_items.csv

In [137]:
# Let pandas display every column of a frame instead of eliding the middle ones
pd.options.display.max_columns = None

# Read the Islandora items metadata export (decoded as latin-1, string dtype requested)
items = pd.read_csv(
    'for_prep_items.csv',
    dtype='unicode',
    encoding='latin-1',
)

# Preview the first rows
items.head()

Unnamed: 0.1,Unnamed: 0,Identifier,Title,Creator,Sort Date,Type of Resource,Genre,Abstract,Source,Site Membership,Collection Membership,Ttle_Abs
0,0,000001.PIC,Musicians,Paul Slantis,1950-01-01T00:00:00,still image,photograph,Six young African-American males playing percu...,"Paul Slantis Photograph Collection, ca. 1946-1...","HistPitt, Digital",collection.68,Musicians. Six young African-American males pl...
1,1,000001.UE,Sylvania Locals Conference,United Electrical Workers,1945-04-29T00:00:00,still image,photograph,Leaders of Sylvania Locals. Front row (L-R): L...,"UE News Photograph Collection, 1933-1998UE New...","HistPitt, Digital",collection.89,Sylvania Locals Conference. Leaders of Sylvani...
2,2,000002.PIC,USMC Pilot with McDonnell FH Phantom,Paul Slantis,1950-01-01T00:00:00,still image,photograph,United States Marine Corps pilot with his McDo...,"Paul Slantis Photograph Collection, ca. 1946-1...","HistPitt, Digital",collection.68,USMC Pilot with McDonnell FH Phantom. United S...
3,3,000002.UE,Soup Kitchen,United Electrical Workers,1940-01-01T00:00:00,still image,photograph,Drinking donated coffee at Johnsonburg Plant S...,"UE News Photograph Collection, 1933-1998UE New...","HistPitt, Digital",collection.89,Soup Kitchen. Drinking donated coffee at Johns...
4,4,000003.PIC,Greater Pittsburgh International Airport Opening,Paul Slantis,1952-05-31T00:00:00,still image,photograph,US Navy airplanes lined up on the tarmac of th...,"Paul Slantis Photograph Collection, ca. 1946-1...","HistPitt, Digital",collection.68,Greater Pittsburgh International Airport Openi...


#### Combine Abstract and Title

In [138]:
# Swap every missing value (NaN) for an empty string so the text columns can be
# concatenated without propagating NaN
items = items.fillna('')

In [139]:
# Title and abstract joined into one text field, separated by '. '
title_col = items['Title']
abstract_col = items['Abstract']
items['Ttle_Abs'] = title_col + '. ' + abstract_col

In [140]:
# Spot-check the first five combined title/abstract strings
items.loc[:, 'Ttle_Abs'].head(5)

0    Musicians. Six young African-American males pl...
1    Sylvania Locals Conference. Leaders of Sylvani...
2    USMC Pilot with McDonnell FH Phantom. United S...
3    Soup Kitchen. Drinking donated coffee at Johns...
4    Greater Pittsburgh International Airport Openi...
Name: Ttle_Abs, dtype: object

#### Correct spelling in items dataset

In [141]:
# Download the NLTK 'words' corpus, then keep its entries in a set so the
# membership tests in the following cells are fast.
# NOTE(review): lookups against this set are case-sensitive — presumably
# capitalised tokens from the raw text get reported as unknown; confirm.
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /Users/lisaover/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [142]:
# Create a file with words not in English dictionary

# Characters stripped from each token before the dictionary lookup.  A raw
# string keeps the backslash literal without relying on the invalid "\," escape.
punctuations = r'''!()[]{};:'"\,<>./?@#$%^&*_~'''
strip_punct = str.maketrans('', '', punctuations)

# Tokens come from splitting on single spaces only, so runs of spaces produce
# empty tokens; those are skipped after punctuation removal.
word_lst = []
for text in items['Ttle_Abs']:
    for token in text.split(' '):
        no_punct = token.translate(strip_punct)
        if no_punct != '' and no_punct not in words:
            word_lst.append(no_punct)

# Explicit UTF-8: the text was decoded as latin-1 and can contain characters
# (e.g. '\x95') that some platform-default encodings cannot write.
with open('nodict_abstract.txt', 'w', encoding='utf-8') as filehandle:
    filehandle.writelines('%s\n' % item for item in word_lst)

In [144]:
# Correct obvious spelling issues
# Literal substitutions applied, in this order, to the lower-cased text before
# the "us navy" expansion.
_PRE_REPLACEMENTS = (
    ('pittsburghã\x95s', 'pittsburgh'),
    ('&amp;', ' '),
    (' edu ', ' education '),
    # post office and post gazette were lumped together because of 'post' - no bigram created for them
    ('post office', 'post_office'),
    ('post gazette', 'post_gazette'),
    # Mis-decoded apostrophes.  The data is read as latin-1, which yields 'ã'
    # followed by U+0095 (never the U+2022 bullet), so both spellings are handled.
    ("ã•", "'"),
    ('ã\x95', "'"),
    # Must run before the bare 'ã' is stripped, otherwise it can never match.
    ("ãs", "'s"),
    ('ã', ''),
)

# 'us' was treated as a pronoun downstream, so the phrase is spelled out.  The
# word boundary keeps words that merely end in "us" (e.g. "famous navy") intact.
_US_NAVY_RE = re.compile(r'\bus navy')

# Literal substitutions applied, in this order, after the "us navy" expansion.
_POST_REPLACEMENTS = (
    ('presenation', 'presentation'),
    ('compenstaion', 'compensation'),
    ('dapartment', 'department'),
    ('buisiness', 'business'),
    ('roomlibrary', 'room library'),
    ('pennsylvania1', 'pennsylvania'),
    ('suporting', 'supporting'),
    ('photgrapher', 'photographer'),
    ('exibition', 'exhibition'),
    ('managementtogetherto', 'management together to'),
    ('profie', 'profile'),
    ('alittle', 'a little'),
    ('aformentioned', 'aforementioned'),
    ('constuctign', 'construction'),
    ('simulatneously', 'simultaneously'),
    ('constuction', 'construction'),
    ('constuctor', 'constructor'),
    ('colellection', 'collection'),
    ('photogrpah', 'photograph'),
    ('volunter', 'volunteer'),
    ('friendrelativearistocrat', 'friend relative aristocrat'),
    ('passsed', 'passed'),
    ('adminstration', 'administration'),
    ('administrationbook', 'administration book'),
    ('agendabook', 'agenda book'),
    ('presbyterianuniversity', 'presbyterian university'),
    ('ttwenty', 'twenty'),
    ('acomplishments', 'accomplishments'),
    ('univesity', 'university'),
    ('|||', ' '),
    ('communictions', 'communications'),
    # Close up spaced hyphens, then turn double hyphens into a space.
    (' - ', '-'),
    ('- ', '-'),
    (' -', '-'),
    ('--', ' '),
)

# Run of URL-like characters that follows one of the prefixes below.
_URL_TAIL = r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# URLs and identifier-like tokens to delete.  The patterns are applied one
# after another, in this order.
_REMOVAL_PATTERNS = tuple(
    re.compile(prefix + _URL_TAIL)
    for prefix in ('http[s]?://', 'www', 'msp33', 'msp80', 'msp117', 'msp285', '201000')
)


# Correct obvious spelling issues
def correct_spelling(t):
    """Lower-case ``t`` and apply the hand-curated clean-up rules.

    Steps, in order:
      1. lower-case the text;
      2. repair mis-decoded characters and protect the phrases
         'post office' / 'post gazette' with an underscore;
      3. expand 'us navy' to 'united states navy' (whole word 'us' only);
      4. fix the listed typos and normalise hyphens and '|||' separators;
      5. delete URLs and tokens starting with 'www', 'msp33', 'msp80',
         'msp117', 'msp285' or '201000'.

    Parameters
    ----------
    t : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    r = t.lower()

    for old, new in _PRE_REPLACEMENTS:
        r = r.replace(old, new)

    r = _US_NAVY_RE.sub('united states navy', r)

    for old, new in _POST_REPLACEMENTS:
        r = r.replace(old, new)

    # Remove URLs
    for pattern in _REMOVAL_PATTERNS:
        r = pattern.sub('', r)

    return r

In [145]:
# Apply correct_spelling to each combined title/abstract string.  Series.map
# works on the column directly instead of materialising every row as a Series.
items['Abstract_cspell'] = items['Ttle_Abs'].map(correct_spelling)

In [146]:
# Re-run the dictionary check on the spell-corrected text and write the
# remaining unknown words to a second file.

# Characters stripped from each token before the dictionary lookup.  A raw
# string keeps the backslash literal without relying on the invalid "\," escape.
punctuations = r'''!()[]{};:'"\,<>./?@#$%^&*_~'''
strip_punct = str.maketrans('', '', punctuations)

# Tokens come from splitting on single spaces only, so runs of spaces produce
# empty tokens; those are skipped after punctuation removal.
word_lst = []
for text in items['Abstract_cspell']:
    for token in text.split(' '):
        no_punct = token.translate(strip_punct)
        if no_punct != '' and no_punct not in words:
            word_lst.append(no_punct)

# Explicit UTF-8: the text was decoded as latin-1 and can contain characters
# (e.g. '\x95') that some platform-default encodings cannot write.
with open('nodict_cspell.txt', 'w', encoding='utf-8') as filehandle:
    filehandle.writelines('%s\n' % item for item in word_lst)

#### Expand contractions

https://stackoverflow.com/questions/49828463/running-replace-contractions-across-all-text-files-in-same-directory-and-outpu

In [147]:
def replace_contractions(text):
    """Return ``text`` (coerced to ``str``) after passing it through ``contractions.fix``."""
    as_string = str(text)
    expanded = contractions.fix(as_string)
    return expanded

In [148]:
# Call replace_contractions on each spell-corrected string.  Series.map works on
# the column directly instead of materialising every row as a Series.
items['Abstract_noc'] = items['Abstract_cspell'].map(replace_contractions)

#### Remove stop words, punctuation, and special characters

In [149]:
# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

# Stop-word sets are built on first use and reused by later calls, so the
# corpus is not reloaded for every row.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop-word list as a set, loading it only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_text(t):
    """Strip punctuation/special characters and English stop words from ``t``.

    Every character that is not an ASCII letter or digit is replaced by a
    space, the result is tokenised with ``word_tokenize``, and tokens found in
    NLTK's English stop-word list are dropped.  The stop-word comparison is
    case-sensitive.

    Parameters
    ----------
    t : str
        Text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' when none survive).
    """
    text = re.sub(r"[^a-zA-Z0-9]", " ", t)
    stop_words = _english_stop_words()
    return ' '.join(w for w in word_tokenize(text) if w not in stop_words)


In [150]:
# Call clean_text on each contraction-expanded string.  Series.map works on the
# column directly instead of materialising every row as a Series.
items['Abstract_clean'] = items['Abstract_noc'].map(clean_text)

In [151]:
# Preview the first five rows, including the derived text columns
items.head(5)

Unnamed: 0.1,Unnamed: 0,Identifier,Title,Creator,Sort Date,Type of Resource,Genre,Abstract,Source,Site Membership,Collection Membership,Ttle_Abs,Abstract_cspell,Abstract_noc,Abstract_clean
0,0,000001.PIC,Musicians,Paul Slantis,1950-01-01T00:00:00,still image,photograph,Six young African-American males playing percu...,"Paul Slantis Photograph Collection, ca. 1946-1...","HistPitt, Digital",collection.68,Musicians. Six young African-American males pl...,musicians. six young african-american males pl...,musicians. six young african-american males pl...,musicians six young african american males pla...
1,1,000001.UE,Sylvania Locals Conference,United Electrical Workers,1945-04-29T00:00:00,still image,photograph,Leaders of Sylvania Locals. Front row (L-R): L...,"UE News Photograph Collection, 1933-1998UE New...","HistPitt, Digital",collection.89,Sylvania Locals Conference. Leaders of Sylvani...,sylvania locals conference. leaders of sylvani...,sylvania locals conference. leaders of sylvani...,sylvania locals conference leaders sylvania lo...
2,2,000002.PIC,USMC Pilot with McDonnell FH Phantom,Paul Slantis,1950-01-01T00:00:00,still image,photograph,United States Marine Corps pilot with his McDo...,"Paul Slantis Photograph Collection, ca. 1946-1...","HistPitt, Digital",collection.68,USMC Pilot with McDonnell FH Phantom. United S...,usmc pilot with mcdonnell fh phantom. united s...,usmc pilot with mcdonnell fh phantom. united s...,usmc pilot mcdonnell fh phantom united states ...
3,3,000002.UE,Soup Kitchen,United Electrical Workers,1940-01-01T00:00:00,still image,photograph,Drinking donated coffee at Johnsonburg Plant S...,"UE News Photograph Collection, 1933-1998UE New...","HistPitt, Digital",collection.89,Soup Kitchen. Drinking donated coffee at Johns...,soup kitchen. drinking donated coffee at johns...,soup kitchen. drinking donated coffee at johns...,soup kitchen drinking donated coffee johnsonbu...
4,4,000003.PIC,Greater Pittsburgh International Airport Opening,Paul Slantis,1952-05-31T00:00:00,still image,photograph,US Navy airplanes lined up on the tarmac of th...,"Paul Slantis Photograph Collection, ca. 1946-1...","HistPitt, Digital",collection.68,Greater Pittsburgh International Airport Openi...,greater pittsburgh international airport openi...,greater pittsburgh international airport openi...,greater pittsburgh international airport openi...


#### Lemmatize words

In [152]:
# Load spaCy's English pipeline; lemmatize_text calls it on every cleaned string.
# NOTE(review): the 'en' shortcut and the parse/tag/entity keywords look like they
# come from older spaCy releases — confirm the installed version accepts them.
nlp = spacy.load('en', parse=True, tag=True, entity=True)

In [153]:
def lemmatize_text(text, pipeline=None):
    """Return ``text`` with each token replaced by its lemma.

    Pronouns keep their surface form.  spaCy 2.x reports the placeholder
    '-PRON-' (hyphens) as the lemma of every pronoun; the previous check looked
    for '_PRON_' (underscores), never matched, and let the placeholder leak
    into the output (e.g. 'us' became '-PRON-').  Both spellings are now
    treated as the placeholder.

    Parameters
    ----------
    text : str
        Text to lemmatize.
    pipeline : callable, optional
        Called on ``text`` to produce an iterable of tokens exposing ``lemma_``
        and ``text``.  Defaults to the notebook-level ``nlp`` object.

    Returns
    -------
    str
        Lemmas (or original pronoun text) joined by single spaces.
    """
    pronoun_placeholders = ('-PRON-', '_PRON_')
    if pipeline is None:
        pipeline = nlp
    doc = pipeline(text)
    return ' '.join(
        token.text if token.lemma_ in pronoun_placeholders else token.lemma_
        for token in doc
    )

In [154]:
# Call lemmatize_text on each cleaned string.  Series.map works on the column
# directly instead of materialising every row as a Series.
items['Abstract_lemma'] = items['Abstract_clean'].map(lemmatize_text)

#### Eliminate nulls/empty fields and write dataset to file

In [155]:
# Keep the rows whose lemmatised text is not null, then report the row/column counts
has_lemma = items['Abstract_lemma'].notna()
items2 = items[has_lemma]
items2.shape

(124539, 16)

In [156]:
# Keep the rows whose spell-corrected text and lemmatised text are both non-empty
# strings, then report the row/column counts
cspell_present = items['Abstract_cspell'] != ''
lemma_present = items['Abstract_lemma'] != ''
items2 = items[cspell_present & lemma_present]
items2.shape

(124517, 16)

In [157]:
# Number of rows in the filtered frame whose lemmatised text is still null
items2['Abstract_lemma'].isna().sum()

0

In [158]:
# Write the filtered, fully processed items to disk.
# NOTE(review): the index is written as an unnamed column; the 'Unnamed: 0' /
# 'Unnamed: 0.1' columns seen when the input was loaded presumably come from the
# same pattern upstream — confirm whether index=False is wanted here.
items2.to_csv(r'for_models_items.csv')

### Load for_prep_combined.csv

In [61]:
# Let pandas display every column of a frame instead of eliding the middle ones
pd.options.display.max_columns = None

# Read the combined items / finding-aids export (decoded as latin-1, string dtype requested)
comb = pd.read_csv(
    'for_prep_combined.csv',
    dtype='unicode',
    encoding='latin-1',
)

# Preview the first rows
comb.head()

Unnamed: 0.1,Unnamed: 0,ItemID,cTitle,iTitle,iCreator,iSort Date,iType of Resource,iGenre,iAbstract,iSource,iSite Membership,iCollection Membership,faAcq Number,faID,faTitle of Collection,faTemporal Coverage,faAbstract,faScope and Content,faSub Heading(s),faBio or History
0,0,000001.PIC,Paul Slantis Photographs,Musicians,Paul Slantis,1950-01-01T00:00:00,still image,photograph,Six young African-American males playing percu...,"Paul Slantis Photograph Collection, ca. 1946-1...","HistPitt, Digital",collection.68,AIS.1991.19a,US-PPiU-ais199119a,Paul Slantis Photographs,ca. 1946-1956,"The Paul Slantis Photographs contain 1,508 4x5...","The collection contains approximately 1,508 4x...",Greater Pittsburgh International Airport.|||La...,"A native of Pittsburgh's Oakland neighborhood,..."
1,1,000001.UE,"United Electrical, Radio, and Machine Workers...",Sylvania Locals Conference,United Electrical Workers,1945-04-29T00:00:00,still image,photograph,Leaders of Sylvania Locals. Front row (L-R): L...,"UE News Photograph Collection, 1933-1998UE New...","HistPitt, Digital",collection.89,UE.14.1,US-PPiU-UE14-1,UE News Photograph Collection,1933-1998,This collection contains photographs dating fr...,This collection contains photographs dating fr...,"Carey, James B.|||Carter, Jimmy, 1924-|||Chave...","In 1939, the UE News replaced the People's Pre..."
2,2,000002.PIC,Paul Slantis Photographs,USMC Pilot with McDonnell FH Phantom,Paul Slantis,1950-01-01T00:00:00,still image,photograph,United States Marine Corps pilot with his McDo...,"Paul Slantis Photograph Collection, ca. 1946-1...","HistPitt, Digital",collection.68,AIS.1991.19a,US-PPiU-ais199119a,Paul Slantis Photographs,ca. 1946-1956,"The Paul Slantis Photographs contain 1,508 4x5...","The collection contains approximately 1,508 4x...",Greater Pittsburgh International Airport.|||La...,"A native of Pittsburgh's Oakland neighborhood,..."
3,3,000002.UE,"United Electrical, Radio, and Machine Workers...",Soup Kitchen,United Electrical Workers,1940-01-01T00:00:00,still image,photograph,Drinking donated coffee at Johnsonburg Plant S...,"UE News Photograph Collection, 1933-1998UE New...","HistPitt, Digital",collection.89,UE.14.1,US-PPiU-UE14-1,UE News Photograph Collection,1933-1998,This collection contains photographs dating fr...,This collection contains photographs dating fr...,"Carey, James B.|||Carter, Jimmy, 1924-|||Chave...","In 1939, the UE News replaced the People's Pre..."
4,4,000003.PIC,Paul Slantis Photographs,Greater Pittsburgh International Airport Opening,Paul Slantis,1952-05-31T00:00:00,still image,photograph,US Navy airplanes lined up on the tarmac of th...,"Paul Slantis Photograph Collection, ca. 1946-1...","HistPitt, Digital",collection.68,AIS.1991.19a,US-PPiU-ais199119a,Paul Slantis Photographs,ca. 1946-1956,"The Paul Slantis Photographs contain 1,508 4x5...","The collection contains approximately 1,508 4x...",Greater Pittsburgh International Airport.|||La...,"A native of Pittsburgh's Oakland neighborhood,..."
