# Notes 
- Some article content is wiki listings.. remove from dataset
- Some article content is not specific to the POI
- Some article content is repeated across multilte POIs

In [1]:
%matplotlib inline
import re
import nltk
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval

In [2]:
from nltk.corpus import stopwords

In [3]:
stopset = set(stopwords.words('english'))

In [4]:
datadir = '/Users/klarnemann/Documents/Insight/Insight_project/data'

In [5]:
fhp_df_f = '%s/federal_historic_places_by_city_for_nlp.xlsx' % (datadir)
fhp_df = pd.read_excel(fhp_df_f, index_col=0)
#fhp_df = fhp_df.drop('Article_tokens', axis=1)
#fhp_df = fhp_df.drop_duplicates()
#fhp_df.sort_values(by=['Location'], inplace=True)
#fhp_df.reset_index(inplace=True, drop=True)
#fhp_df.to_excel(fhp_df_f)

In [6]:
park_df_f = '%s/parks_by_city_for_nlp.xlsx' % (datadir)
park_df = pd.read_excel(park_df_f, index_col=0)
#park_df = park_df.drop_duplicates()
#park_df = park_df.drop('Article_tokens', axis=1)
#park_df.sort_values(by=['Location'], inplace=True)
#park_df.reset_index(inplace=True, drop=True)
#park_df.to_excel(park_df_f)

In [7]:
museum_df_f = '%s/museums_by_city_for_nlp.xlsx' % (datadir)
museum_df = pd.read_excel(museum_df_f, index_col=0)
#museum_df = museum_df.drop_duplicates()
#museum_df = museum_df.drop('Article_tokens', axis=1)
#museum_df.sort_values(by=['Location'], inplace=True)
#museum_df.reset_index(inplace=True, drop=True)
#museum_df.to_excel(museum_df_f)

In [8]:
#poi_df = pd.read_csv('%s/poi_for_nlp.csv' % (datadir), index_col=0)
#poi_df.to_csv('%s/poi_for_nlp.csv' % (datadir))
#poi_df.to_excel('%s/poi_for_nlp.xlsx' % (datadir))

# Formatting data

In [9]:
def clean_text(text, lower=False, properNouns=True):
    if not properNouns:
        ttext = tokenize_text(text, lower=False)
        tagged_text = nltk.pos_tag(ttext)
        edited_text = [word for word, tag in tagged_text if tag != 'NNP' and tag != 'NNPS']            
        text =  ' '.join(edited_text)
    if lower:
        text = text.lower()
    text = re.sub('[^a-zA-Z]+', ' ', text)
    return text

In [10]:
def tokenize_text_from_df(df, col='Article_content', out_col='Article_tokens', \
                          lower=True, stopwords=False, stopset=stopset, \
                          properNouns=True):
    df[out_col] = ''
    n_rows, n_cols = df.shape
    for i in np.arange(n_rows):
        raw_text = df[col].iloc[i]
        if type(raw_text) == float:
            continue
        text = clean_text(raw_text, lower, properNouns)
        tokenizer = nltk.RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
        if stopwords:
            tokens = [w for w in tokens if not w in stopset]
        df[out_col].iloc[i] = tokens
    return df

In [11]:
def tokenize_text(raw_text, lower=True, stopwords=False, stopset=stopset, properNouns=True):
    if type(raw_text) == float:
        return ''
    text = clean_text(raw_text, lower, properNouns)
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    if stopwords:
        tokens = [w for w in tokens if not w in stopset]
    return tokens

## Examine dataframes

In [12]:
park_df.head(5)

Unnamed: 0,Name,Location,Article_content
0,Palmetto Island State Park,"Abbeville, Louisiana",Palmetto Island State Park is a recent additio...
1,Ames Nowell State Park,"Abington, Massachusetts",Ames Nowell State Park is a 700-acre (280 ha) ...
2,Tombigbee National Forest,"Ackerman, Mississippi",Tombigbee National Forest is a U.S. National F...
3,Port Royal State Park,"Adams, Tennessee",Port Royal State Historic Park is a 26 acre (1...
4,Reed Bingham State Park,"Adel, Georgia","Reed Bingham State Park is a 1,613 acre (6.53 ..."


In [13]:
museum_df.head(5)

Unnamed: 0,Name,Location,Article_content
0,Penns Valley Area Historical Museum,"Aaronsburg, Pennsylvania",Valley Forge National Historical Park is the s...
1,Sam Guarino Blacksmith Shop Museum,"Abbeville, Louisiana",This list of museums in Louisiana is a list of...
2,Abbeville Museum,"Abbeville, Louisiana",Abbeville (French pronunciation: [ab.vil] (lis...
3,Louisiana Military Hall of Fame and Museum,"Abbeville, Louisiana","The landing craft, vehicle, personnel (LCVP) o..."
4,Burt-Stark Mansion,"Abbeville, South Carolina","The Burt-Stark Mansion, also known as Armistea..."


In [14]:
fhp_df.head(5)

Unnamed: 0,Name,Location,Article_content
0,Civil War Earthworks at Tallahatchie Crossing,"Abbeville, Mississippi",The Civil War Earthworks at Tallahatchie Cross...
1,Gunpowder Meetinghouse,"Aberdeen Proving Ground, Maryland",Gunpowder Meetinghouse is a historic Methodist...
2,Presbury Meetinghouse,"Aberdeen Proving Ground, Maryland",The Presbury Meetinghouse is a historic Method...
3,Eisenhower Home,"Abilene, Kansas","The Eisenhower Home in Abilene, Kansas at the ..."
4,Federal Building,"Abilene, Texas",A federal building is a building housing local...


## Make input dataframe for NLP

### By POI

In [15]:
poi_df = pd.concat([fhp_df, park_df, museum_df])
poi_df = poi_df.drop_duplicates()
poi_df.sort_values(by=['Location'], inplace=True)
poi_df.reset_index(inplace=True, drop=True)
print(poi_df.shape)
poi_df.head()
poi_df.to_excel('%s/all_poi.xlsx' % (datadir))

(23897, 3)


In [16]:
del park_df
del museum_df
del fhp_df

In [17]:
import wikipedia

In [18]:
wiki_results = []
for i in poi_df.index:
    if i % 500 == 0:
        print(i)
    try:
        poi = poi_df.loc[i, 'Name']
        location = poi_df.loc[i, 'Location']
        city, state = location.split(', ')
        title = wikipedia.page(poi).title
        text = wikipedia.page(poi).content
        if city and state in text:
            wiki_results.append([location, poi, title, text])
            continue
        else:
            search = '%s %s' % (poi, location)
            title = wikipedia.page(search).title
            text = wikipedia.page(search).content
            wiki_results.append([location, poi, title, text])
            continue
    except:
        continue

0




  lis = BeautifulSoup(html).find_all('li')


500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500


In [19]:
input_df = pd.DataFrame(wiki_results, columns=['Location', 'Name', 'Article_title', 
                                               'Article_content'])
input_df.head()

Unnamed: 0,Location,Name,Article_title,Article_content
0,"Aaronsburg, Pennsylvania",Penns Valley Area Historical Museum,Valley Forge National Historical Park,Valley Forge National Historical Park is the s...
1,"Abbeville, Louisiana",Palmetto Island State Park,Palmetto Island State Park,Palmetto Island State Park is a recent additio...
2,"Abbeville, Louisiana",Sam Guarino Blacksmith Shop Museum,List of museums in Louisiana,This list of museums in Louisiana is a list of...
3,"Abbeville, Louisiana",Abbeville Museum,"Abbeville, Louisiana",Abbeville is a city in and the parish seat of ...
4,"Abbeville, Louisiana",Louisiana Military Hall of Fame and Museum,LCVP (United States),"The landing craft, vehicle, personnel (LCVP) o..."


In [36]:
# Remove articles that aren't exclusively about the POI
check_inds = []
rm_inds = []
for i in np.arange(len(input_df.index)):
    city = input_df.Location.iloc[i]
    text = input_df.Article_content.iloc[i]
    title = input_df.Article_title.iloc[i]
    try:
        if title == city:
            rm_inds.append(i)
            continue
        elif 'list' in title.lower():
            rm_inds.append(i)
            continue
        elif 'listing' in title.lower():
            rm_inds.append(i)
            continue
        elif title != input_df.Name.iloc[i]:
            check_inds.append(i)
    except:
        rm_inds.append(i)

In [42]:
input_df['Check'] = 0
input_df['Check'].iloc[check_inds] = 1

In [79]:
input_df.Check.sum()

7875

In [46]:
#input_df.to_csv('%s/all_poi_wiki_search_with_title_location_2.csv' % (datadir))

In [80]:
input_df.loc[np.array(input_df.Check, dtype=bool)].iloc[:2000].to_excel('%s/check_all_poi_wiki_search_with_title_location_0-2000.xlsx' % (datadir))

In [None]:
# Remove articles that aren't exclusively about the POI
rm_inds = []
for i, text in enumerate(poi_df.Article_content):
    try:
        if 'This list of' in text[:12]:
            rm_inds.append(i)
        elif 'This is a list of' in text[:17]:
            rm_inds.append(i)
        elif 'An ' == text[:3]:
            check_inds.append(i)
        elif 'A ' == text[:2]:
            check_inds.append(i)
        elif 'is a city located in' in text[:40]:
            check_inds.append(i)
        elif 'is a town located in' in text[:40]:
            check_inds.append(i)
        elif 'is a borough located in' in text[:40]:
            check_inds.append(i)
        elif 'the population was' in text:
            check_inds.append(i)
    except:
        check_inds.append(i)

In [None]:
fix_inds = [244, 2128, 5204, 6753, 11400, 16168, 17265, 20957, 23298]
rm_inds = list(set(rm_inds).difference(set(fix_inds)))

In [None]:
nlp_poi_df = poi_df.copy()
nlp_poi_df.Article_content.iloc[rm_inds] = np.nan
nlp_poi_df.dropna(axis=0, how='any', inplace=True)
nlp_poi_df.reset_index(inplace=True, drop=True)
#nlp_poi_df.to_excel('%s/all_poi_for_nlp.xlsx' % (datadir))

In [None]:
nlp_poi_df.shape

In [None]:
remove_inds = [0, 2, 3, 10, 11, 12, 13, 15, 16, 17, 19, 21, 26, 28, 31, 32, 36, 38, 
              40, 45, 56, 57, 75, 76, 77, 79, 80, 85, 86, 87, 90, 91, 92, 93, 98
              103, 107, 112, 115, 123, 128, 138, 140, 141, 143, 146, 147, 148, 149, 
              151, 152, 156, 157, 165, 166, 170, 178, 183, 186, 189, 191, 193]

In [None]:
#nlp_poi_df.iloc[199].Article_content

In [None]:
nlp_poi_df.iloc[:20]

### By City

In [84]:
def get_article_content_by_city(city, df):
    city_mask = df.Location == city
    tmp_content = list(df.loc[city_mask].Article_content)
    text = (' ').join(tmp_content)
    return text

In [104]:
def get_pois_by_city(city, df):
    city_mask = df.Location == city
    tmp_content = list(df.loc[city_mask].Name)
    text = (', ').join(tmp_content)
    return text

In [105]:
raw_city_articles = []
city_pois = []
cities = input_df.Location.unique()
for city in cities:
    tmp_article = get_article_content_by_city(city, input_df)
    raw_city_articles.append(tmp_article)
    tmp_pois = get_pois_by_city(city, input_df)
    city_pois.append(tmp_pois)

In [106]:
poi_by_city_df = pd.DataFrame(np.array([cities, city_pois, raw_city_articles]).T, 
                              columns=['Location', 'POIs', 'Article_content'])

In [107]:
poi_by_city_df.head(10)

Unnamed: 0,Location,POIs,Article_content
0,"Abbeville, Louisiana",Palmetto Island State Park,Palmetto Island State Park is a recent additio...
1,"Abbeville, Mississippi",Civil War Earthworks at Tallahatchie Crossing,The Civil War Earthworks at Tallahatchie Cross...
2,"Abbeville, South Carolina",Burt-Stark Mansion,"The Burt-Stark Mansion, also known as Armistea..."
3,"Abercrombie, North Dakota",Fort Abercrombie State Historic Site,"Fort Abercrombie, in North Dakota, was an Amer..."
4,"Aberdeen Proving Ground, Maryland","Gunpowder Meetinghouse, Presbury Meetinghouse",Gunpowder Meetinghouse is a historic Methodist...
5,"Abilene, Kansas","Eisenhower Presidential Library and Museum, Ei...","The Dwight D. Eisenhower Presidential Library,..."
6,"Abingdon, Virginia",William King Museum of Art,"William King Museum of Art, located in Abingdo..."
7,"Abington, Massachusetts",Ames Nowell State Park,Ames Nowell State Park is a 700-acre (280 ha) ...
8,"Abiqui, New Mexico","Georgia O'Keeffe Home and Studio, Florence Haw...",The Georgia O'Keeffe Home and Studio is a hist...
9,"Abita Springs, Louisiana",Abita Mystery House,The Abita Mystery House is a roadside attracti...


In [108]:
poi_by_city_df_f = '%s/unchecked_poi_by_city_for_nlp.csv' % (datadir)
poi_by_city_df.to_csv(poi_by_city_df_f)