# -*- coding: utf-8 -*-
"""Download data from the Dictionary of American Naval Fighting Ships (DANFS)
"""
import argparse
import json
import urllib.parse
from os import path

import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, Table, Column, String, MetaData
# Shared schema registry for both ship tables.
metadata = MetaData()

# Histories scraped from the main DANFS index.
table_danfs = Table(
    "danfs_ships", metadata,
    Column('id', String(), primary_key=True),
    Column('url', String(), nullable=False),
    Column('title', String(), nullable=False),
    Column('subtitle', String()),
    Column('history', String()),
)

# Histories scraped from the separate Confederate-ships index.
table_confederate = Table(
    "confederate_ships", metadata,
    Column('id', String(), primary_key=True),
    Column('url', String(), nullable=False),
    Column('title', String(), nullable=False),
    Column('subtitle', String()),
    Column('history', String()),
)
class DANFSClient(object):
    """Client for the Naval History and Heritage Command DANFS JSON API.

    Wraps the two JCR content endpoints (the main DANFS index and the
    separate Confederate-ships index) and the per-ship HTML pages.
    """

    BASE_URL = "http://www.history.navy.mil"

    def api_url(self):
        """Return the JSON API endpoint for the main DANFS index."""
        return urllib.parse.urljoin(
            self.BASE_URL,
            "research/histories/ship-histories/danfs/jcr:content/api.json")

    def api_confed_url(self):
        """Return the JSON API endpoint for the Confederate-ships index."""
        return urllib.parse.urljoin(
            self.BASE_URL,
            "research/histories/ship-histories/confederate_ships/"
            "jcr:content.rollup.json")

    def get_groups_list(self):
        """Return the top-level alphabetical groups of the DANFS index."""
        r = requests.get(self.api_url(), params={'get': 'groupsList'})
        return r.json()

    def get_sub_groups(self, first):
        """Return the sub-groups (character ranges) within group ``first``."""
        r = requests.get(self.api_url(),
                         params={'get': 'subGroupsList',
                                 'first': first})
        return r.json()

    def get_sub_group_ship_list(self, first, start_char, end_char):
        """Return the ship list for the ``start_char``-``end_char`` range
        of group ``first``."""
        r = requests.get(self.api_url(),
                         params={'get': 'subGroupShipList',
                                 'first': first,
                                 'second': start_char + '-' + end_char})
        return r.json()

    def get_all_ship_urls(self):
        """Walk every group and non-empty sub-group; return all ship entries."""
        ships = []
        for group in self.get_groups_list()['groups']:
            for subgroup in self.get_sub_groups(group['group'])['subGroups']:
                # Sub-groups with no ships carry an 'isEmpty' marker key.
                if 'isEmpty' not in subgroup:
                    newships = self.get_sub_group_ship_list(
                        group['group'],
                        subgroup['rangeStartChar'],
                        subgroup['rangeEndChar'])['DANFs']
                    ships += newships
        return ships

    def get_ship_url(self, path):
        """Return the full HTML page URL for a ship's site-relative path."""
        return urllib.parse.urljoin(self.BASE_URL, path + '.html')

    @staticmethod
    def _extract_history(html):
        """Extract the concatenated history sections from a ship page.

        Returns the raw HTML of every ``div.text.parbase.section`` inside
        ``div.bodyContainer``.  Raises AttributeError when the page has no
        body container (callers decide how to report that).
        """
        soup = BeautifulSoup(html, 'lxml')
        bodyContainer = soup.find("div", class_="bodyContainer")
        return ''.join(str(x) for x in bodyContainer.find_all(
            "div", class_="text parbase section"))

    def get_ship_text(self, path):
        """Fetch a DANFS ship page and return its history HTML ('' on error)."""
        shipurl = self.get_ship_url(path)
        r = requests.get(shipurl)
        try:
            text = self._extract_history(r.text)
        except AttributeError:
            # Page exists but has no bodyContainer div.
            print("Problem with text in %s" % path)
            text = ''
        return text

    def get_confederate_groups(self):
        """Return the paging metadata ('ranges') for the Confederate index."""
        url = self.api_confed_url()
        r = requests.get(url)
        return r.json()

    def get_confederate_ships(self, limit, offset):
        """Return one page of Confederate ship entries."""
        r = requests.get(self.api_confed_url(),
                         params={'offset': offset,
                                 'limit': limit})
        return r.json()

    def get_confederate_ships_all(self):
        """Walk every non-empty letter range; return all Confederate entries."""
        ships = []
        for letter in self.get_confederate_groups()['ranges']:
            if 'isEmpty' not in letter:
                pages = self.get_confederate_ships(
                    letter['limit'], letter['offset'])['pages']
                ships += pages
        return ships

    def get_confederate_ship_url(self, path):
        """Return the full HTML page URL for a Confederate ship's path."""
        return urllib.parse.urljoin(self.BASE_URL, path + '.html')

    def get_confederate_ship_text(self, path):
        """Fetch a Confederate ship page and return its history HTML
        ('' when the page is missing or malformed)."""
        shipurl = self.get_confederate_ship_url(path)
        r = requests.get(shipurl)
        if r.status_code == requests.codes.ok:
            try:
                text = self._extract_history(r.text)
            except AttributeError:
                # BUG FIX: previously an AttributeError escaped here when a
                # 200 page had no bodyContainer div; now handled like the
                # DANFS path above.
                print("Problem with text in %s" % path)
                text = ''
        else:
            print("Cannot find %s" % path)
            text = ''
        return text
def insert_confederate_ships(con):
    """Download every Confederate ship entry and insert it into the
    ``confederate_ships`` table via connection ``con``."""
    client = DANFSClient()
    insert_stmt = table_confederate.insert()
    for ship in client.get_confederate_ships_all():
        # Progress indicator for what is a long-running scrape.
        print(ship['title'])
        ship_path = ship['path']
        row = {
            'id': path.basename(ship_path),
            'url': client.get_confederate_ship_url(ship_path),
            'title': ship['title'],
            'subtitle': ship['subtitle'],
            'history': client.get_confederate_ship_text(ship_path),
        }
        con.execute(insert_stmt, **row)
def insert_danfs(con):
    """Download every DANFS ship entry and insert it into the
    ``danfs_ships`` table via connection ``con``.

    Entries whose title appears in ``EXCLUDE`` (index pages such as
    "What's New") are printed but skipped.
    """
    EXCLUDE = ("What's New", )
    client = DANFSClient()
    insert_stmt = table_danfs.insert()
    for ship in client.get_all_ship_urls():
        ship_path = ship['path']
        ship_id = path.basename(ship_path)
        # Progress indicator for what is a long-running scrape.
        print(ship['title'])
        if ship['title'] in EXCLUDE:
            continue
        row = {
            'id': ship_id,
            'url': client.get_ship_url(ship_path),
            'title': ship['title'],
            'subtitle': ship['subtitle'],
            'history': client.get_ship_text(ship_path),
        }
        con.execute(insert_stmt, **row)
def build(dbname):
    """Create a fresh schema in the target database and load all histories.

    Parameters
    ----------
    dbname : str
        A SQLAlchemy connection string, e.g. ``"sqlite:///danfs.sqlite3"``.

    Drops any existing ``danfs_ships`` / ``confederate_ships`` tables,
    recreates them, then downloads and inserts every ship history.
    """
    # BUG FIX: the original called create_engine(DB) — ``DB`` is an
    # undefined global, so every invocation raised NameError.  The
    # ``dbname`` argument is what was intended.
    engine = create_engine(dbname)
    metadata.bind = engine
    metadata.drop_all()
    metadata.create_all(checkfirst=True)
    con = metadata.bind
    insert_danfs(con)
    insert_confederate_ships(con)
def main():
    """Command-line entry point: parse arguments and build the database."""
    parser = argparse.ArgumentParser(
        description="Download DANFS data into a database")
    # BUG FIX: the original used nargs='1' (an invalid nargs value that
    # makes argparse raise a TypeError while building its match pattern).
    # nargs='?' makes the positional optional so the default connection
    # string applies, and leaves args.db a plain string as build() expects.
    parser.add_argument(
        'db', metavar='CONNECTION_STRING', nargs='?',
        default="sqlite:///danfs.sqlite3",
        help="A sqlalchemy connection string to use to directly execute "
             "generated SQL on a database.")
    args = parser.parse_args()
    build(args.db)


if __name__ == "__main__":
    main()