# Getting things going and highlighting "the problem"

In [1]:
import pandas as pd
import sqlite3

ramphs = pd.read_csv("https://raw.githubusercontent.com/sfsheath/roman-amphitheaters/master/roman-amphitheaters.csv")
chronogrps = pd.read_csv("https://raw.githubusercontent.com/sfsheath/roman-amphitheaters/master/chronogrps.csv")

# I put an uncompressed version of pleiades-places into my folder on the github repo
pleiades = pd.read_csv("https://raw.githubusercontent.com/isaw-ga-3024/isaw-ga-3024.github.io/master/heath-sebastian/pleiades-places.csv")

# One note: because it takes a long time to load the pleiades data,
# avoid running this cell again unless necessary
# As I look back on class, perhaps this is why Mikael reported 
# that his replace was taking a long time. Was the data reloading across the internet?

In [2]:
# ramphs, chronogrps, and pleiades are now pandas DataFrames

print(type(ramphs))
print(type(chronogrps))
print(type(pleiades))

# Again, these are pandas DataFrames, not sql tables!

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [3]:
# OK, to our problem in class.

# inspect the values in both DataFrmes
print(ramphs['pleiades'].head(2))
print(pleiades['id'].head(2))

# They're different. Bummer.

0    http://pleiades.stoa.org/places/893989
1    http://pleiades.stoa.org/places/148217
Name: pleiades, dtype: object
0    265876
1    265877
Name: id, dtype: int64


# A tiny bit of string manipulation

In [4]:
tmp = "Hello World"
tmp.replace("Hello", "") # note that this doesn't remove the space

' World'

# A little bit of pandas dataframe manipulation

In [5]:
# first
ramphs.head(2)

Unnamed: 0,id,title,label,pleiades,type,capacity,modcountry,chronogrp,certainty,youtube,extmajor,extminor,arenamajor,arenaminor,latitude,longitude,elevation
0,duraEuroposAmphitheater,Amphitheater at Dura Europos,Dura,http://pleiades.stoa.org/places/893989,amphitheater,1000.0,Syria,Severan,,,50.0,44.0,31.0,25.0,34.749855,40.728926,223
1,arlesAmphitheater,Amphitheater at Arles,Arles,http://pleiades.stoa.org/places/148217,amphitheater,20000.0,France,Flavian,,https://www.youtube.com/watch?v=oCz-76hb1LU,136.0,107.0,47.0,32.0,43.677778,4.631111,21


In [6]:
# then
ramphs['pleiades'].head(2)

# so it's easy to output either all columns, or just a selection.
# And it's likely that your notebook is trying to nicely format the output of ramphs.head(2).

0    http://pleiades.stoa.org/places/893989
1    http://pleiades.stoa.org/places/148217
Name: pleiades, dtype: object

In [7]:
# it's easy to create new columns
ramphs['new'] = "test"
ramphs['new'].head(5)

# And look!!!! Very important to note that pandas will automatically assign "test" to all rows.
# That's one of its great powers.

0    test
1    test
2    test
3    test
4    test
Name: new, dtype: object

In [8]:
# combining pandas and string manipulation
ramphs['new'].str.replace("e","").head(5) # do note the ".str" before ".replace"

# Again, pandas calculates result for all rows

0    tst
1    tst
2    tst
3    tst
4    tst
Name: new, dtype: object

In [9]:
# here's a useful construct, though dangerous in that 
# it changes data (a concern some of you raised in class)


ramphs['new'] = ramphs['new'].str.replace("t","")
ramphs['new'].head(5)

0    es
1    es
2    es
3    es
4    es
Name: new, dtype: object

# Review of adding tables to an sql database (to keep our terminology straight)

In [10]:
# The conn variable is a 'database connection'
# we are making this database in memory, meaning it won't be saved
conn = sqlite3.connect(':memory:')

# as convenience, I've put the above in a separate cell. You may not need to run it again.


In [11]:
# the following two lines create sql tables from pandas DataFrames
ramphs.to_sql('ramphs',conn,if_exists="replace")
chronogrps.to_sql('chronogrps', conn, if_exists="replace")
pleiades.to_sql('pleiades', conn, if_exists="replace")
# copy-paste, then edit one of the the above lines to make a pleiades table


In [12]:
# let's test that we have sql tables
pd.read_sql("""SELECT id,pleiades FROM RAMPHS WHERE capacity > 40000""",conn)


# but that only confirms all is well working for the ramphs table.
# Good to test others as well.

Unnamed: 0,id,pleiades
0,romeFlavianAmphitheater,http://pleiades.stoa.org/places/423025


In [13]:
pd.read_sql("""SELECT title,id 
FROM pleiades 
WHERE (maxDate > 0) AND (maxDate < 200)""",conn)

Unnamed: 0,title,id
0,Ladle Hill Iron Age Fort,127132165
1,Bullsdown Camp,127132166
2,The Frith,127132167
3,Flower's Barrow,355161047
4,Woden Law hill fort,728181788
5,Pen-y-crug,56616071
6,Clovelly Dykes,975884602
7,Beltany,671053357
8,Pencoedfoel,964510418
9,Pen Dinas hill fort,964510419


In [14]:
pd.read_sql("""SELECT chronogrp,start,end 
FROM chronogrps 
WHERE (start > 0) AND (end < 200)
ORDER BY start""",conn)

Unnamed: 0,chronogrp,start,end
0,First Century,1,99
1,Neronian,54,68
2,Flavian,69,96
3,Late1stEarly2nd,75,125
4,Second Century,100,199
5,Hadrianic,117,138
6,Late Second Century,150,199


In [15]:
pd.read_sql("""PRAGMA table_info(ramphs)""",conn)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,id,TEXT,0,,0
2,2,title,TEXT,0,,0
3,3,label,TEXT,0,,0
4,4,pleiades,TEXT,0,,0
5,5,type,TEXT,0,,0
6,6,capacity,REAL,0,,0
7,7,modcountry,TEXT,0,,0
8,8,chronogrp,TEXT,0,,0
9,9,certainty,TEXT,0,,0


In [16]:
# a reminder
pd.read_sql("""PRAGMA table_info(chronogrps)""",conn)
# will display the columns in the pleiades table (assuming you called it 'pleiades')
# look at those columns, which ones have 'chronological' information

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,chronogrp,TEXT,0,,0
2,2,start,INTEGER,0,,0
3,3,end,INTEGER,0,,0


In [17]:
pd.read_sql("""PRAGMA table_info(pleiades)""",conn)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,authors,TEXT,0,,0
2,2,bbox,TEXT,0,,0
3,3,connectsWith,TEXT,0,,0
4,4,created,TEXT,0,,0
5,5,creators,TEXT,0,,0
6,6,currentVersion,INTEGER,0,,0
7,7,description,TEXT,0,,0
8,8,extent,TEXT,0,,0
9,9,featureTypes,TEXT,0,,0


# More mixing of pandas and sql

In [18]:
# worth noting that the result of pd.read_sql statement is a pandas DataFrame
type(pd.read_sql("""SELECT id FROM ramphs WHERE capacity > 20000""",conn))

# this means you can use pandas .groupby on it. This is familiar from last week
pd.read_sql("""SELECT modcountry,capacity FROM ramphs""",conn).groupby('modcountry').count()


Unnamed: 0_level_0,capacity
modcountry,Unnamed: 1_level_1
Albania,1
Algeria,6
Austria,3
Bulgaria,2
Croatia,2
Cyprus,0
France,18
Germany,2
Greece,1
Hungary,0


In [19]:
# "For each amphitheater with a Pleiades ID,
# list that ID's ancient name(s) and chronological information
# First step... what are the steps?"

ramphs['pleiades'] = ramphs['pleiades'].str.replace("http://pleiades.stoa.org/places/","")
ramphs["pleiades"].head(3)

0    893989
1    148217
2    167717
Name: pleiades, dtype: object

In [20]:
ramphs.to_sql('ramphs',conn,if_exists="replace")

In [21]:
pd.read_sql("""
SELECT ramphs.pleiades
FROM ramphs
LIMIT 10
""", conn)

Unnamed: 0,pleiades
0,893989
1,148217
2,167717
3,423025
4,423025
5,423025
6,687854
7,433032
8,256155
9,89304


In [22]:
# At a minimum, write an SQL SELECT query that lists amphitheater IDs that have
# Pleiades identifiers along with the chronological information that pleiades records

pd.read_sql("""
SELECT ramphs.id,pleiades.minDate,pleiades.maxDate
FROM ramphs, pleiades
WHERE ramphs.pleiades = pleiades.id
ORDER BY pleiades.minDate
""", conn)

Unnamed: 0,id,minDate,maxDate
0,pergamumAmphitheater,-1750.0,2000.0
1,antiochAmphitheater,-1750.0,2100.0
2,cyzicusAmphitheater,-1200.0,2000.0
3,arlesAmphitheater,-750.0,2100.0
4,ludusMagnusArena,-750.0,2100.0
5,romeFlavianAmphitheater,-750.0,2100.0
6,romeAmphitheatrumCastrense,-750.0,2100.0
7,pompeiiAmphitheater,-750.0,2100.0
8,conimbrigaAmphitheater,-750.0,640.0
9,cumaeAmphitheater,-750.0,640.0


In [23]:
# MORE INTERESTING AND HARDER: Write an SQL SELECT query that also lists the start column
# from the chronogrps table.
# So four columns at least: ramphs.id, chronogrps.start, pleiades.minDate, pleiades.maxDate

pd.read_sql("""
SELECT ramphs.id, chronogrps.start, pleiades.minDate, pleiades.maxDate
FROM ramphs, pleiades, chronogrps
WHERE (ramphs.pleiades = pleiades.id) AND (ramphs.chronogrp = chronogrps.chronogrp)
ORDER BY chronogrps.start
""", conn)

Unnamed: 0,id,start,minDate,maxDate
0,pompeiiAmphitheater,-70,-750.0,2100.0
1,cumaeAmphitheater,-70,-750.0,640.0
2,pozzuoliEarlyAmphitheater,-70,-750.0,2100.0
3,paestumAmphitheater,-70,-750.0,2100.0
4,avellaAmphitheater,-70,-330.0,640.0
5,ferentoAmphitheater,-70,-550.0,640.0
6,sutriumAmphitheatre,-70,-550.0,640.0
7,telesiaAmphitheatre,-70,-330.0,640.0
8,antiochAmphitheater,-70,-1750.0,2100.0
9,beneventoAmphitheater,-70,-330.0,2100.0


In [24]:
#Foundation of Late Republican Amphitheaters vs Foundation of their Cities

pd.read_sql("""
SELECT ramphs.title, chronogrps.start, chronogrps.end, pleiades.title, pleiades.minDate, pleiades.maxDate
FROM ramphs, pleiades, chronogrps
WHERE (ramphs.pleiades = pleiades.id) AND (ramphs.chronogrp = chronogrps.chronogrp) AND (chronogrps.start >= -70 AND chronogrps.start <= -31)
ORDER BY chronogrps.start
""", conn)



Unnamed: 0,title,start,end,title.1,minDate,maxDate
0,Amphitheater at Abella,-70,-31,Abella,-330.0,640.0
1,Amphitheater at Altripalda,-70,-31,Abellinum,-330.0,640.0
2,Amphitheater at Antioch,-70,-31,Antiochia/Theoupolis,-1750.0,2100.0
3,Amphitheater at Benevento,-70,-31,Beneventum/Maleventum,-330.0,2100.0
4,Amphitheater at Calvi Risorta,-70,-31,Cales,-550.0,640.0
5,Amphitheater at Compsa,-70,-31,Compsa,-550.0,640.0
6,Amphitheater at Cumae,-70,-31,Cumae/Kyme (Campanian),-750.0,640.0
7,Amphitheater at Ferento,-70,-31,Lucus Feroniae,-550.0,640.0
8,Amphitheater at Literno,-70,-31,Liternum,-330.0,640.0
9,Amphitheater at Paestum,-70,-31,Poseidonia/Paestum,-750.0,2100.0


In [27]:
#Difference between the date of the foundation of the city and its amphitheater during this period

df = pd.read_sql("""
SELECT ramphs.title, chronogrps.start, chronogrps.end, pleiades.title, pleiades.minDate, pleiades.maxDate
FROM ramphs, pleiades, chronogrps
WHERE (ramphs.pleiades = pleiades.id) AND (ramphs.chronogrp = chronogrps.chronogrp) AND (chronogrps.start >= -70 AND chronogrps.start <= -31)
ORDER BY chronogrps.start
""", conn)

df['Difference'] = df.start - df.minDate
print(df)

                               title  start  end                      title  \
0             Amphitheater at Abella    -70  -31                     Abella   
1         Amphitheater at Altripalda    -70  -31                  Abellinum   
2            Amphitheater at Antioch    -70  -31       Antiochia/Theoupolis   
3          Amphitheater at Benevento    -70  -31      Beneventum/Maleventum   
4      Amphitheater at Calvi Risorta    -70  -31                      Cales   
5             Amphitheater at Compsa    -70  -31                     Compsa   
6              Amphitheater at Cumae    -70  -31     Cumae/Kyme (Campanian)   
7            Amphitheater at Ferento    -70  -31             Lucus Feroniae   
8            Amphitheater at Literno    -70  -31                   Liternum   
9            Amphitheater at Paestum    -70  -31         Poseidonia/Paestum   
10           Amphitheater at Pompeii    -70  -31                    Pompeii   
11     Amphitheater at Suessa Arunca    -70  -31    

In [28]:
#Average of the difference between the date of the foundation of the city and its amphitheater

df['Difference'].mean()

567.15