# Getting things going and highlighting "the problem"

In [58]:
import pandas as pd
import sqlite3

# Where each CSV lives. The pleiades file is an uncompressed version of
# pleiades-places that I put into my folder on the github repo.
RAMPHS_URL = "https://raw.githubusercontent.com/sfsheath/roman-amphitheaters/master/roman-amphitheaters.csv"
CHRONOGRPS_URL = "https://raw.githubusercontent.com/sfsheath/roman-amphitheaters/master/chronogrps.csv"
PLEIADES_URL = "https://raw.githubusercontent.com/isaw-ga-3024/isaw-ga-3024.github.io/master/heath-sebastian/pleiades-places.csv"

# Each read_csv call downloads the file and returns a pandas DataFrame.
ramphs = pd.read_csv(RAMPHS_URL)
chronogrps = pd.read_csv(CHRONOGRPS_URL)
pleiades = pd.read_csv(PLEIADES_URL)

# One note: because it takes a long time to load the pleiades data,
# avoid running this cell again unless necessary.
# As I look back on class, perhaps this is why Mikael reported
# that his replace was taking a long time. Was the data reloading across the internet?

In [59]:
# ramphs, chronogrps, and pleiades are now pandas DataFrames.
# Print the type of each one, in that order, to confirm it.
for frame in (ramphs, chronogrps, pleiades):
    print(type(frame))

# Again, these are pandas DataFrames, not sql tables!

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [60]:
# OK, to our problem in class.

# Pull out the two columns we would like to match on ...
ramphs_links = ramphs['pleiades']
pleiades_ids = pleiades['id']

# ... and inspect the first two values in both DataFrames
print(ramphs_links.head(2))
print(pleiades_ids.head(2))

# They're different: one holds full URLs, the other bare integers. Bummer.

0    http://pleiades.stoa.org/places/893989
1    http://pleiades.stoa.org/places/148217
Name: pleiades, dtype: object
0    265876
1    265877
Name: id, dtype: int64


# A tiny bit of string manipulation

In [61]:
# str.replace returns a new string; tmp itself is not modified.
greeting_word = "Hello"
tmp = "Hello World"
# Only the matched word is removed, so the space that followed it remains.
tmp.replace(greeting_word, "")

' World'

# A little bit of pandas dataframe manipulation

In [62]:
# first: display the first two rows of ramphs, with every column
ramphs.head(2)

Unnamed: 0,id,title,label,pleiades,type,capacity,modcountry,chronogrp,certainty,youtube,extmajor,extminor,arenamajor,arenaminor,latitude,longitude,elevation
0,duraEuroposAmphitheater,Amphitheater at Dura Europos,Dura,http://pleiades.stoa.org/places/893989,amphitheater,1000,Syria,Severan,,,50,44,31,25,34.749855,40.728926,223
1,arlesAmphitheater,Amphitheater at Arles,Arles,http://pleiades.stoa.org/places/148217,amphitheater,20000,France,Flavian,,https://www.youtube.com/watch?v=oCz-76hb1LU,136,107,47,32,43.677778,4.631111,21


In [63]:
# then: select the single 'pleiades' column and display its first two values
ramphs['pleiades'].head(2)

# So it's easy to output either all columns, or just a selection.
# And it's likely that your notebook is trying to nicely format the output of ramphs.head(2).

0    http://pleiades.stoa.org/places/893989
1    http://pleiades.stoa.org/places/148217
Name: pleiades, dtype: object

In [64]:
# It's easy to create new columns: assigning to a column label that does
# not exist yet adds that column to the DataFrame.
ramphs['new'] = "test"
# Display the first five values of the new column.
ramphs['new'].head(5)

# And look!!!! Very important to note that pandas will automatically assign "test" to all rows.
# That's one of its great powers.

0    test
1    test
2    test
3    test
4    test
Name: new, dtype: object

In [65]:
# combining pandas and string manipulation
# Do note the ".str" before ".replace": it applies the string method to every value.
# regex=False makes this a plain literal replacement, so the result does not
# depend on the pandas version's default for the `regex` argument.
ramphs['new'].str.replace("e", "", regex=False).head(5)

# Again, pandas calculates the result for all rows.
# Nothing is assigned here, so ramphs['new'] itself is unchanged.

0    tst
1    tst
2    tst
3    tst
4    tst
Name: new, dtype: object

In [66]:
# here's a useful construct, though dangerous in that
# it changes data (a concern some of you raised in class):
# the result of the replace is assigned back over the same column.

# regex=False makes this a plain literal replacement, so the result does not
# depend on the pandas version's default for the `regex` argument.
ramphs['new'] = ramphs['new'].str.replace("t", "", regex=False)
ramphs['new'].head(5)

0    es
1    es
2    es
3    es
4    es
Name: new, dtype: object

# Review of adding tables to an sql database (to keep our terminology straight)

In [67]:
# ':memory:' asks sqlite3 for a database held in memory, meaning it won't be saved.
IN_MEMORY_DB = ':memory:'

# The conn variable is a 'database connection'; later cells pass it to
# to_sql and read_sql.
conn = sqlite3.connect(IN_MEMORY_DB)

# As a convenience, I've put the above in a separate cell. You may not need to run it again.


In [68]:
# The following two lines create sql tables from pandas DataFrames.
# if_exists="replace" overwrites a table of the same name, so this cell can be re-run.
ramphs.to_sql('ramphs',conn,if_exists="replace")
chronogrps.to_sql('chronogrps', conn, if_exists="replace")

# Exercise: copy-paste, then edit one of the above lines to make a pleiades table.


In [69]:
# Let's test that we have sql tables by querying one of them.
large_venues_sql = """SELECT id,pleiades FROM RAMPHS WHERE capacity > 40000"""
pd.read_sql(large_venues_sql, conn)

# But that only confirms all is working well for the ramphs table.
# Good to test the others as well.

Unnamed: 0,id,pleiades
0,romeFlavianAmphitheater,http://pleiades.stoa.org/places/423025


In [87]:
# A reminder: PRAGMA table_info lists the columns of a table, here the
# pleiades table (assuming you called it 'pleiades').
table_info_sql = """PRAGMA table_info(pleiades)"""
pd.read_sql(table_info_sql, conn)
# Look at those columns: which ones have 'chronological' information?

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,authors,TEXT,0,,0
2,2,bbox,TEXT,0,,0
3,3,connectsWith,TEXT,0,,0
4,4,created,TEXT,0,,0
5,5,creators,TEXT,0,,0
6,6,currentVersion,INTEGER,0,,0
7,7,description,TEXT,0,,0
8,8,extent,TEXT,0,,0
9,9,featureTypes,TEXT,0,,0


# More mixing of pandas and sql

In [71]:
# Worth noting that the result of a pd.read_sql statement is a pandas DataFrame.
# A notebook only displays the value of a cell's LAST expression, so the type
# is printed explicitly here; as a bare expression in the middle of the cell
# it would be evaluated and then silently discarded.
print(type(pd.read_sql("""SELECT id FROM ramphs WHERE capacity > 20000""",conn)))

# This means you can use pandas .groupby on it. This is familiar from last week.
# .count() counts the non-null capacity values for each modcountry.
pd.read_sql("""SELECT modcountry,capacity FROM ramphs""",conn).groupby('modcountry').count()


Unnamed: 0_level_0,capacity
modcountry,Unnamed: 1_level_1
Albania,1
Algeria,6
Austria,3
Bulgaria,2
Croatia,2
Cyprus,0
France,18
Germany,2
Greece,1
Hungary,0


In [74]:
import pandas as pd
import sqlite3

# Re-read all three CSV files from GitHub. This rebinds ramphs, chronogrps and
# pleiades to freshly loaded DataFrames, so any column added to the old frames
# in earlier cells (such as the demo 'new' column) is not present afterwards.
# NOTE(review): the pleiades file is slow to download -- only re-run when a clean copy is needed.
ramphs = pd.read_csv("https://raw.githubusercontent.com/sfsheath/roman-amphitheaters/master/roman-amphitheaters.csv")
chronogrps = pd.read_csv("https://raw.githubusercontent.com/sfsheath/roman-amphitheaters/master/chronogrps.csv")
pleiades = pd.read_csv("https://raw.githubusercontent.com/isaw-ga-3024/isaw-ga-3024.github.io/master/heath-sebastian/pleiades-places.csv")

In [75]:
# Write each DataFrame to an sql table of the same name on the conn connection.
# if_exists="replace" overwrites any table already there, so re-running is safe.
ramphs.to_sql('ramphs',conn,if_exists="replace")
chronogrps.to_sql('chronogrps', conn, if_exists="replace")
pleiades.to_sql('pleiades',conn,if_exists="replace")

In [83]:
# Strip the URL prefix so ramphs.pleiades holds only the bare Pleiades id.
# regex=False treats the prefix as a literal string: read as a regular
# expression, every "." in the URL would match ANY character, and the default
# for `regex` differs between pandas versions.
ramphs['pleiades'] = ramphs['pleiades'].str.replace("http://pleiades.stoa.org/places/", "", regex=False)

# Rewrite the sql table so it picks up the cleaned column.
ramphs.to_sql('ramphs',conn,if_exists="replace")

# Look at the two id columns side by side. There is no join condition yet,
# so rows of the two tables are simply paired up; LIMIT keeps the output short.
pd.read_sql("""SELECT ramphs.pleiades, pleiades.id FROM ramphs, pleiades LIMIT 6""", conn)

Unnamed: 0,pleiades,id
0,893989,265876
1,893989,265877
2,893989,265878
3,893989,265880
4,893989,265882
5,893989,265883


In [84]:
# Match each amphitheater to its Pleiades place via the cleaned id column and
# list the place's title and date range, earliest minDate first.
place_dates_sql = """SELECT ramphs.id, pleiades.title, pleiades.minDate, pleiades.maxDate 
FROM pleiades, ramphs
WHERE pleiades.id = ramphs.pleiades
ORDER BY pleiades.minDate"""
pd.read_sql(place_dates_sql, conn)

Unnamed: 0,id,title,minDate,maxDate
0,pergamumAmphitheater,Pergamum,-1750,2000
1,antiochAmphitheater,Antiochia/Theoupolis,-1750,2100
2,palmyraAmphitheater,Palmyra,-1600,2100
3,bostraAmphitheater,Bostra/Col. Nova Traiana Alexandriana,-1344,2000
4,cyzicusAmphitheater,Cyzicus,-1200,2000
5,arlesAmphitheater,Theline/Col. Arelate/Constantina,-750,2100
6,ludusMagnusArena,Roma,-750,2100
7,romeFlavianAmphitheater,Roma,-750,2100
8,romeAmphitheatrumCastrense,Roma,-750,2100
9,pompeiiAmphitheater,Pompeii,-750,2100


In [78]:
# Same join as before, now also matching each amphitheater's chronogrp to the
# chronogrps table so the group's start date can be shown and sorted on.
dated_places_sql = """SELECT ramphs.id, chronogrps.start, pleiades.title, pleiades.minDate, pleiades.maxDate 
FROM pleiades, ramphs, chronogrps 
WHERE (pleiades.id = ramphs.pleiades) AND (ramphs.chronogrp = chronogrps.chronogrp)
ORDER BY chronogrps.start""", conn
pd.read_sql(*dated_places_sql)

Unnamed: 0,id,start,title,minDate,maxDate
0,pompeiiAmphitheater,-70,Pompeii,-750,2100
1,cumaeAmphitheater,-70,Cumae/Kyme (Campanian),-750,640
2,pozzuoliEarlyAmphitheater,-70,Dikaiarcheia/Puteoli,-750,2100
3,paestumAmphitheater,-70,Poseidonia/Paestum,-750,2100
4,avellaAmphitheater,-70,Abella,-330,640
5,ferentoAmphitheater,-70,Lucus Feroniae,-550,640
6,sutriumAmphitheatre,-70,Sutrium,-550,640
7,telesiaAmphitheatre,-70,Telesia,-330,640
8,antiochAmphitheater,-70,Antiochia/Theoupolis,-1750,2100
9,beneventoAmphitheater,-70,Beneventum/Maleventum,-330,2100


In [86]:
# The three-table join, kept in a variable so the query reads separately from its use.
ramphs2_sql = """SELECT ramphs.id, chronogrps.start, pleiades.title, pleiades.minDate, pleiades.maxDate 
FROM pleiades, ramphs, chronogrps 
WHERE (pleiades.id = ramphs.pleiades) AND (ramphs.chronogrp = chronogrps.chronogrp)
ORDER BY chronogrps.start"""

# Run it once and keep the result as a DataFrame ...
ramphs2 = pd.read_sql(ramphs2_sql, conn)
# ... then store that result as its own sql table, replacing any earlier copy.
ramphs2.to_sql("ramphs2", conn, if_exists='replace')
# Display up to the first 230 rows.
# NOTE(review): 230 presumably exceeds the row count so everything is shown -- confirm.
ramphs2.head(230)

Unnamed: 0,id,start,title,minDate,maxDate
0,pompeiiAmphitheater,-70,Pompeii,-750,2100
1,cumaeAmphitheater,-70,Cumae/Kyme (Campanian),-750,640
2,pozzuoliEarlyAmphitheater,-70,Dikaiarcheia/Puteoli,-750,2100
3,paestumAmphitheater,-70,Poseidonia/Paestum,-750,2100
4,avellaAmphitheater,-70,Abella,-330,640
5,ferentoAmphitheater,-70,Lucus Feroniae,-550,640
6,sutriumAmphitheatre,-70,Sutrium,-550,640
7,telesiaAmphitheatre,-70,Telesia,-330,640
8,antiochAmphitheater,-70,Antiochia/Theoupolis,-1750,2100
9,beneventoAmphitheater,-70,Beneventum/Maleventum,-330,2100


# What if I don't accept the assignment?