# Getting things going and highlighting "the problem"

In [1]:
import pandas as pd
import sqlite3

# Both amphitheater tables come from the same repository, so share the URL prefix.
AMPHITHEATERS_REPO = "https://raw.githubusercontent.com/sfsheath/roman-amphitheaters/master/"
ramphs = pd.read_csv(AMPHITHEATERS_REPO + "roman-amphitheaters.csv")
chronogrps = pd.read_csv(AMPHITHEATERS_REPO + "chronogrps.csv")

# An uncompressed copy of pleiades-places lives in my folder on the github repo.
PLEIADES_URL = "https://raw.githubusercontent.com/isaw-ga-3024/isaw-ga-3024.github.io/master/heath-sebastian/pleiades-places.csv"
pleiades = pd.read_csv(PLEIADES_URL)

# One note: the pleiades data takes a long time to load,
# so avoid running this cell again unless necessary.
# As I look back on class, perhaps this is why Mikael reported
# that his replace was taking a long time. Was the data reloading across the internet?

In [2]:
# Report the type bound to each of the three names loaded above.
# Again, these are pandas DataFrames, not sql tables!
for frame in (ramphs, chronogrps, pleiades):
    print(type(frame))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [3]:
# OK, to our problem in class.

# Inspect the first two identifier values held by each DataFrame.
for identifiers in (ramphs['pleiades'], pleiades['id']):
    print(identifiers.head(2))

# They're different. Bummer.

0    http://pleiades.stoa.org/places/893989
1    http://pleiades.stoa.org/places/148217
Name: pleiades, dtype: object
0    265876
1    265877
Name: id, dtype: int64


# A tiny bit of string manipulation

In [4]:
tmp = "Hello World"
# str.replace returns a new string with every occurrence of the first argument
# swapped for the second. Only "Hello" is matched here, so the space that
# followed it is NOT removed.
tmp.replace("Hello", "")

' World'

# A little bit of pandas dataframe manipulation

In [5]:
# first: the leading two rows, with every column
ramphs.head(n=2)

Unnamed: 0,id,title,label,pleiades,type,capacity,modcountry,chronogrp,certainty,youtube,extmajor,extminor,arenamajor,arenaminor,latitude,longitude,elevation
0,duraEuroposAmphitheater,Amphitheater at Dura Europos,Dura,http://pleiades.stoa.org/places/893989,amphitheater,1000.0,Syria,Severan,,,50.0,44.0,31.0,25.0,34.749855,40.728926,223
1,arlesAmphitheater,Amphitheater at Arles,Arles,http://pleiades.stoa.org/places/148217,amphitheater,20000.0,France,Flavian,,https://www.youtube.com/watch?v=oCz-76hb1LU,136.0,107.0,47.0,32.0,43.677778,4.631111,21


In [6]:
# then: the same two rows, restricted to the single 'pleiades' column
ramphs.loc[:, 'pleiades'].head(n=2)

# So it's easy to output either all columns, or just a selection.
# And it's likely that your notebook is trying to nicely format the output of ramphs.head(2).

0    http://pleiades.stoa.org/places/893989
1    http://pleiades.stoa.org/places/148217
Name: pleiades, dtype: object

In [8]:
# it's easy to create new columns
ramphs['new'] = "test"

# A bare expression in the middle of a cell is evaluated but never displayed
# (only the cell's last expression is), so print the new column explicitly,
# the same way the earlier cells print a Series.
print(ramphs['new'].head(5))

# And look!!!! Very important to note that pandas will automatically assign "test" to all rows.
# That's one of its great powers.
ramphs.head(2)

Unnamed: 0,id,title,label,pleiades,type,capacity,modcountry,chronogrp,certainty,youtube,extmajor,extminor,arenamajor,arenaminor,latitude,longitude,elevation,new
0,duraEuroposAmphitheater,Amphitheater at Dura Europos,Dura,http://pleiades.stoa.org/places/893989,amphitheater,1000.0,Syria,Severan,,,50.0,44.0,31.0,25.0,34.749855,40.728926,223,test
1,arlesAmphitheater,Amphitheater at Arles,Arles,http://pleiades.stoa.org/places/148217,amphitheater,20000.0,France,Flavian,,https://www.youtube.com/watch?v=oCz-76hb1LU,136.0,107.0,47.0,32.0,43.677778,4.631111,21,test


In [9]:
# combining pandas and string manipulation
# (do note the ".str" accessor that comes before ".replace")
new_without_e = ramphs['new'].str.replace("e","")
new_without_e.head(5)

# Again, pandas calculates the result for all rows

0    tst
1    tst
2    tst
3    tst
4    tst
Name: new, dtype: object

In [10]:
# Here's a useful construct, though a dangerous one: assigning the result back
# to the same column changes the data (a concern some of you raised in class).
new_without_t = ramphs['new'].str.replace("t","")
ramphs['new'] = new_without_t
ramphs['new'].head(5)

0    es
1    es
2    es
3    es
4    es
Name: new, dtype: object

# Review of adding tables to an sql database (to keep our terminology straight)

In [11]:
# ':memory:' makes the database in memory, meaning it won't be saved.
IN_MEMORY_DB = ':memory:'

# The conn variable is the 'database connection' handed to pandas in later cells.
conn = sqlite3.connect(IN_MEMORY_DB)

# As a convenience, this lives in its own cell. You may not need to run it again.


In [18]:
# The next two statements create sql tables from pandas DataFrames;
# if_exists="replace" overwrites a table of the same name if one is already there.
ramphs.to_sql(name='ramphs', con=conn, if_exists="replace")
chronogrps.to_sql(name='chronogrps', con=conn, if_exists="replace")

# copy-paste, then edit one of the above lines to make a pleiades table
pleiades.to_sql(name='pleiades', con=conn, if_exists="replace")

In [24]:
# let's test that we have sql tables
# Only the last expression of a cell is displayed automatically, so this first
# result is printed explicitly; otherwise the ramphs check runs but shows nothing.
print(pd.read_sql("""SELECT id,pleiades FROM ramphs WHERE capacity > 40000""",conn))

# but that only confirms all is well working for the ramphs table.
# Good to test others as well.
pd.read_sql("""SELECT chronogrp FROM chronogrps""",conn)

Unnamed: 0,chronogrp
0,Republican
1,Caesarean
2,Augustan
3,Julio-Claudian
4,Neronian
5,Flavian
6,First Century
7,Late1stEarly2nd
8,Hadrianic
9,Second Century


In [16]:
# A reminder: PRAGMA table_info displays the columns of a table
# (here the pleiades table, assuming you called it 'pleiades').
# Look at those columns: which ones have 'chronological' information?
pleiades_columns_sql = """PRAGMA table_info(pleiades)"""
pd.read_sql(pleiades_columns_sql, con=conn)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,authors,TEXT,0,,0
2,2,bbox,TEXT,0,,0
3,3,connectsWith,TEXT,0,,0
4,4,created,TEXT,0,,0
5,5,creators,TEXT,0,,0
6,6,currentVersion,INTEGER,0,,0
7,7,description,TEXT,0,,0
8,8,extent,TEXT,0,,0
9,9,featureTypes,TEXT,0,,0


# More mixing of pandas and sql

In [26]:
# worth noting that the result of a pd.read_sql statement is a pandas DataFrame
# (printed explicitly: a bare expression that is not the last line of the cell
# would be evaluated and then silently discarded)
print(type(pd.read_sql("""SELECT id FROM ramphs WHERE capacity > 20000""",conn)))

# this means you can use pandas .groupby on it. This is familiar from last week
pd.read_sql("""SELECT modcountry,capacity FROM ramphs""",conn).groupby('modcountry').count()


Unnamed: 0_level_0,capacity
modcountry,Unnamed: 1_level_1
Albania,1
Algeria,6
Austria,3
Bulgaria,2
Croatia,2
Cyprus,0
France,18
Germany,2
Greece,1
Hungary,0


In [21]:
# You now have all the pieces to move towards completing and going beyond
# the task I set in class. The task was:

# "For each amphitheater with a Pleiades ID,
# list that ID's ancient name(s) and chronological information
# First step... what are the steps?"

# At a minimum, write an SQL SELECT query that lists amphitheater IDs that have
# Pleiades identifiers along with the chronological information that pleiades records

# MORE INTERESTING AND HARDER: Write an SQL SELECT query that also lists the start column
# from the chronogrps table.
# So four columns at least: ramphs.id, chronogrps.start, pleiades.minDate, pleiades.maxDate

# once you do this it's easy to imagine a visualization of the difference between the founding
# of a city as indicated by pleiades and the construction of its amphitheater.
# A histogram of this would be nice. I'd calculate the difference using pandas.
# (Or rather, that's how I did it.)

# Or think of some other manipulation that goes a step beyond the "at a minimum" query.


# Finally, as you ponder how to do the above, think about the conceptual range of DH as 
# found in this week's readings and as experienced while manipulating data in this
# ipython notebook. It's all DH and we love it all. How can theory and practice fit together?
# Which of the other readings we've been assigned are relevant? This is just fodder for conversation.

In [192]:
import pandas as pd
import sqlite3

# Self-contained answer cell: it builds a fresh in-memory database and reloads
# all three CSV files, so it does not depend on the state left by earlier cells.
conn = sqlite3.connect(':memory:')
ramphs = pd.read_csv("https://raw.githubusercontent.com/sfsheath/roman-amphitheaters/master/roman-amphitheaters.csv")
chronogrps = pd.read_csv("https://raw.githubusercontent.com/sfsheath/roman-amphitheaters/master/chronogrps.csv")
pleiades = pd.read_csv("https://raw.githubusercontent.com/isaw-ga-3024/isaw-ga-3024.github.io/master/heath-sebastian/pleiades-places.csv")

# Strip the URL prefix so ramphs.pleiades holds just the identifier digits that
# pleiades.id uses. regex=False makes this a literal substring replacement on
# every pandas version; without it, older versions treat the text as a regular
# expression in which each "." matches any character.
ramphs['pleiades'] = ramphs['pleiades'].str.replace("http://pleiades.stoa.org/places/", "", regex=False)

ramphs.to_sql('ramphs',conn,if_exists="replace")
chronogrps.to_sql('chronogrps', conn, if_exists="replace")
pleiades.to_sql('pleiades',conn,if_exists="replace")

# Scratch queries used while building up to the final join (kept for reference):
#pd.read_sql("""SELECT id, pleiades FROM ramphs WHERE pleiades IS NOT NULL""",conn)
#ramphs.head(2)
#pd.read_sql("""SELECT chronogrp FROM ramphs LEFT OUTER JOIN chronogrps on ramphs.chronogrp=chronogrps.chronogrp""", conn)

#pd.read_sql("""SELECT id, chronogrp FROM ramphs INNER JOIN chronogrps ON ramphs.chronogrp=chronogrps.chronogrp""",conn)
#pd.read_sql("""SELECT ramphs.id,pleiades.minDate,pleiades.maxDate FROM ramphs, pleiades WHERE ramphs.pleiades = pleiades.id ORDER BY pleiades.minDate""", conn)

# One row per amphitheater that has both a matching Pleiades place and a
# matching chronological group, ordered by the group's start value.
# Explicit JOIN ... ON keeps each join condition next to the table it links,
# so a forgotten condition cannot silently turn into a cartesian product.
# NOTE(review): ramphs.pleiades is text while pleiades.id is an integer column;
# the join relies on SQLite converting between them when comparing -- confirm if
# this is ever moved to a different database engine.
pd.read_sql("""
SELECT ramphs.id, ramphs.title, ramphs.chronogrp, chronogrps.start,
       pleiades.minDate, pleiades.maxDate
FROM ramphs
JOIN chronogrps ON ramphs.chronogrp = chronogrps.chronogrp
JOIN pleiades ON ramphs.pleiades = pleiades.id
ORDER BY chronogrps.start
""", conn)

Unnamed: 0,id,title,chronogrp,start,minDate,maxDate
0,pompeiiAmphitheater,Amphitheater at Pompeii,Republican,-70,-750.0,2100.0
1,cumaeAmphitheater,Amphitheater at Cumae,Republican,-70,-750.0,640.0
2,pozzuoliEarlyAmphitheater,Early Amphitheater at Pozzuoli,Republican,-70,-750.0,2100.0
3,paestumAmphitheater,Amphitheater at Paestum,Republican,-70,-750.0,2100.0
4,avellaAmphitheater,Amphitheater at Abella,Republican,-70,-330.0,640.0
5,ferentoAmphitheater,Amphitheater at Ferento,Republican,-70,-550.0,640.0
6,sutriumAmphitheatre,Amphitheater at Sutrium,Republican,-70,-550.0,640.0
7,telesiaAmphitheatre,Amphitheater at Telesia,Republican,-70,-330.0,640.0
8,antiochAmphitheater,Amphitheater at Antioch,Republican,-70,-1750.0,2100.0
9,beneventoAmphitheater,Amphitheater at Benevento,Republican,-70,-330.0,2100.0


# What if I don't accept the assignment?

In [191]:
# That's OK. You can always use your own data or data you download. But you need to turn in
# an iPython notebook that demonstrates skills of approximately the same level of difficulty
# as what is above. At a minimum: a many-to-one or one-to-many relationship that is queried
# via SQL and then used to say something minimally interesting. The usual "go for it" applies.
import pandas as pd
import sqlite3

import unicodedata  # stdlib; used only by this cell's Greek-text comparison

conn = sqlite3.connect(':memory:')
anger = pd.read_csv("https://raw.githubusercontent.com/isaw-ga-3024/isaw-ga-3024.github.io/master/rebekah-rust-rebekahrust/Anger%20Database.csv")
play_info = pd.read_csv("https://raw.githubusercontent.com/isaw-ga-3024/isaw-ga-3024.github.io/master/rebekah-rust-rebekahrust/play-info.csv")

anger.to_sql('anger',conn,if_exists="replace")
play_info.to_sql('play_info',conn,if_exists="replace")


def _nfc(text):
    """Return `text` in Unicode NFC form; non-string values (e.g. None) pass through."""
    return unicodedata.normalize("NFC", text) if isinstance(text, str) else text


# Make the normaliser callable from SQL as nfc(...). SQLite compares text
# exactly, so visually identical Greek strings built from different code points
# only match once both sides are normalised the same way.
conn.create_function("nfc", 1, _nfc)


#pd.read_sql("""SELECT play_info.title,anger.line_number,play_info.date
#FROM play_info,anger
#WHERE (anger.playwright='sophocles')
#AND (play_info.playwright='sophocles')
#AND (play_info.date<=431)
#AND (play_info.date>=404)
#AND(play_info.prize=1)
#""",conn)


def plays_in_period(earliest=None, latest=None):
    """Return the distinct (title, playwright, date) rows of play_info in a date range.

    The dates appear to be years BCE (the "pre-war" selection below is
    date >= 431), so a larger number is an earlier year.

    Parameters
    ----------
    earliest : number, optional
        Keep plays with date <= earliest (inclusive). None means no bound.
    latest : number, optional
        Keep plays with date >= latest (inclusive). None means no bound.

    Returns
    -------
    pandas.DataFrame
        Columns title, playwright, date.
    """
    conditions = []
    bounds = []
    if earliest is not None:
        conditions.append("date <= ?")
        bounds.append(earliest)
    if latest is not None:
        conditions.append("date >= ?")
        bounds.append(latest)
    # Only play_info is queried: these selections use no column from anger, and
    # listing anger in FROM without a join condition pairs every play with every
    # anger row before DISTINCT collapses the duplicates again.
    query = "SELECT DISTINCT title, playwright, date FROM play_info"
    if conditions:
        query += " WHERE " + " AND ".join(conditions)
    return pd.read_sql(query, conn, params=bounds)


# The period boundaries are inclusive on both sides, as in the original queries,
# so a play dated exactly on a boundary year appears in both adjacent periods.
pre_pelop_war_plays = plays_in_period(latest=431)
archidamian_war_plays = plays_in_period(earliest=431, latest=421)
peace_of_nicias_plays = plays_in_period(earliest=421, latest=413)
second_pelop_war_plays = plays_in_period(earliest=413, latest=404)
post_pelop_war_plays = plays_in_period(earliest=404)


# Occurrences of the lemma in Sophocles' plays whose prize value is 1.
# - The JOIN conditions tie each anger row to its own play; without them every
#   anger row is paired with every matching play.
# - The lemma is passed as a bound parameter rather than a double-quoted SQL
#   token (double quotes are identifier syntax in SQL), and both sides are
#   NFC-normalised before comparing.
# NOTE(review): a code-point mismatch is the presumed reason the original
# comparison returned no rows -- confirm against the CSV.
pd.read_sql("""SELECT anger.word, play_info.title, anger.line_number, play_info.date
FROM anger
JOIN play_info
  ON play_info.playwright = anger.playwright
 AND play_info.title = anger.play
WHERE play_info.playwright = 'sophocles'
  AND play_info.prize = 1
  AND nfc(anger.lexical_reference_form) = ?
ORDER BY play_info.date
""", conn, params=[_nfc("θυμός")])



# NOTE(review): the truncated text shown below this cell looks like the tail of
# a warning echoed from inside pandas rather than an error (the cell still
# produced a result). Presumably it was raised by one of the to_sql calls above;
# re-run the cell to see the full message.

  chunksize=chunksize, dtype=dtype)


Unnamed: 0,word,title,line_number,date


In [182]:
# The original version of this cell compared lexical_reference_form with the
# literal "θυμός" inside the SQL and returned nothing. SQLite does store and
# compare Unicode text, but the comparison is exact: visually identical Greek
# strings built from different code points (precomposed vs. combining accents,
# tonos vs. oxia) are not equal. The lemma filter is therefore applied in pandas
# after normalising both sides to NFC.
# NOTE(review): that mismatch is the presumed cause of the empty result --
# confirm against the CSV.
# I'm going to read up on how the Classical Toolkit can help my unicode issues this week.
import unicodedata  # stdlib; needed only for the normalisation below

# Every anger word from plays dated between 413 and 404 (inclusive), before
# filtering by lemma.
second_war_anger = pd.read_sql("""SELECT anger.play,anger.line_number,anger.lexical_reference_form,play_info.playwright,play_info.date
FROM play_info,anger
WHERE (play_info.playwright=anger.playwright)
AND (anger.play=play_info.title)
AND (play_info.date<=413
AND (play_info.date>=404))
""",conn)

lemma = unicodedata.normalize("NFC", "θυμός")
# Non-string cells (missing values) are left untouched and never equal the lemma.
normalised_forms = second_war_anger['lexical_reference_form'].map(
    lambda form: unicodedata.normalize("NFC", form) if isinstance(form, str) else form
)
second_war_anger[normalised_forms == lemma].reset_index(drop=True)

Unnamed: 0,play,line_number,lexical_reference_form,playwright,date


In [167]:
# Anger words from Sophocles' plays whose prize value is at least 1, limited to
# line numbers up to 250 and ordered by the play's date value.
sophocles_early_lines_sql = """SELECT anger.word,anger.play,anger.line_number,play_info.date, play_info.prize
FROM play_info,anger
WHERE (play_info.playwright=anger.playwright)
AND (play_info.playwright='sophocles')
AND (anger.play=play_info.title)
AND (play_info.prize>=1)
AND (anger.line_number<=250)
ORDER BY (play_info.date)
"""
pd.read_sql(sophocles_early_lines_sql, con=conn)

Unnamed: 0,word,play,line_number,date,prize
0,δυσθύμῳ,elektra,218,418.0,2.0
1,θυμὸν,elektra,26,418.0,2.0
2,πρόθυμος,elektra,3,418.0,2.0
3,χόλον,elektra,176,418.0,2.0
4,ὀργά,elektra,222,418.0,2.0
5,θυμοῦ,oedipus_tyrannus,244,429.0,2.0
6,προθυμίας,oedipus_tyrannus,48,429.0,2.0
7,θυμοφθορῶ,trachiniae,142,437.0,2.0
8,ἐνθυμίοις,trachiniae,109,437.0,2.0
9,ἀθυμίαν,antigone,237,442.0,1.0


In [179]:
# Anger words from tragedies whose play is dated between 421 and 413 (inclusive),
# ordered by the play's date value.
nicias_tragedy_sql = """SELECT anger.word,anger.play,anger.line_number,play_info.date
FROM play_info,anger
WHERE (play_info.playwright=anger.playwright)
AND (anger.play=play_info.title)
AND (anger.genre='tragedy')
AND (play_info.date<=421)
AND (play_info.date>=413)
ORDER BY (play_info.date)
"""
pd.read_sql(nicias_tragedy_sql, con=conn)

Unnamed: 0,word,play,line_number,date
0,ὀργὴ,iphigenia_at_taurus,987,414.0
1,προθυμία,iphigenia_at_taurus,616,414.0
2,πρόθυμος,iphigenia_at_taurus,910,414.0
3,πρόθυμον,iphigenia_at_taurus,989,414.0
4,θυμουμένη,iphigenia_at_taurus,993,414.0
5,πρόθυμον,iphigenia_at_taurus,1023,414.0
6,θυμοῦ,iphigenia_at_taurus,1474,414.0
7,θυμοῦμαι,iphigenia_at_taurus,1478,414.0
8,μῆνιν,iphigenia_at_taurus,1272,414.0
9,χόλον,iphigenia_at_taurus,1439,414.0
