# Startup

In [None]:
# let python know you want to use the pandas library
# (pathlib is only used to check whether a local copy of the data exists)
from pathlib import Path

import pandas as pd

# The canonical copy of the data on GitHub, and an optional local checkout.
RAMPHS_URL = "https://raw.githubusercontent.com/sfsheath/roman-amphitheaters/master/roman-amphitheaters.csv"
RAMPHS_LOCAL = Path("/Users/sfsh/Documents/roman-amphitheaters/roman-amphitheaters.csv")

# Prefer the local file when it is present; otherwise load the csv directly
# from github, so the notebook also runs on machines without that checkout.
ramphs = pd.read_csv(str(RAMPHS_LOCAL) if RAMPHS_LOCAL.exists() else RAMPHS_URL)

# confirm that all is well by displaying first two rows
ramphs.head(2)


# Simple Group By

In [None]:
# Very simple group by: one row per modern country, each numeric column summed.
# numeric_only=True is spelled out because pandas 2.x no longer drops the text
# columns on its own; without it the sum is also attempted on those columns.
# It makes no sense to sum the latitude and longitude, but nor is any harm done.
modcountries = ramphs.groupby('modcountry').sum(numeric_only=True)

# output the results into this notebook
modcountries


# Write to CSV

In [None]:
# You can do this in one line: group, sum the numeric columns, write the file.
# numeric_only=True keeps the text columns out of the sum on pandas 2.x.

ramphs.groupby('modcountry').sum(numeric_only=True).to_csv('modcountries01.csv')

# Or you can do it as a continuation of the above cells, reusing the
# `modcountries` data frame computed there

modcountries.to_csv('modcountries02.csv')


# The above two lines of python code should produce the same output, but in two differently named files.
# Look for the files on your computer.

# At this point you may have enough to go forth and make some charts in raw or plotly.
# But perhaps you want to go further with pandas. Some ideas...

# Towards further steps

In [None]:
# For every row, test whether the text in the 'chronogrp' column contains the
# characters "Republican", and print the resulting series of True/False values.
print(
    ramphs.loc[:, 'chronogrp'].str.contains(pat='Republican')
)


In [None]:
# the next line will make a new column in ramphs and fill it with the output of .contains("Republican") as seen above

ramphs['republican'] = ramphs['chronogrp'].str.contains('Republican')

# we can group by and sum on the 'republican' column.
# numeric_only=True restricts the sum to the numeric columns; pandas 2.x
# would otherwise try to sum the text columns as well.

ramphs.groupby('republican').sum(numeric_only=True)

# What percentage of amphitheater seats were built after the Republican era (the Republican era being the period before 31 BC)?
# Though do remember that I have a "Julius Caesar" chronogrp. That's technically "Republican".
# Even worse... I also have a "Caesarean" chronogrp. I need to fix that...
# Dealing with such issues will be a topic next week.
# And yes, we don't always know amphitheater capacity, and when we do, it's actually an estimate subject to debate...

# IMPORTANT: You can now write this file to a csv and make plots in raw or plotly

# How about this?

In [None]:
# There are many, many interesting directions to take such exploratory analysis. Here's one

# execute the following line to group by modcountry, then by chronogrp, then to sum.
# The two keys are passed as a list: pandas treats a tuple as ONE key (a single
# column label), which fails here. numeric_only=True limits the sum to the
# numeric columns, which pandas 2.x no longer does by default.
ramphs.groupby(["modcountry", "chronogrp"]).sum(numeric_only=True)


In [None]:
# when you write this to a csv, all cells are usefully filled in. Run that through raw or plotly and see what you can do.
# "Circle Packing" in raw might work well. Though perhaps try reversing "modcountry" and "chronogrp".
# The keys are given as a list because pandas reads a tuple as one single key;
# numeric_only=True keeps the text columns out of the sum on pandas 2.x.
ramphs.groupby(["modcountry", "chronogrp"]).sum(numeric_only=True).to_csv("two-level-sum.csv")


In [None]:
import sqlite3

# Open a throwaway SQLite database that lives only in memory.
conn = sqlite3.connect(':memory:')

# Copy the ramphs data frame into a table named 'ramphs', overwriting the
# table if it is already there (so the cell can be re-run).
ramphs.to_sql(name='ramphs', con=conn, if_exists="replace")

In [None]:
# Look up the chronogrp of a single amphitheater.
# The filter is on the `id` column: 'syracuseAmphitheater' looks like an
# amphitheater identifier, not a chronogrp label, so comparing it against
# `chronogrp` could never match a row.
# NOTE(review): assumes ids look like 'syracuseAmphitheater' - confirm against the csv.
for row in conn.execute("SELECT chronogrp FROM ramphs WHERE id = 'syracuseAmphitheater'"):
    print(row)

# Print the id and chronogrp of every row in the ramphs table.
for row in conn.execute("SELECT id,chronogrp FROM ramphs"):
    print(row)

In [None]:
# Load the chronogrps lookup table and copy it into the in-memory database
# next to the ramphs table (overwriting it if the cell is re-run).
chronogrps = pd.read_csv("/Users/sfsh/Documents/roman-amphitheaters/chronogrps.csv")
chronogrps.head(2)
chronogrps.to_sql(name='chronogrps', con=conn, if_exists="replace")

# Join written with a comma-separated FROM list and a WHERE condition:
# only rows whose chronogrp is found in both tables are returned.
inner_join_sql = (
    "SELECT id,chronogrps.start \n"
    "FROM ramphs,chronogrps\n"
    "WHERE ramphs.chronogrp = chronogrps.chronogrp\n"
    "ORDER BY chronogrps.start"
)

# Same columns, but every ramphs row is kept even when its chronogrp has no
# match in chronogrps.
left_join_sql = (
    "SELECT id,chronogrps.start \n"
    "FROM ramphs LEFT OUTER JOIN chronogrps \n"
    "ON ramphs.chronogrp = chronogrps.chronogrp\n"
    "ORDER BY chronogrps.start"
)

# Print the joined rows one tuple at a time, straight from sqlite3 ...
for record in conn.execute(inner_join_sql):
    print(record)

# ... then run the same query through pandas to get a data frame
pd.read_sql(inner_join_sql, conn)

# and finally the left outer join, again as a data frame
pd.read_sql(left_join_sql, conn)