In [11]:
import psycopg2
import pandas as pd

# Prepare the Data

Run `docker-compose up` to get Postgres up and running.

In [12]:
# Connection settings for the Postgres instance started with `docker-compose up`.
# NOTE(review): credentials are hardcoded here; presumably they are the
# development-only values of the local docker-compose service — confirm before
# sharing this notebook.
credentials = "dbname='astronomy' user='user' host='localhost' password='pass'"
conn = psycopg2.connect(credentials)
cur = conn.cursor()

# Rebuild the schema and load both CSV files as one transaction.
# `with conn:` commits when the block finishes and rolls back if any statement
# raises, so a failed load (e.g. a missing CSV) does not leave the connection
# stuck in an aborted transaction. The connection is NOT closed by the block;
# it stays open for the query cells below.
#
# Notes on the SQL:
# - Planet is dropped before Star because Planet.kepler_id references Star.
# - COPY ... FROM '<path>' reads the file from the database server's
#   filesystem, not from the machine running this notebook.
with conn:
    cur.execute("""
DROP TABLE IF EXISTS Planet;
DROP TABLE IF EXISTS Star;

CREATE TABLE Star (
  kepler_id INTEGER NOT NULL,
  t_eff INTEGER NOT NULL,
  radius FLOAT NOT NULL,
  PRIMARY KEY (kepler_id)
);

CREATE TABLE Planet (
  kepler_id INTEGER NOT NULL REFERENCES Star(Kepler_ID),
  koi_name VARCHAR(20) NOT NULL,
  kepler_name VARCHAR(20),
  status VARCHAR(20) NOT NULL,
  period FLOAT NOT NULL,
  radius FLOAT NOT NULL,
  t_eq INTEGER NOT NULL,
  PRIMARY KEY (koi_name)
);


COPY Star (kepler_id, t_eff, radius)
  FROM '/data/data3/stars.csv' CSV;

COPY Planet (kepler_id, koi_name, kepler_name, status, period, radius, t_eq)
  FROM '/data/data3/planets.csv' CSV;
""")

# Task 1
Write a query that returns the radius of each star and planet pair whose radii
have a ratio greater than the Sun-to-Earth radius ratio. Order the results in
descending order based on the stellar radii. Use `sun_radius` and
`planet_radius` as attribute aliases for the star and planet radii.

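For this problem you will have to join the two tables to find all planets
belonging to a given star and use a condition to select those results which
fulfill the size requirement above. Note that if star radii are measured in
solar radii and planet radii in Earth radii, the Sun-to-Earth ratio is 1 in
these units, so the requirement reduces to the star's radius value being
larger than the planet's.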

In [13]:
# Task 1: pair every planet with its host star (matched on kepler_id) and keep
# the pairs whose stellar radius value exceeds the planetary radius value,
# listing the largest stars first.
task1_sql = """
SELECT
  Star.radius AS sun_radius,
  Planet.radius AS planet_radius
FROM Star, Planet
WHERE
  Star.kepler_id = Planet.kepler_id AND
  Star.radius > Planet.radius
ORDER BY Star.radius DESC;
"""

# The bare call is the cell's last expression, so the resulting DataFrame is
# rendered as the cell output.
pd.read_sql(sql=task1_sql, con=conn)

Unnamed: 0,sun_radius,planet_radius
0,1.332,0.65
1,1.029,0.85
2,1.029,0.99
3,0.755,0.58
4,0.755,0.49


# Task 2
Write a query which counts the number of planets in each solar system where
the corresponding star is larger than our Sun (i.e. its radius is larger
than 1).

Your query should return the star's radius and its number of planets, showing
only rows where the number of planets is more than one. Sort the rows in
descending order based on the star radii.

In [None]:
# Task 2: for stars with radius > 1, count the planets joined to each star and
# keep only the stars with more than one planet, largest stars first.
# Grouping by Star.kepler_id gives one row per star; selecting Star.radius
# alongside it is accepted because kepler_id is Star's primary key.
task2_sql = """
SELECT Star.radius, COUNT(Planet.koi_name)
FROM Star
JOIN Planet USING (kepler_id)
WHERE Star.radius > 1
GROUP BY Star.kepler_id
HAVING COUNT(Planet.koi_name) > 1
ORDER BY Star.radius DESC;
"""

# The bare call is the cell's last expression, so the resulting DataFrame is
# rendered as the cell output.
pd.read_sql(sql=task2_sql, con=conn)

# Tear down

In [14]:
# Release the database resources in dependency order: the cursor first, then
# the connection it was created from.
for handle in (cur, conn):
    handle.close()