In [1]:
import os

import pandas as pd
import psycopg2

# Prepare the Data

Run `docker-compose up` to get Postgres up and running.

In [2]:
# Connection settings. The defaults are the values this notebook originally
# hard-coded (presumably those of the docker-compose Postgres service --
# confirm against docker-compose.yml). Each one can be overridden through the
# matching libpq environment variable, so real credentials never have to be
# written into the notebook.
credentials = {
    "dbname": os.environ.get("PGDATABASE", "astronomy"),
    "user": os.environ.get("PGUSER", "user"),
    "host": os.environ.get("PGHOST", "localhost"),
    "password": os.environ.get("PGPASSWORD", "pass"),
}
conn = psycopg2.connect(**credentials)
cur = conn.cursor()

# Recreate the Star and Planet tables from scratch and bulk-load them from
# the week 3 / data1 CSV files. The COPY paths are resolved by the Postgres
# server process, not by this notebook's kernel.
setup_sql = """
DROP TABLE IF EXISTS Star;
DROP TABLE IF EXISTS Planet;

CREATE TABLE Star (
  kepler_id INTEGER NOT NULL,
  koi_name VARCHAR(20) NOT NULL,
  t_eff INTEGER,
  radius FLOAT(5),
  PRIMARY KEY (koi_name)
);

CREATE TABLE Planet (
  kepler_id INTEGER NOT NULL,
  koi_name VARCHAR(20) NOT NULL,
  kepler_name VARCHAR(20),
  status VARCHAR(20) NOT NULL,
  period FLOAT,
  radius FLOAT,
  t_eq INTEGER,
  PRIMARY KEY (koi_name)
);

COPY Star (kepler_id, koi_name, t_eff, radius)
  FROM '/data/week3/data1/stars.csv' CSV;
COPY Planet (kepler_id, koi_name, kepler_name, status, period, radius, t_eq)
  FROM '/data/week3/data1/planets.csv' CSV;
"""

# `with conn` commits when the block succeeds and rolls back when it raises.
# A failed DROP/CREATE/COPY therefore does not leave the shared connection
# stuck in an aborted transaction for the cells below. It does NOT close
# the connection.
with conn:
    cur.execute(setup_sql)

# Task 1
Write an SQL query to find the radius and temperature of the stars in the
Star table that are larger than our sun.

In [3]:
# Task 1: radius and effective temperature of every star with radius > 1.
# The task statement treats a radius above 1 as "larger than our sun".
larger_than_sun_sql = """
SELECT radius, t_eff FROM Star WHERE radius > 1;
"""
pd.read_sql(larger_than_sun_sql, con=conn)

Unnamed: 0,radius,t_eff
0,3.523,6335
1,1.965,8782
2,1.032,6319
3,27.384,3789


# Task 2
Write a range query which returns the `kepler_id` and the `t_eff` attributes
of all those stars in the Star table whose temperature lies between 5000 and
6000 Kelvin (inclusive).

In [4]:
# Task 2: range query on t_eff. SQL BETWEEN includes both end points, which
# matches the "inclusive" requirement of the task.
temperature_range_sql = """
SELECT kepler_id, t_eff FROM Star WHERE t_eff BETWEEN 5000 AND 6000;
"""
pd.read_sql(temperature_range_sql, con=conn)

Unnamed: 0,kepler_id,t_eff
0,3836450,5160
1,6590362,5926
2,8106973,5810


# Task 3
Write a query to find the `kepler_name` and `radius` of each planet in the
`Planet` table which is a confirmed exoplanet, meaning that its `kepler_name`
is not `NULL`, or, equivalently, that its status is `'CONFIRMED'`.

Restrict your results to those planets whose radius lies between one and three
earth radii, and remember that the radius of the planets is relative to the
earth radius.

In [5]:
# Task 3: planets that have a kepler_name (the task's definition of a
# confirmed exoplanet) and a radius between 1 and 3, end points included.
confirmed_small_planets_sql = """
SELECT kepler_name, radius FROM Planet
  WHERE kepler_name IS NOT NULL AND radius BETWEEN 1 AND 3;
"""
pd.read_sql(confirmed_small_planets_sql, con=conn)

Unnamed: 0,kepler_name,radius
0,Kepler-10 b,1.45
1,Kepler-106 c,2.35
2,Kepler-52 d,1.8
3,Kepler-239 b,2.36
4,Kepler-239 c,2.19


# Task 4
## Uploading more Data

In [6]:
# Replace the Planet table with the larger week 3 / data2 data set. In this
# version of the table period, radius and t_eq are declared NOT NULL, so the
# COPY fails if the CSV has a missing value in any of those columns.
reload_planet_sql = """
DROP TABLE IF EXISTS Planet;

CREATE TABLE Planet (
  kepler_id INTEGER NOT NULL,
  koi_name VARCHAR(20) NOT NULL,
  kepler_name VARCHAR(20),
  status VARCHAR(20) NOT NULL,
  period FLOAT NOT NULL,
  radius FLOAT NOT NULL,
  t_eq INTEGER NOT NULL,
  PRIMARY KEY (koi_name)
);


COPY Planet (kepler_id, koi_name, kepler_name, status, period, radius, t_eq)
  FROM '/data/week3/data2/planets.csv' CSV;
"""

# `with conn` commits on success and rolls back if the batch raises. A failed
# reload therefore keeps the previous Planet table and does not leave the
# shared connection in an aborted transaction for the cells below. It does
# NOT close the connection.
with conn:
    cur.execute(reload_planet_sql)

Your task is to write a query that calculates the:

- minimum radius;
- maximum radius;
- average radius; and
- standard deviation of the radius

of unconfirmed planets (with a `NULL` value in `kepler_name`) in the `Planet`
table.

In [7]:
# Task 4: summary statistics of the radius over the planets without a
# kepler_name, which the task describes as the unconfirmed ones.
unconfirmed_radius_stats_sql = """
SELECT
  MIN(radius),
  MAX(radius),
  AVG(radius),
  STDDEV(radius)
FROM Planet WHERE kepler_name IS NULL;
"""
pd.read_sql(unconfirmed_radius_stats_sql, con=conn)

Unnamed: 0,min,max,avg,stddev
0,0.65,3462.25,275.517333,888.709924


# Task 5
Find out how many planets in the `Planet` table are in a multi-planet
system. Planets sharing the same star will have the same `kepler_id`, but
different `koi_name` values.

Your query should return a table in which each row contains the `kepler_id`
of the star and the number of planets orbiting that star (i.e. that share
this `kepler_id`).

Limit your results to counts above one and order the rows in descending order
based on the number of planets.

In [8]:
# Task 5: number of planets per star. koi_name is the NOT NULL primary key of
# Planet, so COUNT(koi_name) is the number of rows sharing a kepler_id.
# HAVING keeps only stars with more than one planet, and the result is sorted
# from the largest count down.
multi_planet_systems_sql = """
SELECT kepler_id, COUNT(koi_name)
  FROM Planet
  GROUP BY kepler_id
  HAVING COUNT(koi_name) > 1
  ORDER BY COUNT(koi_name) DESC;
"""
pd.read_sql(multi_planet_systems_sql, con=conn)

Unnamed: 0,kepler_id,count
0,8395660,4
1,4139816,4
2,3832474,3
3,5358241,3
4,10910878,3
5,9579641,3
6,10872983,3
7,10601284,3
8,11754553,3
9,12066335,2


# Tear down

In [9]:
# Release the database resources, cursor first and then its connection.
for db_resource in (cur, conn):
    db_resource.close()