# Question 1: Convert XML to a SQL database

Create two tables named `LOW` and `HIGH`, corresponding to the data given for the low and high temperature ranges, respectively.
Each should have the following column names:

- `SPECIES_NAME`
- `TLOW`
- `THIGH`
- `COEFF_1`
- `COEFF_2`
- `COEFF_3`
- `COEFF_4`
- `COEFF_5`
- `COEFF_6`
- `COEFF_7`

Populate the tables using the XML file you created in the last assignment. If you did not complete the last assignment, you may also use the `example_thermo.xml` file.

`TLOW` should refer to the temperature at the low range and `THIGH` should refer to the temperature at the high range.  For example, in the `LOW` table, $H$ would have `TLOW` at $200$ and `THIGH` at $1000$ and in the `HIGH` table, $H$ would have `TLOW` at $1000$ and `THIGH` at $3500$.

For both tables, `COEFF_1` through `COEFF_7` should be populated with the corresponding coefficients for the low temperature data and high temperature data.

In [1]:
import xml.etree.ElementTree as ET
import sqlite3
import pandas as pd
import numpy as np
# Configure pandas display: console width of 500 characters, up to 100
# columns shown, and HTML rendering of frames inside notebooks.
for _option_name, _option_value in (
        ('display.width', 500),
        ('display.max_columns', 100),
        ('display.notebook_repr_html', True),
):
    pd.set_option(_option_name, _option_value)

In [2]:
# Load the thermo XML document; `root` is reused by later cells.
tree = ET.parse('example_thermo.xml')
root = tree.getroot()
# The first <phase> child of the root holds a <speciesArray> element whose
# text is a whitespace-separated list of species names.
species = root.find("phase")
species_array = species.find("speciesArray").text
species_name = species_array.split()

# Bare expression: displayed as the notebook cell's output.
species_name

['H', 'O', 'OH', 'H2', 'O2', 'H2O', 'HO2', 'H2O2', 'N2', 'Hp', 'Op']

In [3]:
# Build a per-species lookup from the XML:
#   dic[name] = {"TMAX":   [Tmax attribute of each <NASA> entry, as str],
#                "TMIN":   [Tmin attribute of each <NASA> entry, as str],
#                "coeffs": [comma-split <floatArray> text of each entry]}
# Entries are kept in document order; values stay strings and are converted
# to float only when they are inserted into the database.
dic = {}
for data_node in root.findall("speciesData"):
    for species_node in data_node.findall("species"):
        nasa_entries = species_node.find("thermo").findall("NASA")
        dic[species_node.get("name")] = {
            "TMAX": [entry.get("Tmax") for entry in nasa_entries],
            "TMIN": [entry.get("Tmin") for entry in nasa_entries],
            "coeffs": [
                entry.find("floatArray").text.strip().split(',')
                for entry in nasa_entries
            ],
        }

In [4]:
# dic

In [5]:
# Open (or create) the on-disk database. `db` and `cursor` are module-level
# handles shared by every later cell.
db = sqlite3.connect('HW10.sqlite')
cursor = db.cursor()

# Start from a clean slate so the notebook can be re-run without
# "table already exists" errors.
for table_name in ("HIGH", "LOW"):
    cursor.execute("DROP TABLE IF EXISTS {}".format(table_name))

# Kept from the original setup; none of the tables created in this file
# declare FOREIGN KEY constraints, so this currently has no visible effect.
cursor.execute("PRAGMA foreign_keys=1")

# LOW and HIGH share exactly the same layout: the species name as primary
# key, the two temperature bounds, and seven coefficients. Creating both
# from one template keeps the two definitions from drifting apart.
TABLE_SCHEMA = '''CREATE TABLE {} (
               SPECIES_NAME TEXT PRIMARY KEY NOT NULL,
               TLOW FLOAT,
               THIGH FLOAT,
               COEFF_1 FLOAT,
               COEFF_2 FLOAT,
               COEFF_3 FLOAT,
               COEFF_4 FLOAT,
               COEFF_5 FLOAT,
               COEFF_6 FLOAT,
               COEFF_7 FLOAT)'''

# Table names come from this fixed tuple, never from user input, so
# formatting them into the DDL is safe (identifiers cannot be bound as
# SQL parameters).
for table_name in ("HIGH", "LOW"):
    cursor.execute(TABLE_SCHEMA.format(table_name))

db.commit()  # Commit changes to the database

In [6]:
def viz_tables(cols, query):
    """Run ``query`` and return the fetched rows as a ``pandas.DataFrame``.

    Parameters
    ----------
    cols : sequence of str
        Column labels, in result-set order. Label ``i`` is paired with
        field ``i`` of every fetched row.
    query : str
        SQL statement executed on the module-level ``cursor``.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``cols``, in the order given.
    """
    rows = cursor.execute(query).fetchall()
    # ``DataFrame.from_items`` was deprecated in pandas 0.23 and removed in
    # 1.0, so build the frame through the regular constructor instead.
    # Passing ``columns=`` pins the column order explicitly.
    data = {
        col_name: [row[i] for row in rows]
        for i, col_name in enumerate(cols)
    }
    return pd.DataFrame(data, columns=list(cols))

In [7]:
# One LOW row per species, taken from index 0 of each per-species list in
# `dic` (the first <NASA> entry found for that species). The XML values are
# strings, so every number is converted to float before binding.
insert_low_sql = '''INSERT INTO LOW 
                  (SPECIES_NAME, TLOW, THIGH, COEFF_1, COEFF_2, COEFF_3, COEFF_4, COEFF_5, COEFF_6, COEFF_7)
                  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'''
for species_key, thermo in dic.items():
    low_coeffs = thermo["coeffs"][0]
    vals_to_insert_low = (
        species_key,
        float(thermo["TMIN"][0]),
        float(thermo["TMAX"][0]),
    ) + tuple(float(low_coeffs[k]) for k in range(7))
    cursor.execute(insert_low_sql, vals_to_insert_low)


In [8]:
# PRAGMA table_info returns one row per column of LOW; field 1 of each row
# is used as the column label for the DataFrame.
LOW_cols = [col[1] for col in cursor.execute("PRAGMA table_info(LOW)")]
query_LOW = '''SELECT * FROM LOW'''
# Render the full LOW table (displayed as the cell output).
viz_tables(LOW_cols, query_LOW)

Unnamed: 0,SPECIES_NAME,TLOW,THIGH,COEFF_1,COEFF_2,COEFF_3,COEFF_4,COEFF_5,COEFF_6,COEFF_7
0,H,300.0,1000.0,2.5,0.0,0.0,0.0,0.0,25471.627,-0.460118
1,O,300.0,1000.0,2.946429,-0.001638,2.421032e-06,-1.602843e-09,3.890696e-13,29147.6445,2.963995
2,OH,200.0,1000.0,4.125306,-0.003225,6.527647e-06,-5.798536e-09,2.062374e-12,3346.30913,-0.690433
3,H2,300.0,1000.0,3.298124,0.000825,-8.143015e-07,-9.475434e-11,4.134872e-13,-1012.52087,-3.294094
4,O2,300.0,1000.0,3.212936,0.001127,-5.75615e-07,1.313877e-09,-8.768554e-13,-1005.24902,6.034738
5,H2O,300.0,1000.0,3.386842,0.003475,-6.354696e-06,6.968581e-09,-2.506588e-12,-30208.1133,2.590233
6,HO2,200.0,1000.0,4.301798,-0.004749,2.115829e-05,-2.427639e-08,9.292251e-12,294.80804,3.716662
7,H2O2,300.0,1000.0,3.388754,0.006569,-1.485013e-07,-4.625806e-09,2.471515e-12,-17663.1465,6.785363
8,N2,300.0,1000.0,3.298677,0.001408,-3.963222e-06,5.641515e-09,-2.444855e-12,-1020.9,3.950372
9,Hp,300.0,1000.0,1.642435,0.00029,0.0,0.0,0.0,-4.695103,-11.148334


In [9]:
# One HIGH row per species, taken from index 1 of each per-species list in
# `dic` (the second <NASA> entry found for that species). The XML values are
# strings, so every number is converted to float before binding.
insert_high_sql = '''INSERT INTO HIGH 
                  (SPECIES_NAME, TLOW, THIGH, COEFF_1, COEFF_2, COEFF_3, COEFF_4, COEFF_5, COEFF_6, COEFF_7)
                  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'''
for species_key, thermo in dic.items():
    high_coeffs = thermo["coeffs"][1]
    vals_to_insert_high = (
        species_key,
        float(thermo["TMIN"][1]),
        float(thermo["TMAX"][1]),
    ) + tuple(float(high_coeffs[k]) for k in range(7))
    cursor.execute(insert_high_sql, vals_to_insert_high)

In [10]:
# PRAGMA table_info returns one row per column of HIGH; field 1 of each row
# is used as the column label for the DataFrame.
HIGH_cols = [col[1] for col in cursor.execute("PRAGMA table_info(HIGH)")]
query_HIGH = '''SELECT * FROM HIGH'''
# Render the full HIGH table (displayed as the cell output).
viz_tables(HIGH_cols, query_HIGH)

Unnamed: 0,SPECIES_NAME,TLOW,THIGH,COEFF_1,COEFF_2,COEFF_3,COEFF_4,COEFF_5,COEFF_6,COEFF_7
0,H,1000.0,5000.0,2.5,0.0,0.0,0.0,0.0,25471.627,-0.460118
1,O,1000.0,5000.0,2.54206,-2.8e-05,-3.102803e-09,4.551067e-12,-4.368051e-16,29230.8027,4.920308
2,OH,1000.0,6000.0,2.864729,0.001057,-2.590828e-07,3.052187e-11,-1.331959e-15,3683.62875,5.701641
3,H2,1000.0,5000.0,2.991423,0.0007,-5.633829e-08,-9.231578e-12,1.582752e-15,-835.033997,-1.35511
4,O2,1000.0,5000.0,3.697578,0.000614,-1.258842e-07,1.775281e-11,-1.136435e-15,-1233.93018,3.189166
5,H2O,1000.0,5000.0,2.672146,0.003056,-8.73026e-07,1.200996e-10,-6.391618e-15,-29899.209,6.862817
6,HO2,1000.0,3500.0,4.017211,0.00224,-6.336581e-07,1.142464e-10,-1.079085e-14,111.856713,3.785102
7,H2O2,1000.0,5000.0,4.573167,0.004336,-1.474689e-06,2.348904e-10,-1.431654e-14,-18006.9609,0.501137
8,N2,1000.0,5000.0,2.92664,0.001488,-5.684761e-07,1.009704e-10,-6.753351e-15,-922.7977,5.980528
9,Hp,1000.0,5000.0,1.642435,0.00029,0.0,0.0,0.0,-4.695103,-11.148334


# Question 2: `WHERE` Statements

1. Write a `Python` function `get_coeffs` that returns an array of 7 coefficients.  
   
   The function should take in two parameters: 1.) `species_name` and 2.) `temp_range`, an indicator variable ('low' or 'high') to indicate whether the coefficients should come from the low or high temperature range.  
   The function should use `SQL` commands and `WHERE` statements on the table you just created in Question 1 (rather than taking data from the XML directly).
```python
def get_coeffs(species_name, temp_range):
    ''' Fill in here'''
    return coeffs
```

2. Write a Python function `get_species` that returns all species that have a temperature range above or below a given value. The function should take in two parameters: 1.) `temp` and 2.) `temp_range`, an indicator variable ('low' or 'high').

  When temp_range is 'low', we are looking for species with a temperature range lower than the given temperature, and for a 'high' temp_range, we want species with a temperature range higher than the given temperature.

  This exercise may be useful if different species have different `LOW` and `HIGH` ranges.

  And as before, you should accomplish this through `SQL` queries and `WHERE` statements.

```python
def get_species(temp, temp_range):
    ''' Fill in here'''
    return species
```

In [11]:
def get_coeffs(species_name, temp_range):
    """Return the seven stored coefficients for one species.

    Parameters
    ----------
    species_name : str
        Value matched against the ``SPECIES_NAME`` column.
    temp_range : str
        ``'low'`` to read from the ``LOW`` table, ``'high'`` to read from
        the ``HIGH`` table.

    Returns
    -------
    numpy.ndarray
        ``COEFF_1`` .. ``COEFF_7`` of every matching row, one row per match
        (shape ``(1, 7)`` for a single match). An empty array is returned
        when no row matches.

    Raises
    ------
    ValueError
        If ``temp_range`` is neither ``'low'`` nor ``'high'``.
    """
    if temp_range == 'high':
        table = 'HIGH'
    elif temp_range == 'low':
        table = 'LOW'
    else:
        # Previously this fell through with `query` unbound and surfaced as
        # an UnboundLocalError; fail with a clear message instead.
        raise ValueError(
            "temp_range must be 'low' or 'high', got {!r}".format(temp_range)
        )

    # The table name comes from the fixed choices above; the species name is
    # bound as a parameter. Formatting it into a double-quoted SQL token is
    # unsafe: SQLite resolves "x" as an identifier first, so a species named
    # like a column (e.g. "TLOW") would be compared against that column, and
    # a name containing a quote would break the statement.
    query = ('SELECT COEFF_1, COEFF_2, COEFF_3, COEFF_4, COEFF_5, COEFF_6, '
             'COEFF_7 FROM {} WHERE SPECIES_NAME = ?'.format(table))
    coeffs_list = cursor.execute(query, (species_name,)).fetchall()
    coeffs = np.asarray(coeffs_list)
    return coeffs

In [12]:
# Example: coefficients stored in the HIGH table for species 'O'.
get_coeffs('O', 'high')

array([[  2.54205966e+00,  -2.75506191e-05,  -3.10280335e-09,
          4.55106742e-12,  -4.36805150e-16,   2.92308027e+04,
          4.92030811e+00]])

In [13]:
# Example: coefficients stored in the LOW table for species 'H'.
get_coeffs('H', 'low')

array([[  2.50000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   2.54716270e+04,
         -4.60117608e-01]])

In [14]:
def get_species(temp, temp_range):
    """Return the names of species whose range bound lies beyond ``temp``.

    Parameters
    ----------
    temp : number
        Temperature threshold compared against the stored bound.
    temp_range : str
        ``'high'`` selects rows of ``HIGH`` with ``THIGH > temp``;
        ``'low'`` selects rows of ``LOW`` with ``TLOW < temp``.

    Returns
    -------
    list of str
        ``SPECIES_NAME`` of every matching row (empty if none match).

    Raises
    ------
    ValueError
        If ``temp_range`` is neither ``'low'`` nor ``'high'``.
    """
    if temp_range == 'high':
        query = 'SELECT SPECIES_NAME FROM HIGH WHERE THIGH > ?'
    elif temp_range == 'low':
        query = 'SELECT SPECIES_NAME FROM LOW WHERE TLOW < ?'
    else:
        # Previously this fell through with `query` unbound and surfaced as
        # an UnboundLocalError; fail with a clear message instead.
        raise ValueError(
            "temp_range must be 'low' or 'high', got {!r}".format(temp_range)
        )

    # Bind the threshold as a parameter rather than formatting it into the
    # SQL text, so it is always treated as a value and can never inject
    # extra conditions into the WHERE clause.
    names_tuple = cursor.execute(query, (temp,)).fetchall()
    names = [row[0] for row in names_tuple]
    return names

In [15]:
# Example: species whose HIGH-table THIGH is greater than 5100.
get_species(5100, 'high')

['OH']

In [16]:
# Example: species whose LOW-table TLOW is less than 250.
get_species(250, 'low')

['OH', 'HO2']

# Question 3: `JOIN` Statements

Create a table named `ALL_TEMPS` that has the following columns:

- `SPECIES_NAME`
- `TEMP_LOW`
- `TEMP_HIGH`

This table should be created by joining the tables `LOW` and `HIGH` on the value `SPECIES_NAME`.

1. Write a `Python` function `get_range` that returns the range of temperatures for a given species_name.

The range should be computed within the `SQL` query (i.e. you should subtract within the `SELECT` statement in the `SQL` query).
```python
def get_range(species_name):
    '''Fill in here'''
    return range
```

Note that `TEMP_LOW` is the lowest temperature in the `LOW` range and `TEMP_HIGH` is the highest temperature in the `HIGH` range.

In [17]:
# Recreate ALL_TEMPS from scratch (drop first so the cell can be re-run),
# then commit. The table keeps one row per species: its name plus a low
# and a high temperature.
create_all_temps_sql = '''CREATE TABLE ALL_TEMPS (
               SPECIES_NAME TEXT PRIMARY KEY NOT NULL, 
               TEMP_LOW FLOAT, 
               TEMP_HIGH FLOAT)'''
for statement in ("DROP TABLE IF EXISTS ALL_TEMPS", create_all_temps_sql):
    cursor.execute(statement)

db.commit()

In [18]:
# Fill ALL_TEMPS by joining LOW and HIGH on SPECIES_NAME: TEMP_LOW takes
# LOW.TLOW and TEMP_HIGH takes HIGH.THIGH. The INSERT names no columns, so it
# relies on the SELECT order matching the ALL_TEMPS column order.
query_all = '''INSERT INTO ALL_TEMPS SELECT LOW.SPECIES_NAME, LOW.TLOW, HIGH.THIGH FROM LOW INNER JOIN HIGH ON LOW.SPECIES_NAME = HIGH.SPECIES_NAME'''
cursor.execute(query_all)
# Field 1 of each PRAGMA table_info row is used as the column label.
ALL_cols = [col[1] for col in cursor.execute("PRAGMA table_info(ALL_TEMPS)")]
query_ALL = '''SELECT * FROM ALL_TEMPS'''
# Render the full ALL_TEMPS table (displayed as the cell output).
viz_tables(ALL_cols, query_ALL)

Unnamed: 0,SPECIES_NAME,TEMP_LOW,TEMP_HIGH
0,H,300.0,5000.0
1,O,300.0,5000.0
2,OH,200.0,6000.0
3,H2,300.0,5000.0
4,O2,300.0,5000.0
5,H2O,300.0,5000.0
6,HO2,200.0,3500.0
7,H2O2,300.0,5000.0
8,N2,300.0,5000.0
9,Hp,300.0,5000.0


In [19]:
def get_range(species_name):
    """Return the stored temperature bounds for one species.

    Parameters
    ----------
    species_name : str
        Value matched against ``ALL_TEMPS.SPECIES_NAME``.

    Returns
    -------
    list of tuple
        ``[(TEMP_LOW, TEMP_HIGH)]`` for the matching row, or ``[]`` when
        the species is not present in ``ALL_TEMPS``.
    """
    # NOTE(review): the assignment text asks for the range to be computed in
    # SQL (TEMP_HIGH - TEMP_LOW). This function returns both endpoints
    # instead; that return shape is kept so existing callers keep working.
    #
    # The species name is bound as a parameter. Formatting it into a
    # double-quoted SQL token is unsafe: SQLite resolves "x" as an identifier
    # first, so a name such as "TEMP_LOW" would be compared against that
    # column, and a name containing a quote would break the statement.
    query = 'SELECT TEMP_LOW, TEMP_HIGH FROM ALL_TEMPS WHERE SPECIES_NAME = ?'
    temp_range = cursor.execute(query, (species_name,)).fetchall()

    return temp_range

In [20]:
# Example: (TEMP_LOW, TEMP_HIGH) stored in ALL_TEMPS for species 'O'.
get_range('O')

[(300.0, 5000.0)]

In [21]:
# Example: (TEMP_LOW, TEMP_HIGH) stored in ALL_TEMPS for species 'OH'.
get_range('OH')

[(200.0, 6000.0)]

In [22]:
# Commit outstanding work, including the ALL_TEMPS rows inserted after the
# previous commit.
db.commit()

In [23]:
# Close the SQLite connection; `cursor` must not be used after this point.
db.close()