# Iris-like dataset from the Rock Property Catalog?

Trying [this dataset](https://agilescientific.com/blog/2015/10/5/the-rock-property-catalog-again) to see if it might make a good 'Iris' dataset for geologists. 

In [3]:
import requests
import pandas as pd

class RPC(object):
    """
    Small client for the Rock Property Catalog on subsurfwiki.org.

    Sends an 'ask' query to the wiki's api.php endpoint and converts
    the JSON reply into a pandas DataFrame with one row per sample.
    """

    def __init__(self):
        pass

    def _query_ssw(self, filters, properties, options):
        """
        Build the 'ask' query string and send it with requests.get.

        filters, properties and options are lists of strings (or None /
        empty to omit that part). Filters are concatenated as given,
        each property is prefixed with '|%3F' (an encoded '?'), and each
        option with '|'. Returns the requests response object.
        """
        base_url = "http://www.subsurfwiki.org/api.php"

        parts = ["action=ask&query=[[RPC:%2B]]"]
        if filters:
            parts.append(''.join(filters))
        if properties:
            parts.append('|%3F' + '|%3F'.join(properties))
        if options:
            parts.append('|' + '|'.join(options))
        parts.append('&format=json')

        return requests.get(base_url, params=''.join(parts))

    @staticmethod
    def _formats_from_payload(payload):
        """
        Map each printout label to its type id (leading underscores
        removed), using only the print requests whose mode is 1.
        """
        # NOTE(review): mode 1 presumably marks property printouts (as
        # opposed to the page itself) -- confirm against the API docs.
        return {
            item['label']: item['typeid'].lstrip('_')
            for item in payload['query']['printrequests']
            if item['mode'] == 1
        }

    def _get_formats(self, response):
        """
        Return a {label: type id} dict for the columns in a response.
        """
        return self._formats_from_payload(response.json())

    def _build_dataframe(self, response):
        """
        Takes the response of a query and returns a pandas
        dataframe containing the results.

        The index holds the result names with their first four
        characters removed; there is one column per printout label.
        Raises KeyError if the reply has no 'query' section.
        """
        # Decode the body once; it is reused for every row and column.
        payload = response.json()
        try:
            query = payload['query']
        except KeyError:
            # Same exception type as before, but say what came back.
            raise KeyError(
                "'query' missing from the API reply; reply was: %r"
                % (payload.get('error', payload),)
            )

        # NOTE(review): with no matches the API appears to send an empty
        # list here rather than an empty dict -- treat any falsy value
        # as "no results" so an empty frame is returned.
        results = query['results'] or {}
        names = list(results)
        # Drop the 4-character prefix (presumably 'RPC:') from each name.
        samples = [name[4:] for name in names]

        # We'll need to know the formats of the columns.
        formats = self._formats_from_payload(payload)

        # Now traverse the JSON and collect one list per column.
        columns = {}
        for prop, fmt in formats.items():
            column = []
            for name in names:
                values = results[name]['printouts'][prop]
                if not values:
                    column.append(None)
                elif fmt == 'qty':    # Quantity, number + unit
                    column.append(values[0]['value'])
                elif fmt == 'wpg':    # Wiki page
                    column.append(values[0]['fulltext'])
                else:                 # Anything else: num, txt, tem, etc.
                    column.append(values[0])
            columns[prop] = column

        return pd.DataFrame(columns, index=samples, columns=list(formats))

    def query(self, filters=None, properties=None, options=None):
        """
        Run a catalog query and return the results as a DataFrame.

        Prints a message and returns None when the HTTP status code
        is not 200.
        """
        r = self._query_ssw(filters, properties, options)
        if r.status_code != 200:
            print("Something went wrong.")
            return None
        return self._build_dataframe(r)

----

## This query takes a while

In [5]:
# Ask the catalog for five lithologies and the listed properties.
rpc = RPC()

filters = [
    "[[lithology::Shale||Sandstone||Limestone||Dolomite||Tuff]]",
]
properties = [
    'Vp',
    'Vs',
    'Rho dry',
    'Porosity',
    'Lithology',
    'Description',
    'Citation',
]

# Result limit passed through to the query.
options = ["limit=5095"]

df = rpc.query(filters=filters, properties=properties, options=options)

KeyError: 'query'

-----

## Data munging

In [None]:
# Show the first ten rows of the raw query result.
df.head(10)

In [None]:
# Number of rows returned by the query.
len(df)

In [None]:
# Rearrange columns. Take an explicit copy so that the column
# assignments in the following cells modify `dg` itself rather than a
# selection of `df` (which pandas flags with SettingWithCopyWarning).
dg = df[properties].copy()

In [None]:
# Make porosity numeric, then scale by 100 to express it in percent.
dg['Porosity'] = pd.to_numeric(dg['Porosity']) * 100

In [None]:
# Shale rows with no porosity value get a porosity of 0.
shale = dg['Lithology'].eq('Shale')
nopor = dg['Porosity'].isnull()
dg.loc[nopor & shale, 'Porosity'] = 0

In [None]:
# Keep only the rows in which every column has a value.
dg = dg.dropna(axis=0, how='any')

In [None]:
# Number of rows left after dropping rows with nulls.
len(dg)

In [None]:
# Distinct lithologies present in the cleaned frame.
dg['Lithology'].unique()

In [None]:
# Map each lithology name to an integer code, in order of first appearance.
fr = {name: code for code, name in enumerate(dg['Lithology'].unique())}

In [None]:
# Inspect the lithology-to-integer mapping.
fr

In [None]:
# Integer lithology code for each row. Mapping the single column is
# enough; there is no need to run replace() over (and copy) the whole
# frame just to read one column back out.
dg['Lithcode'] = dg['Lithology'].map(fr)

In [None]:
# Preview the first ten rows, now including the Lithcode column.
dg.head(10)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

dfX = pd.DataFrame(dg, columns=['Vp', 'Vs', 'Rho dry', 'Porosity', 'Epsilon'])
Y = dg['Lithcode'].values

# Create a scatter matrix from the dataframe, color by y_train
hist_kwds = {'bins': 20, 'lw':0, 'color':'lightgray'}
grr = pd.plotting.scatter_matrix(dfX, c=Y, figsize=(15, 15), marker='o', hist_kwds=hist_kwds, s=40, alpha=0.5)

The porosity and density data are sketchy. Damn. I wonder if we can estimate porosity from the sonic velocities instead; see https://www.spec2000.net/12-phidt.htm

Leaving it alone for now.