In [12]:
# hack to allow importing from sibling directories
#https://stackoverflow.com/questions/34478398/import-local-function-from-a-module-housed-in-another-directory-with-relative-im
import os
import sys
# Absolute path of the parent directory; registered on the import search path
# only once so re-running this cell does not add duplicate entries.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)


from mlviz.dimensionality_reduction import HDVis
from mlviz.data_visualisation import DraughtPlot
from utilities import utils
import pandas as pd
from sklearn.preprocessing import StandardScaler


# required bokeh imports
from bokeh.io import output_notebook
# Configure Bokeh to display its output inline in the notebook cells.
output_notebook()


# Introduction - Superconductivity example

#### Prepare the data


We will:
- Load the data, averaging the measurements of any material that appears more than once
- Remove columns that could be categorical; we will be quite harsh about this.
- Scale the data, using StandardScaler

In [7]:
class Superconductivity():
    """Loader for the superconductivity dataset.

    Reads a training CSV and a materials CSV from the same directory, attaches
    the material name to every training row, and averages all columns over
    rows that share a material name.

    Parameters
    ----------
    filepath : str
        Directory that contains both CSV files. A trailing path separator is
        optional.
    train_name : str
        File name of the CSV holding the features and the ``critical_temp``
        column.
    materials_name : str
        File name of the CSV holding the ``material`` column. Its rows are
        matched to the training rows by index, so both files are expected to
        list the measurements in the same order.

    Attributes
    ----------
    target : pandas.Series
        Mean ``critical_temp`` per material.
    train : pandas.DataFrame
        Per-material mean of every column except ``critical_temp`` and
        ``material``.
    df : pandas.DataFrame
        The full per-material frame (including ``material``) with
        ``critical_temp`` renamed to ``target``.
    """

    def __init__(self,
                 filepath='Y:/Team Scratch Space/fpc/2_dimensionality_reduction/data/superconductivity/',
                 train_name='train.csv',
                 materials_name='unique_m.csv'
                ):
        self.filepath = filepath
        self.train_name = train_name
        self.materials_name = materials_name
        # Loading happens eagerly so the instance is usable right after construction.
        self.load_data()

    def load_data(self):
        """Loads and processes the data.

        Populates ``self.target``, ``self.train`` and ``self.df`` from the two
        CSV files named at construction time.
        """
        # os.path.join adds the separator when `filepath` has no trailing
        # slash; plain string concatenation would produce a wrong file name.
        data = pd.read_csv(os.path.join(self.filepath, self.train_name))
        names = pd.read_csv(os.path.join(self.filepath, self.materials_name))
        data['material'] = names['material']
        # take the mean value for materials measured multiple times
        mean_data = data.groupby('material').mean().reset_index()
        self.target = mean_data['critical_temp']
        self.train = mean_data.drop(columns=['critical_temp', 'material'])
        self.df = mean_data.rename(columns={'critical_temp': 'target'})

In [8]:
# Build the loader with its default file locations; the constructor reads
# both CSV files immediately.
supercon = Superconductivity()

# Feature frame (per-material means, without the target and material columns).
train = supercon.train

In [13]:
# Columns flagged as possibly categorical. The notebook text describes this
# filter as intentionally harsh; the exact meaning of `threshold` is defined
# in utilities.utils.
categorical_features = utils.is_categorical(train, threshold=0.5)

# Drop into a new frame instead of in place: `train` is the very same object
# as `supercon.train`, so an in-place drop would silently alter the loader.
train = train.drop(columns=categorical_features)

In [14]:
# Standardise every remaining feature column: learn the per-column statistics
# from `train`, then apply them to the same data.
train_scaled = StandardScaler().fit(train).transform(train)

In [15]:
# Wrap the scaled values in a DataFrame again, labelled with the feature
# names of `train`.
train_scaled = pd.DataFrame(data=train_scaled, columns=train.columns)

## Look at the data

Use the HDVis tool, noting that:
- `y` must be a `numpy.ndarray`

In [16]:
# `.values` hands HDVis the target as a numpy.ndarray, as required above.
HD_plot = HDVis(train_scaled, supercon.target.values)

In [20]:
# NOTE(review): presumably returns the points selected (brushed) interactively
# in the HDVis plot above; if so, this result depends on manual interaction
# and is not reproduced by Restart & Run All. Confirm against mlviz.
X, y = HD_plot.get_brushed_data()

In [21]:
# Feed the subset obtained from HDVis into the DraughtPlot visualisation.
DPPlot = DraughtPlot(X, y)