In [1]:
from sklearn.manifold import TSNE
import pandas as pd

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline

In [4]:
# CONSTANTS
import os

N_DATA = 10000  # number of rows sampled from each claim CSV
SEED = 1234  # random_state for the pandas sampling, so the draw is repeatable

# Directory holding the claim CSVs. The original location stays the default;
# set the CLAIMS_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, which the
# `DATA_DIR + '<file name>'` concatenations in this notebook rely on.
DATA_DIR = os.path.join(
    os.environ.get('CLAIMS_DATA_DIR', '/home/joboy/Work/machine-learning/'), ''
)
CLAIM_LINE_CLEAN_FILE = DATA_DIR + 'CLAIM_LINE_CLEAN.csv'
CLAIM_LINE_FILE = DATA_DIR + 'CLAIM_LINE.csv'

In [5]:
%time claims_raw = pd.read_csv(CLAIM_LINE_CLEAN_FILE)
%time claims_unclean_raw = pd.read_csv(CLAIM_LINE_FILE)

claims = claims_raw.sample(n=N_DATA, random_state=SEED)
claims_unclean_raw = claims_unclean_raw.sample(n=N_DATA, random_state=SEED)

CPU times: user 1.94 s, sys: 116 ms, total: 2.06 s
Wall time: 3.36 s
CPU times: user 24 ms, sys: 0 ns, total: 24 ms
Wall time: 34.6 ms


In [9]:
# Run TSNE
model = TSNE(n_components=3, learning_rate=100, n_iter=400, init='pca', perplexity=25, verbose=1)

%time claims_tsne = model.fit_transform(claims)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 76 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.000594
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.597217
[t-SNE] Error after 300 iterations: 1.597217
CPU times: user 4min 57s, sys: 1min 27s, total: 6min 24s
Wall time: 6min 16s


In [5]:
# Save TSNE output to CSV (uncomment the line below after a fresh t-SNE run to refresh the saved vectors)
## pd.DataFrame(claims_tsne).to_csv(DATA_DIR + 'CLAIMS_TSNE_VECTORS.csv')

In [12]:
# Skip the t-SNE run by loading previously saved vectors instead.
# Running this cell replaces any `claims_tsne` already present in the session.
vectors_path = DATA_DIR + 'CLAIMS_TSNE_VECTORS.csv'
saved_vectors = pd.read_csv(vectors_path)

# Drop the first CSV column and keep the rest as a plain array
# (presumably that column is the index written by DataFrame.to_csv; TODO confirm).
claims_tsne = saved_vectors.values[:, 1:]
claims_tsne

array([[ 0.14977551, -2.09717427,  0.69066474],
       [-1.36372681, -1.89213505,  0.27979039],
       [ 1.08252065,  0.69018239, -2.42426233],
       ..., 
       [ 0.39602095, -0.09860427, -0.51000175],
       [-1.2033238 ,  0.83483861,  0.10228475],
       [-0.49093367, -1.50589624, -0.7946499 ]])

In [10]:
# Prepare our metadata as hover label in our scatter data points

# Hover-label prefixes, listed in the positional order of the row fields they
# describe (field 0 first). The padding inside each prefix is intentional.
_HOVER_LABELS = (
    "ID:          ",
    "ICD:       ",
    "Desc:     ",
    "Amount: ",
    "Age:       ",
    "Gender:  ",
)


def _claim_hover_text(row):
    """Build the HTML hover label for one claim row.

    Parameters
    ----------
    row : pandas.Series
        One row of ``claims_unclean_raw`` as handed over by
        ``DataFrame.apply(..., axis=1)``. Only its first six fields are read,
        by position. In the recorded output of this cell these appear to be
        ID, CLINICAL_CODE, DESCRIPTION, CLAIMED_AMOUNT, AGE_APPLIED and
        GENDER_CODE (verify if the CSV layout changes).

    Returns
    -------
    str
        ``"<prefix><value><br>"`` repeated for each of the six fields.
    """
    # .iloc makes the positional lookup explicit; a plain integer key on a
    # label-indexed Series (row[0]) is deprecated in recent pandas releases.
    return "".join(
        label + str(row.iloc[position]) + "<br>"
        for position, label in enumerate(_HOVER_LABELS)
    )


# Adds (or overwrites) the 'metadata' column in place; it is appended after the
# six fields read above, so re-running the cell yields the same labels.
claims_unclean_raw['metadata'] = claims_unclean_raw.apply(_claim_hover_text, axis=1)
claims_unclean_raw.head()

Unnamed: 0,ID,CLINICAL_CODE,DESCRIPTION,CLAIMED_AMOUNT,AGE_APPLIED,GENDER_CODE,metadata
31091,96976,I11,Hypertensive heart disease,12805.0,51,F,ID: 96976<br>ICD: I11<br>Desc: ...
2456,68341,K80,Cholelithiasis,817.26,42,F,ID: 68341<br>ICD: K80<br>Desc: ...
8752,74637,H61.2,Impacted cerumen,300.0,14,M,ID: 74637<br>ICD: H61.2<br>Desc...
29015,94900,H20.9,Unspecified iridocyclitis,300.0,41,M,ID: 94900<br>ICD: H20.9<br>Desc...
26422,92307,G64,Other disorders of peripheral nervous system,5546.0,54,F,ID: 92307<br>ICD: G64<br>Desc: ...


In [11]:
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go


def _icd_color_value(code):
    """Map a clinical code such as 'H61.2' to a number for the colour scale.

    The leading character selects a band of 100 and the next two characters,
    parsed as an integer, give the offset inside that band. Every
    letter + two-digit prefix therefore gets its own value, and values sort in
    code order. The earlier ``ord(letter) + number`` sum let unrelated codes
    share a colour (for example 'A50' and 'Z25' both evaluated to 115).

    Raises ValueError (from ``int``) if characters 1-2 are not digits, exactly
    as the original expression did.
    """
    return (ord(code[0]) - ord('A')) * 100 + int(code[1:3])


init_notebook_mode(connected=True)

# One marker per embedded claim: position from the three t-SNE components,
# hover text from the prepared 'metadata' column, colour from the clinical code.
trace1 = go.Scatter3d(
    x=claims_tsne[:, 0],
    y=claims_tsne[:, 1],
    z=claims_tsne[:, 2],
    text=claims_unclean_raw['metadata'],
    mode='markers',
    marker=dict(
        size=6,
        color=claims_unclean_raw['CLINICAL_CODE'].apply(_icd_color_value),
        colorscale='Viridis',
        opacity=0.8
    )
)

# Zero margins so the 3-D scene fills the whole output area.
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))

fig = go.Figure(data=[trace1], layout=layout)

iplot(fig)