In [None]:
# setup
import os, sys
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns



# Helper Functions

This function turns an arbitrary string into a filename-safe token: whitespace and `=` become underscores, every other non-word character is removed, and runs of underscores are collapsed into one.

In [None]:
## https://docs.python.org/2/library/re.html
def to_filename(s):
    """Turn an arbitrary string into a token that is safe to use in a filename.

    The string is processed in three steps:

    1. every ``=`` and every whitespace character is replaced by ``_``;
    2. every remaining non-word character (regex ``\\W``) is removed;
    3. runs of consecutive underscores are collapsed into a single ``_``.

    Parameters
    ----------
    s : str
        The text to sanitize, e.g. a plot annotation.

    Returns
    -------
    str
        The sanitized text (may be empty if ``s`` has no word characters).
    """
    import re
    # Raw strings keep the regex escapes (\s, \W) out of Python's own
    # string-escape processing, where they are invalid escape sequences.
    s = re.sub(r'[=\s]', '_', s)
    s = re.sub(r'\W', '', s)
    return re.sub(r'_+', '_', s)

to_filename(u'pearsonr = 0.68; p = 6.7e-34')

A function to save figures which ensures that the subdirectory is created

In [None]:
def savefig(g, figdir, fname):
    """Save a figure-like object into ``figdir``, creating the directory first.

    Parameters
    ----------
    g : object
        Anything with a ``savefig(path)`` method.
    figdir : str
        Target directory; missing intermediate directories are created.
        An empty string means "current directory" and creates nothing.
    fname : str
        File name inside ``figdir``.

    Returns
    -------
    str
        ``fname`` unchanged (not the joined path).

    Raises
    ------
    OSError
        If ``figdir`` cannot be created.
    """
    import os
    # os.makedirs('') raises, so an empty figdir is skipped entirely.
    if figdir and not os.path.isdir(figdir):
        try:
            os.makedirs(figdir)
        except OSError:
            # Something else may have created the directory between the
            # check and the call; only fail if it is still missing.
            if not os.path.isdir(figdir):
                raise
    g.savefig(os.path.join(figdir, fname))
    return fname
        

In [None]:
# NOTE(review): in the visible notebook `g` is only assigned by the jointplot
# cell further down, so on a fresh kernel this cell needs that cell run first.
savefig(g, 'www/figs', 'joint.png')

In [None]:
ls -lR www/figs/joint.png

# Visual Inspection

## Produce Description Table
We build a feature description table and export it to Excel and HTML. This makes it easier to read

In [None]:
# Summary statistics from describe(), transposed so each described column
# becomes one row.
descr_tbl = df.describe().T
descr_tbl.head()

In [None]:
# Render the rows with at least one observation as an HTML <table>.
# Called without a buffer, to_html() returns the markup as a string, so the
# Python-2-only StringIO module is not needed.
html_table = descr_tbl[descr_tbl['count'] > 0].to_html(
    float_format=lambda x: '%12.4f' % x)

# Make sure the output directory exists before opening the file.
if not os.path.isdir('www'):
    os.makedirs('www')

# os.path.join is spelled out because the `jp` alias is only defined in a
# later cell of this notebook.
# NOTE(review): the file name is spelled 'desciption.html'; kept unchanged in
# case something else links to it.
with open(os.path.join('www', 'desciption.html'), 'w') as out:
    out.write("""
    <html>
    <head>
    <style>
    table {border: 1px gray;}
    td {padding: 1em 0.3em; text-align: right;}
    </style>
    </head>
    <body>
    """)
    out.write(html_table)
    out.write("""
    </body>
    </html>
    """)


## Types of Plots

In [None]:
import seaborn as sns

# Tick-style axes for the scatter-plot matrix.
sns.set(style="ticks")

# Pairwise plots of the iris columns, coloured by the "species" column.
df = sns.load_dataset("iris")
sns.pairplot(data=df, hue="species")

### Linear regression with marginal distributions
- Demo https://seaborn.pydata.org/examples/regression_marginals.html
- Documentation https://seaborn.pydata.org/generated/seaborn.jointplot.html

In [None]:
sns.set(style="darkgrid", color_codes=True)

# Tip versus total bill as a regression joint plot; the returned grid is kept
# in `g` so later cells can save or inspect it.
tips = sns.load_dataset("tips")
g = sns.jointplot(
    x="total_bill",
    y="tip",
    data=tips,
    kind="reg",
    xlim=(0, 60),
    ylim=(0, 12),
    color="r",
    size=7,
)

In [None]:
# An empty file name cannot be written, so save the joint plot through the
# savefig() helper, which also creates www/figs when it does not exist yet.
savefig(g, 'www/figs', 'joint.png')

In [None]:
# Collect the text entries of the joint axes' legend; get_legend() returns
# None when the axes carry no legend, in which case there is nothing to list.
leg = g.ax_joint.get_legend()
txt = leg.get_texts() if leg is not None else []
for t in txt:
    print(t)
# Show the string of the last legend entry (nothing when there is none).
txt[-1].get_text() if txt else None

### Box Plots
- Demo https://seaborn.pydata.org/examples/grouped_boxplot.html
- Documentation https://seaborn.pydata.org/generated/seaborn.boxplot.html?highlight=box%20plot#seaborn.boxplot

In [None]:
sns.set(style="ticks")

# Example data set: restaurant bills and tips
tips = sns.load_dataset("tips")

# Nested box plot of the total bill, grouped by day and split by sex
sns.boxplot(
    data=tips,
    x="day",
    y="total_bill",
    hue="sex",
    palette="PRGn",
)
# Offset the axis spines from the plot area and trim them
sns.despine(trim=True, offset=10)

### Violin Plot
https://seaborn.pydata.org/examples/elaborate_violinplot.html

In [None]:
sns.set(style="whitegrid")

# Brain-network correlation example data, read with a three-row column header
df = sns.load_dataset("brain_networks", header=[0, 1, 2], index_col=0)

# Keep only the columns whose "network" level is one of the selected networks
used_networks = [1, 3, 4, 5, 6, 7, 8, 11, 12, 13, 16, 17]
used_columns = df.columns.get_level_values("network")
used_columns = used_columns.astype(int).isin(used_networks)
df = df.loc[:, used_columns]

# Correlate all columns, average the rows within each network, then order the
# networks numerically and put them on the column axis
corr_df = df.corr()
corr_df = corr_df.groupby(level="network").mean()
corr_df.index = corr_df.index.astype(int)
corr_df = corr_df.sort_index().transpose()

# Figure and axes for the plot
f, ax = plt.subplots(figsize=(11, 6))

# Violin plot with a narrower bandwidth than the default
sns.violinplot(data=corr_df, palette="Set3", bw=.2, cut=1, linewidth=1)

# Fix the y-range and drop the left and bottom spines
ax.set(ylim=(-.7, 1.05))
sns.despine(left=True, bottom=True)

## Running self-standing programs to produce tons of plots
It's not a good idea to produce a large number of plots inside the notebook. It'll potentially slow down or crash the browser.
Instead, we run an external program to produce image files.

In [None]:
# %load plot_histograms.py
#!/usr/bin/env python
import sys, os
jp = os.path.join

import numpy as np
import pandas as pd

### use matplotlib headless https://gist.github.com/philippstroehle/6621189
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import seaborn as sb

DATADIR='data'
WEBDIR='www'

if __name__ == '__main__':
    # Read the input table from the configured data directory.
    df = pd.read_csv(jp(DATADIR, 'manfct80s.csv'), low_memory=False)
    print("Nrows %d, Ncols %d" % (df.shape[0], df.shape[1]))

    ## get numeric columns
    descr_tbl = df.describe().transpose()

    # The histograms are written into WEBDIR, so make sure it exists.
    if not os.path.isdir(WEBDIR):
        os.makedirs(WEBDIR)

    # One PNG histogram per described column.
    for col in descr_tbl.index:
        print("plotting col %s" % (col))
        fig = plt.figure(figsize=(12, 8))
        try:
            plt.hist(df[col])
            plt.title("%s" % (col))
            fig.savefig(jp(WEBDIR, "hist_%s.png" % (col)))
        except Exception as err:
            # Best effort: a column that cannot be plotted is reported and
            # skipped, but Ctrl-C / SystemExit still stop the run.
            print("skipping col %s (%s)" % (col, err))
        finally:
            # Release the figure; otherwise every figure stays alive in
            # pyplot and memory grows with the number of columns.
            plt.close(fig)


## Creating HTML Output

A basic HTML document looks like this
```
<!DOCTYPE html>
<html>
<head>
    <style>
    td {
        text-align:right;
    }
    </style>
</head>
<body>
</body>
</html>
```

# Cleaning
Standardization, or mean removal and variance scaling http://scikit-learn.org/stable/modules/preprocessing.html

**Standardization** of datasets is a common requirement for many machine learning estimators implemented in scikit-learn; they might behave badly if the individual features do not more or less look like **standard normally distributed** data: Gaussian with zero mean and unit variance.

In practice we often ignore the shape of the distribution and just transform the data to center it by removing the mean value of each feature, then scale it by dividing non-constant features by their standard deviation.

For instance, many elements used in the objective function of a learning algorithm (such as the RBF kernel of Support Vector Machines or the l1 and l2 regularizers of linear models) assume that all features are centered around zero and have variance in the same order. If a feature has a variance that is orders of magnitude larger than others, it might dominate the objective function and make the estimator unable to learn from other features correctly as expected.

In [5]:
from sklearn import preprocessing
import numpy as np

# Small 3x3 example matrix
X_train = np.array([
    [1., -1.,  2.],
    [2.,  0.,  0.],
    [0.,  1., -1.],
])
# Standardize the matrix with scikit-learn's scale() helper
X_scaled = preprocessing.scale(X_train)

X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

Scaled data has zero mean and unit variance:

In [7]:
# Column-wise mean and standard deviation of the scaled matrix.
# Single-argument print(...) renders the same on Python 2 and Python 3.
print('Mean: %s' % (X_scaled.mean(axis=0),))
print('Stdev: %s' % (X_scaled.std(axis=0),))

Mean: [ 0.  0.  0.]
Stdev: [ 1.  1.  1.]


However, there are other ways to standardize the data

## Scaling
- Min-Max Scaler http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
- Demo http://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py
- Notebook `plot_all_scaling.ipynb` (on a local Jupyter server: http://localhost:8888/notebooks/plot_all_scaling.ipynb)

## Outlier Detection

open notebook `plot_outlier_detection_housing_orig.ipynb`
- http://scikit-learn.org/stable/auto_examples/covariance/plot_outlier_detection.html
- http://scikit-learn.org/stable/auto_examples/applications/plot_outlier_detection_housing.html