Skip to content

Commit

Permalink
Merge pull request #9 from alyst/enh_perf
Browse files Browse the repository at this point in the history
Performance and Documenation Enhancements
  • Loading branch information
lejon committed Apr 24, 2017
2 parents d71ef23 + bd9611b commit 7d7bb22
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 132 deletions.
45 changes: 21 additions & 24 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,54 +1,51 @@
[![Travis](https://travis-ci.org/lejon/TSne.jl.svg?branch=master)](https://travis-ci.org/lejon/TSne.jl)
[![Coveralls](https://coveralls.io/repos/github/lejon/TSne.jl/badge.svg?branch=master)](https://coveralls.io/github/lejon/TSne.jl?branch=master)

Julia t-SNE
===========
t-SNE (t-Stochastic Neighbor Embedding)
=======================================

Julia port of L.J.P. van der Maaten and G.E. Hintons T-SNE visualisation technique.
Julia implementation of L.J.P. van der Maaten and G.E. Hintons [t-SNE visualisation technique](https://lvdmaaten.github.io/tsne/).

Please observe, that it is not extensively tested.
Please observe that it is not yet extensively tested.

The examples in the 'examples' dir requires you to have Gadfly and RDatasets installed
The scripts in the `examples` folder require `Gadfly`, `MNIST` and `RDatasets` Julia packages.

**Please note:** At some point something changed in Julia which caused poor results, it took a while before I noted this but now I have updated the implementation so that it works again. See the link below for images rendered using this implementation.
## Installation

For some tips working with t-sne [Klick here] (http://lejon.github.io)
`julia> Pkg.clone("git://github.com/lejon/TSne.jl.git")`

## Basic installation:
## Basic API usage

`julia> Pkg.clone("git://github.com/lejon/TSne.jl.git")`

## Basic API usage:

```jl
using TSne, MNIST

function normalize(A)
for col in 1:size(A)[2]
std(A[:,col]) == 0 && continue
A[:,col] = (A[:,col]-mean(A[:,col])) / std(A[:,col])
end
A
function rescale(A, dim::Integer=1)
res = A .- mean(A, dim)
res ./= map!(x -> x > 0.0 ? x : 1.0, std(A, dim))
res
end

data, labels = traindata()
data = data'
data = data[1:2500,:]
data = convert(Matrix{Float64}, data[:, 1:2500])'
# Normalize the data, this should be done if there are large scale differences in the dataset
X = normalize(float(data))
X = rescale(data, 1)

Y = tsne(X, 2, 50, 1000, 20.0)

using Gadfly
labels = [string(i) for i in labels[1:2500]]
labels = convert(Vector{String}, labels[1:2500])
theplot = plot(x=Y[:,1], y=Y[:,2], color=labels)
draw(PDF("myplot.pdf", 4inch, 3inch), theplot)
```

![](example.png)

## Stand Alone Usage
## Command line usage

```julia demo-csv.jl haveheader --labelcol=5 iris-headers.csv```

Creates myplot.pdf with TSne result visuallized using Gadfly.
Creates `myplot.pdf` with t-SNE result visualized using `Gadfly.jl`.

## See also
* [Some tips working with t-SNE](http://lejon.github.io)
* [How to Use t-SNE Effectively](http://distill.pub/2016/misread-tsne/)
2 changes: 1 addition & 1 deletion REQUIRE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
julia 0.4
Compat 0.9
FactCheck
BaseTestNext
RDatasets
MNIST
ProgressMeter
47 changes: 21 additions & 26 deletions examples/demo-csv.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,20 @@ Usage:
Options:
-h --help Show this screen.
--version Show version.
--filename= Path to CSV file
--noheader The CSV file does not have a header row
--nolabel The CSV file does not have a label column
--filename= Path to CSV file
--noheader The CSV file does not have a header row
--nolabel The CSV file does not have a label column
"""

function normalize(A)
for col in 1:size(A)[2]
std(A[:,col]) == 0 && continue
A[:,col] = (A[:,col]-mean(A[:,col])) / std(A[:,col])
end
A
"""
Normalize `A` columns, so that the mean and standard deviation
of each column are 0 and 1, resp.
"""
function rescale(A, dim::Integer=1)
res = A .- mean(A, dim)
res ./= map!(x -> x > 0.0 ? x : 1.0, std(A, dim))
res
end

using DocOpt
Expand All @@ -34,35 +36,28 @@ arguments = docopt(doc, version=v"2.0.0")
dump(arguments)

if nothing==arguments["--labelcol"]
lblcol = -1
else
lblcol = parse(Int64,arguments["--labelcol"])
lblcol = -1
else
lblcol = parse(Int64, arguments["--labelcol"])
end

df = readtable(arguments["<filename>"],header = nothing!=arguments["haveheader"])
df = readtable(arguments["<filename>"], header = nothing!=arguments["haveheader"])

if nothing!=arguments["--labelcolname"]
lblcol = find(x -> x==symbol(arguments["--labelcolname"]),names(df))[1]
lblcol = findfirst(x -> x==symbol(arguments["--labelcolname"]), names(df))
end

println("Data is $df")
if lblcol>0
labels = df[:,lblcol]
end
labels = lblcol > 0 ? df[:, lblcol] : nothing

dataset = df[filter(x -> x!=lblcol,1:ncol(df)),]
data = float(convert(Array,dataset))
dataset = df[filter(x -> x!=lblcol, 1:ncol(df)), :]
data = convert(Matrix{Float64}, dataset)
# Normalize the data, this should be done if there are large scale differences in the dataset
X = normalize(data)
X = rescale(data)

# Run t-SNE
Y = tsne(X, 2, 50, 1000, 20.0)

using Gadfly
if lblcol>0
theplot = plot(x=Y[:,1], y=Y[:,2], color=labels)
else
theplot = plot(x=Y[:,1], y=Y[:,2])
end
theplot = plot(x=Y[:,1], y=Y[:,2], color=labels)
draw(PDF("myplot.pdf", 8inch, 6inch), theplot)

50 changes: 25 additions & 25 deletions examples/demo.jl
Original file line number Diff line number Diff line change
@@ -1,55 +1,55 @@
using Gadfly
using TSne

function normalize(A)
for col in 1:size(A)[2]
std(A[:,col]) == 0 && continue
A[:,col] = (A[:,col]-mean(A[:,col])) / std(A[:,col])
end
A
"""
Normalize `A` columns, so that the mean and standard deviation
of each column are 0 and 1, resp.
"""
function rescale(A, dim::Integer=1)
res = A .- mean(A, dim)
res ./= map!(x -> x > 0.0 ? x : 1.0, std(A, dim))
res
end

if length(ARGS)==0
if length(ARGS)==0
println("usage:\n\tjulia demo.jl iris\n\tjulia demo.jl mnist")
exit(0)
end

use_iris = ARGS[1] == "iris"
lables = ()

if use_iris
if ARGS[1] == "iris"
using RDatasets
println("Using Iris dataset.")
iris = dataset("datasets","iris")
X = float(convert(Array,iris[:,1:4]))
labels = iris[:,5]
X = convert(Matrix{Float64}, iris[:, 1:4])
labels = iris[:, 5]
plotname = "iris"
initial_dims = -1
iterations = 1500
perplexity = 15
else
elseif ARGS[1] == "mnist"
using MNIST
println("Using MNIST dataset.")
X, labels = traindata()
labels = labels[1:2500]
X = X'
X = X[1:2500,:]
X = normalize(X)
npts = min(2500, size(X, 2), size(labels))
labels = labels[1:npts]
X = rescale(X[:, 1:npts]')
plotname = "mnist"
initial_dims = 50
iterations = 1000
perplexity = 20
else
error("Unknown dataset \"", ARGS[1], "\"")
end

println("X dimensions are: " * string(size(X)))
println("X dimensions are: ", size(X))
Y = tsne(X, 2, initial_dims, iterations, perplexity)
println("Y dimensions are: " * string(size(Y)))
println("Y dimensions are: ", size(Y))

writecsv(plotname*"_tsne_out.csv",Y)
lbloutfile = open("labels.txt", "w")
writedlm(lbloutfile,labels)
close(lbloutfile)
writecsv(plotname*"_tsne_out.csv", Y)
open("labels.txt", "w") do io
writedlm(io, labels)
end

theplot = plot(x=Y[:,1], y=Y[:,2], color=labels)

draw(PDF(plotname*".pdf", 4inch, 3inch), theplot)
#draw(SVG(plotname*".svg", 4inch, 3inch), theplot)
58 changes: 38 additions & 20 deletions src/TSne.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ function perplexities(X::Matrix, tol::Number = 1e-5, perplexity::Number = 30.0;
verbose::Bool=false, progress::Bool=true)
verbose && info("Computing pairwise distances...")
(n, d) = size(X)
sum_XX = sumabs2(X, 2)
sum_XX = sum(abs2, X, 2)
D = -2 * (X*X') .+ sum_XX .+ sum_XX' # euclidean distances between the points
P = zeros(n, n) # perplexities matrix
beta = ones(n) # vector of Normal distribution precisions for each point
Expand Down Expand Up @@ -128,7 +128,7 @@ i.e. embed its points (rows) into `ndims` dimensions preserving close neighbours
Different from the orginal implementation,
the default is not to use PCA for initialization.
# Arguments
### Arguments
* `reduce_dims` the number of the first dimensions of `X` PCA to use for t-SNE,
if 0, all available dimension are used
* `pca_init` whether to use the first `ndims` of `X` PCA as the initial t-SNE layout,
Expand Down Expand Up @@ -179,7 +179,7 @@ function tsne(X::Matrix, ndims::Integer = 2, reduce_dims::Integer = 0,

# Compute P-values
P = perplexities(X, 1e-5, perplexity, verbose=verbose, progress=progress)
P = P + P'
P = P + P' # make P symmetric
scale!(P, 1.0/sum(P))
scale!(P, cheat_scale) # early exaggeration
sum_P = cheat_scale
Expand All @@ -191,23 +191,31 @@ function tsne(X::Matrix, ndims::Integer = 2, reduce_dims::Integer = 0,

# Run iterations
progress && (pb = Progress(max_iter, "Computing t-SNE"))
Q = similar(P)
Q = zeros(P)
for iter in 1:max_iter
# Compute pairwise affinities
sumabs2!(sum_YY, Y)
# FIXME profiling indicates a lot of time is lost in copytri!()
A_mul_Bt!(Q, Y, Y)
sum!(abs2, sum_YY, Y)
BLAS.syrk!('U', 'N', 1.0, Y, 0.0, Q) # Q=YY^T, updates only the upper tri of Q
@inbounds for j in 1:size(Q, 2)
Q[j,j] = 0.0
sum_YYj = sum_YY[j]
Qj = view(Q, :, j)
Qj[j] = 0.0
@simd for i in 1:(j-1)
Q[j,i] = Q[i,j] = 1.0 / max(1.0, 1.0 - 2.0 * Q[i,j] + sum_YY[i] + sum_YY[j])
Qj[i] = 1.0 / max(1.0, 1.0 - 2.0 * Qj[i] + sum_YY[i] + sum_YYj)
end
end
sum_Q = sum(Q)
sum_Q = 2*sum(Q) # the diagonal and lower-tri part of Q is zero

# Compute gradient
@inbounds @simd for i in eachindex(P)
L[i] = (P[i] - Q[i]/sum_Q) * Q[i]
# Compute the gradient
inv_sumQ = 1/sum_Q
@inbounds for j in 1:size(L, 2)
Lj = view(L, :, j)
L_j = view(L, j, :)
Pj = view(P, :, j)
Qj = view(Q, :, j)
@simd for i in 1:j
L_j[i] = Lj[i] = (Pj[i] - Qj[i]*inv_sumQ) * Qj[i]
end
end
sum!(Lcolsums, L)
@inbounds for (i, ldiag) in enumerate(Lcolsums)
Expand All @@ -219,26 +227,36 @@ function tsne(X::Matrix, ndims::Integer = 2, reduce_dims::Integer = 0,
# Perform the update
momentum = iter <= momentum_switch_iter ? initial_momentum : final_momentum
@inbounds @simd for i in eachindex(gains)
flag = (dY[i] > 0) == (iY[i] > 0)
gains[i] = max(flag ? gains[i] * 0.8 : gains[i] + 0.2, min_gain)
gains[i] = max(ifelse((dY[i] > 0) == (iY[i] > 0),
gains[i] * 0.8,
gains[i] + 0.2),
min_gain)
iY[i] = momentum * iY[i] - eta * (gains[i] * dY[i])
Y[i] += iY[i]
end
mean!(Ymean, Y)
@inbounds for j in 1:size(Y, 2)
YcolMean = Ymean[j]
@simd for i in 1:size(Y, 1)
Y[i, j] -= YcolMean
Yj = view(Y, :, j)
@simd for i in eachindex(Yj)
Yj[i] -= YcolMean
end
end

# Compute current value of cost function
if progress && (!isfinite(last_kldiv) || iter == max_iter || mod(iter, max(max_iter÷20, 10)) == 0)
local kldiv = 0.0
@inbounds @simd for i in eachindex(P)
if (p = P[i]) > 0.0 && (q = Q[i]) > 0.0
kldiv += p*log(p/q)
@inbounds for j in 1:size(P, 2)
Pj = view(P, :, j)
Qj = view(Q, :, j)
kldiv_j = 0.0
@simd for i in 1:j
if (p = Pj[i]) > 0.0 && (q = Qj[i]) > 0.0
# P and Q are symmetric
kldiv_j += ifelse(i==j, 1, 2)*p*log(p/q)
end
end
kldiv += kldiv_j
end
last_kldiv = kldiv/sum_P + log(sum_Q/sum_P) # adjust wrt P and Q scales
end
Expand Down
10 changes: 7 additions & 3 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
using FactCheck, RDatasets, TSne, MNIST
using RDatasets, TSne, MNIST
if VERSION >= v"0.5.0-dev+7720"
using Base.Test
else
using BaseTestNext
const Test = BaseTestNext
end

my_tests = [
"test_tsne.jl",
Expand All @@ -7,5 +13,3 @@ my_tests = [
for t in my_tests
include(t)
end

exitstatus()

0 comments on commit 7d7bb22

Please sign in to comment.