Merge pull request #9 from alyst/enh_perf

Performance and Documenation Enhancements
lejon · Apr 24, 2017 · 7d7bb22 · 7d7bb22
2 parents d71ef23 + bd9611b
commit 7d7bb22
Show file tree

Hide file tree

Showing 7 changed files with 159 additions and 132 deletions.
diff --git a/README.md b/README.md
@@ -1,54 +1,51 @@
 [![Travis](https://travis-ci.org/lejon/TSne.jl.svg?branch=master)](https://travis-ci.org/lejon/TSne.jl)
 [![Coveralls](https://coveralls.io/repos/github/lejon/TSne.jl/badge.svg?branch=master)](https://coveralls.io/github/lejon/TSne.jl?branch=master)
 
-Julia t-SNE
-===========
+t-SNE (t-Stochastic Neighbor Embedding)
+=======================================
 
-Julia port of L.J.P. van der Maaten and G.E. Hintons T-SNE visualisation technique.
+Julia implementation of L.J.P. van der Maaten and G.E. Hintons [t-SNE visualisation technique](https://lvdmaaten.github.io/tsne/).
 
-Please observe, that it is not extensively tested. 
+Please observe that it is not yet extensively tested.
 
-The examples in the 'examples' dir requires you to have Gadfly and RDatasets installed
+The scripts in the `examples` folder require `Gadfly`, `MNIST` and `RDatasets` Julia packages.
 
-**Please note:** At some point something changed in Julia which caused poor results, it took a while before I noted this but now  I have updated the implementation so that it works again. See the link below for images rendered using this implementation.
+## Installation
 
-For some tips working with t-sne [Klick here] (http://lejon.github.io)
+  `julia> Pkg.clone("git://github.com/lejon/TSne.jl.git")`
 
-## Basic installation: 
+## Basic API usage
 
-  `julia> Pkg.clone("git://github.com/lejon/TSne.jl.git")`
-
-## Basic API usage: 
-
 ```jl
 using TSne, MNIST
 
-function normalize(A)
-	for col in 1:size(A)[2]
-        	std(A[:,col]) == 0 && continue 
-        	A[:,col] = (A[:,col]-mean(A[:,col])) / std(A[:,col])
-	end
-	A
+function rescale(A, dim::Integer=1)
+    res = A .- mean(A, dim)
+    res ./= map!(x -> x > 0.0 ? x : 1.0, std(A, dim))
+    res
 end
 
 data, labels = traindata()
-data = data'
-data = data[1:2500,:]
+data = convert(Matrix{Float64}, data[:, 1:2500])'
 # Normalize the data, this should be done if there are large scale differences in the dataset
-X = normalize(float(data)) 
+X = rescale(data, 1)
 
 Y = tsne(X, 2, 50, 1000, 20.0)
 
 using Gadfly
-labels = [string(i) for i in labels[1:2500]]
+labels = convert(Vector{String}, labels[1:2500])
 theplot = plot(x=Y[:,1], y=Y[:,2], color=labels)
 draw(PDF("myplot.pdf", 4inch, 3inch), theplot)
 ```
 
 ![](example.png)
 
-## Stand Alone Usage
+## Command line usage
 
 ```julia demo-csv.jl haveheader --labelcol=5 iris-headers.csv```
 
-Creates myplot.pdf with TSne result visuallized using Gadfly.
+Creates `myplot.pdf` with t-SNE result visualized using `Gadfly.jl`.
+
+## See also
+ * [Some tips working with t-SNE](http://lejon.github.io)
+ * [How to Use t-SNE Effectively](http://distill.pub/2016/misread-tsne/)
diff --git a/REQUIRE b/REQUIRE
@@ -1,6 +1,6 @@
 julia 0.4
 Compat 0.9
-FactCheck
+BaseTestNext
 RDatasets
 MNIST
 ProgressMeter
diff --git a/examples/demo-csv.jl b/examples/demo-csv.jl
@@ -14,18 +14,20 @@ Usage:
 Options:
   -h --help     Show this screen.
   --version     Show version.
-  --filename=   Path to CSV file 
-  --noheader    The CSV file does not have a header row	
-  --nolabel    The CSV file does not have a label column 
+  --filename=   Path to CSV file
+  --noheader    The CSV file does not have a header row
+  --nolabel    The CSV file does not have a label column
 
 """
 
-function normalize(A)
-	for col in 1:size(A)[2]
-        	std(A[:,col]) == 0 && continue 
-        	A[:,col] = (A[:,col]-mean(A[:,col])) / std(A[:,col])
-	end
-	A
+"""
+Normalize `A` columns, so that the mean and standard deviation
+of each column are 0 and 1, resp.
+"""
+function rescale(A, dim::Integer=1)
+    res = A .- mean(A, dim)
+    res ./= map!(x -> x > 0.0 ? x : 1.0, std(A, dim))
+    res
 end
 
 using DocOpt
@@ -34,35 +36,28 @@ arguments = docopt(doc, version=v"2.0.0")
 dump(arguments)
 
 if nothing==arguments["--labelcol"]
-	lblcol = -1
-else 
-	lblcol = parse(Int64,arguments["--labelcol"])
+    lblcol = -1
+else
+    lblcol = parse(Int64, arguments["--labelcol"])
 end
 
-df = readtable(arguments["<filename>"],header = nothing!=arguments["haveheader"])
+df = readtable(arguments["<filename>"], header = nothing!=arguments["haveheader"])
 
 if nothing!=arguments["--labelcolname"]
-	lblcol = find(x -> x==symbol(arguments["--labelcolname"]),names(df))[1]
+    lblcol = findfirst(x -> x==symbol(arguments["--labelcolname"]), names(df))
 end
 
 println("Data is $df")
-if lblcol>0
-	labels = df[:,lblcol]
-end
+labels = lblcol > 0 ? df[:, lblcol] : nothing
 
-dataset = df[filter(x -> x!=lblcol,1:ncol(df)),]
-data = float(convert(Array,dataset))
+dataset = df[filter(x -> x!=lblcol, 1:ncol(df)), :]
+data = convert(Matrix{Float64}, dataset)
 # Normalize the data, this should be done if there are large scale differences in the dataset
-X = normalize(data) 
+X = rescale(data)
 
 # Run t-SNE
 Y = tsne(X, 2, 50, 1000, 20.0)
 
 using Gadfly
-if lblcol>0
-	theplot = plot(x=Y[:,1], y=Y[:,2], color=labels)
-else 
-	theplot = plot(x=Y[:,1], y=Y[:,2])
-end
+theplot = plot(x=Y[:,1], y=Y[:,2], color=labels)
 draw(PDF("myplot.pdf", 8inch, 6inch), theplot)
-
diff --git a/examples/demo.jl b/examples/demo.jl
@@ -1,55 +1,55 @@
 using Gadfly
 using TSne
 
-function normalize(A)
-	for col in 1:size(A)[2]
-        	std(A[:,col]) == 0 && continue 
-        	A[:,col] = (A[:,col]-mean(A[:,col])) / std(A[:,col])
-	end
-	A
+"""
+Normalize `A` columns, so that the mean and standard deviation
+of each column are 0 and 1, resp.
+"""
+function rescale(A, dim::Integer=1)
+    res = A .- mean(A, dim)
+    res ./= map!(x -> x > 0.0 ? x : 1.0, std(A, dim))
+    res
 end
 
-if length(ARGS)==0 
+if length(ARGS)==0
     println("usage:\n\tjulia demo.jl iris\n\tjulia demo.jl mnist")
     exit(0)
 end
 
-use_iris = ARGS[1] == "iris"
-lables = ()
-
-if use_iris
+if ARGS[1] == "iris"
     using RDatasets
     println("Using Iris dataset.")
     iris = dataset("datasets","iris")
-    X = float(convert(Array,iris[:,1:4]))
-    labels = iris[:,5]
+    X = convert(Matrix{Float64}, iris[:, 1:4])
+    labels = iris[:, 5]
     plotname = "iris"
     initial_dims = -1
     iterations = 1500
     perplexity = 15
-else
+elseif ARGS[1] == "mnist"
     using MNIST
+    println("Using MNIST dataset.")
     X, labels = traindata()
-    labels = labels[1:2500]
-    X = X'
-    X = X[1:2500,:]
-    X = normalize(X)
+    npts = min(2500, size(X, 2), size(labels))
+    labels = labels[1:npts]
+    X = rescale(X[:, 1:npts]')
     plotname = "mnist"
     initial_dims = 50
     iterations = 1000
     perplexity = 20
+else
+    error("Unknown dataset \"", ARGS[1], "\"")
 end
 
-println("X dimensions are: " * string(size(X)))
+println("X dimensions are: ", size(X))
 Y = tsne(X, 2, initial_dims, iterations, perplexity)
-println("Y dimensions are: " * string(size(Y)))
+println("Y dimensions are: ", size(Y))
 
-writecsv(plotname*"_tsne_out.csv",Y)
-lbloutfile = open("labels.txt", "w")
-writedlm(lbloutfile,labels)
-close(lbloutfile)
+writecsv(plotname*"_tsne_out.csv", Y)
+open("labels.txt", "w") do io
+    writedlm(io, labels)
+end
 
 theplot = plot(x=Y[:,1], y=Y[:,2], color=labels)
-
 draw(PDF(plotname*".pdf", 4inch, 3inch), theplot)
 #draw(SVG(plotname*".svg", 4inch, 3inch), theplot)
diff --git a/src/TSne.jl b/src/TSne.jl
@@ -43,7 +43,7 @@ function perplexities(X::Matrix, tol::Number = 1e-5, perplexity::Number = 30.0;
                       verbose::Bool=false, progress::Bool=true)
     verbose && info("Computing pairwise distances...")
     (n, d) = size(X)
-    sum_XX = sumabs2(X, 2)
+    sum_XX = sum(abs2, X, 2)
     D = -2 * (X*X') .+ sum_XX .+ sum_XX' # euclidean distances between the points
     P = zeros(n, n) # perplexities matrix
     beta = ones(n)  # vector of Normal distribution precisions for each point
@@ -128,7 +128,7 @@ i.e. embed its points (rows) into `ndims` dimensions preserving close neighbours
 Different from the orginal implementation,
 the default is not to use PCA for initialization.
 
-# Arguments
+### Arguments
 * `reduce_dims` the number of the first dimensions of `X` PCA to use for t-SNE,
   if 0, all available dimension are used
 * `pca_init` whether to use the first `ndims` of `X` PCA as the initial t-SNE layout,
@@ -179,7 +179,7 @@ function tsne(X::Matrix, ndims::Integer = 2, reduce_dims::Integer = 0,
 
     # Compute P-values
     P = perplexities(X, 1e-5, perplexity, verbose=verbose, progress=progress)
-    P = P + P'
+    P = P + P' # make P symmetric
     scale!(P, 1.0/sum(P))
     scale!(P, cheat_scale)  # early exaggeration
     sum_P = cheat_scale
@@ -191,23 +191,31 @@ function tsne(X::Matrix, ndims::Integer = 2, reduce_dims::Integer = 0,
 
     # Run iterations
     progress && (pb = Progress(max_iter, "Computing t-SNE"))
-    Q = similar(P)
+    Q = zeros(P)
     for iter in 1:max_iter
         # Compute pairwise affinities
-        sumabs2!(sum_YY, Y)
-        # FIXME profiling indicates a lot of time is lost in copytri!()
-        A_mul_Bt!(Q, Y, Y)
+        sum!(abs2, sum_YY, Y)
+        BLAS.syrk!('U', 'N', 1.0, Y, 0.0, Q) # Q=YY^T, updates only the upper tri of Q
         @inbounds for j in 1:size(Q, 2)
-            Q[j,j] = 0.0
+            sum_YYj = sum_YY[j]
+            Qj = view(Q, :, j)
+            Qj[j] = 0.0
             @simd for i in 1:(j-1)
-                Q[j,i] = Q[i,j] = 1.0 / max(1.0, 1.0 - 2.0 * Q[i,j] + sum_YY[i] + sum_YY[j])
+                Qj[i] = 1.0 / max(1.0, 1.0 - 2.0 * Qj[i] + sum_YY[i] + sum_YYj)
             end
         end
-        sum_Q = sum(Q)
+        sum_Q = 2*sum(Q) # the diagonal and lower-tri part of Q is zero
 
-        # Compute gradient
-        @inbounds @simd for i in eachindex(P)
-            L[i] = (P[i] - Q[i]/sum_Q) * Q[i]
+        # Compute the gradient
+        inv_sumQ = 1/sum_Q
+        @inbounds for j in 1:size(L, 2)
+            Lj = view(L, :, j)
+            L_j = view(L, j, :)
+            Pj = view(P, :, j)
+            Qj = view(Q, :, j)
+            @simd for i in 1:j
+                L_j[i] = Lj[i] = (Pj[i] - Qj[i]*inv_sumQ) * Qj[i]
+            end
         end
         sum!(Lcolsums, L)
         @inbounds for (i, ldiag) in enumerate(Lcolsums)
@@ -219,26 +227,36 @@ function tsne(X::Matrix, ndims::Integer = 2, reduce_dims::Integer = 0,
         # Perform the update
         momentum = iter <= momentum_switch_iter ? initial_momentum : final_momentum
         @inbounds @simd for i in eachindex(gains)
-            flag = (dY[i] > 0) == (iY[i] > 0)
-            gains[i] = max(flag ? gains[i] * 0.8 : gains[i] + 0.2, min_gain)
+            gains[i] = max(ifelse((dY[i] > 0) == (iY[i] > 0),
+                                  gains[i] * 0.8,
+                                  gains[i] + 0.2),
+                           min_gain)
             iY[i] = momentum * iY[i] - eta * (gains[i] * dY[i])
             Y[i] += iY[i]
         end
         mean!(Ymean, Y)
         @inbounds for j in 1:size(Y, 2)
             YcolMean = Ymean[j]
-            @simd for i in 1:size(Y, 1)
-                Y[i, j] -= YcolMean
+            Yj = view(Y, :, j)
+            @simd for i in eachindex(Yj)
+                Yj[i] -= YcolMean
             end
         end
 
         # Compute current value of cost function
         if progress && (!isfinite(last_kldiv) || iter == max_iter || mod(iter, max(max_iter÷20, 10)) == 0)
             local kldiv = 0.0
-            @inbounds @simd for i in eachindex(P)
-                if (p = P[i]) > 0.0 && (q = Q[i]) > 0.0
-                    kldiv += p*log(p/q)
+            @inbounds for j in 1:size(P, 2)
+                Pj = view(P, :, j)
+                Qj = view(Q, :, j)
+                kldiv_j = 0.0
+                @simd for i in 1:j
+                    if (p = Pj[i]) > 0.0 && (q = Qj[i]) > 0.0
+                        # P and Q are symmetric
+                        kldiv_j += ifelse(i==j, 1, 2)*p*log(p/q)
+                    end
                 end
+                kldiv += kldiv_j
             end
             last_kldiv = kldiv/sum_P + log(sum_Q/sum_P) # adjust wrt P and Q scales
         end

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,4 +1,10 @@
-using FactCheck, RDatasets, TSne, MNIST
+using RDatasets, TSne, MNIST
+if VERSION >= v"0.5.0-dev+7720"
+    using Base.Test
+else
+    using BaseTestNext
+    const Test = BaseTestNext
+end
 
 my_tests = [
   "test_tsne.jl",
@@ -7,5 +13,3 @@ my_tests = [
 for t in my_tests
   include(t)
 end
-
-exitstatus()