From a5daec48140a23580e09c469933cffd5ad1b5d94 Mon Sep 17 00:00:00 2001
From: Anshu Avinash <anshu.avinash35@gmail.com>
Date: Wed, 23 Jul 2014 15:48:40 +0530
Subject: [PATCH] Added clsqr for solving least squares

---
 CMakeLists.txt       |   1 +
 clsqr/CMakeLists.txt |   6 +
 clsqr/lsqr.c         | 904 +++++++++++++++++++++++++++++++++++++++++++
 clsqr/lsqr.h         | 400 +++++++++++++++++++
 clsqr/lsqr_c.README  |  64 +++
 clsqr/test_lsqr.c    | 240 ++++++++++++
 clsqr/test_lsqr.h    |  26 ++
 clsqr/test_lstp.c    | 119 ++++++
 clsqr/test_mult.c    | 100 +++++
 clsqr/test_prog.c    |  36 ++
 sql/CMakeLists.txt   |   4 +-
 sql/opt_costmodel.h  |   1 +
 sql/sp_head.cc       |   1 +
 sql/sql_class.cc     |  83 +++-
 sql/sql_class.h      |   6 +-
 sql/sql_parse.cc     |   2 +-
 sql/sql_select.cc    |   1 +
 17 files changed, 1975 insertions(+), 19 deletions(-)
 create mode 100644 clsqr/CMakeLists.txt
 create mode 100644 clsqr/lsqr.c
 create mode 100644 clsqr/lsqr.h
 create mode 100644 clsqr/lsqr_c.README
 create mode 100644 clsqr/test_lsqr.c
 create mode 100644 clsqr/test_lsqr.h
 create mode 100644 clsqr/test_lstp.c
 create mode 100644 clsqr/test_mult.c
 create mode 100644 clsqr/test_prog.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 849da49d82a00..ac83a175a5b8d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -412,6 +412,7 @@ ADD_SUBDIRECTORY(libservices)
 ADD_SUBDIRECTORY(scripts)
 ADD_SUBDIRECTORY(sql/share)
 ADD_SUBDIRECTORY(support-files)
+ADD_SUBDIRECTORY(clsqr)
 
 IF(NOT WITHOUT_SERVER)
   ADD_SUBDIRECTORY(tests)
diff --git a/clsqr/CMakeLists.txt b/clsqr/CMakeLists.txt
new file mode 100644
index 0000000000000..c2dbbb6f040ee
--- /dev/null
+++ b/clsqr/CMakeLists.txt
@@ -0,0 +1,6 @@
+PROJECT(CLSQR C)
+
+SET(CLSQR_HEADERS ${PROJECT_BINARY_DIR}/lsqr.h)
+
+SET(CLSQR_SOURCES lsqr.c)
+ADD_CONVENIENCE_LIBRARY(clsqr ${CLSQR_HEADERS} ${CLSQR_SOURCES})
diff --git a/clsqr/lsqr.c b/clsqr/lsqr.c
new file mode 100644
index 0000000000000..4de1ff3029801
--- /dev/null
+++ b/clsqr/lsqr.c
@@ -0,0 +1,904 @@
+/*
+* lsqr.c
+* This is a C version of LSQR derived by James W. Howse <jhowse@lanl.gov>
+* from the Fortran 77 implementation of C. C. Paige and M. A. Saunders.
+*
+* This file defines functions for data type allocation and deallocation,
+* lsqr itself (the main algorithm),
+* and functions that scale, copy, and compute the Euclidean norm of a vector.
+*
+*
+* The following history comments are maintained by Michael Saunders.
+*
+* 08 Sep 1999: First version of lsqr.c (this file) provided by
+*              James W. Howse <jhowse@lanl.gov>.
+*
+* 16 Aug 2005: Forrester H Cole <fcole@Princeton.edu> reports "too many"
+"*              iterations" when a good solution x0 is supplied in
+*              input->sol_vec.  Note that the f77 and Matlab versions of LSQR
+*              intentionally don't allow x0 to be input.  They require
+*              the user to define r0 = b - A*x0 and solve A dx = r0,
+*              then add x = x0 + dx.  Note that nothing is gained unless
+*              larger stopping tolerances are set in the solve for dx
+*              (because dx doesn't need to be computed accurately if
+*              x0 is very good).  The same is true here.  BEWARE!
+*
+* 14 Feb 2006: Marc Grunberg <marc@renass.u-strasbg.fr> reports successful
+*              use on a sparse system of size 1422805 x 246588 with
+*              76993294 nonzero entries.  Also reports that the solution
+*              vector isn't initialized(!).  Added
+*                 output->sol_vec->elements[indx] = 0.0;
+*              to the first for loop.
+*
+*              Presumably you are supposed to provide x0 even if it is zero.
+*              Again, BEWARE!  I feel it's another reason not to allow x0.
+*
+* 18 Mar 2007: Dima Sorkin <dima.sorkin@gmail.com> reports long strings
+*              split across lines (fixed later by Krishna Mohan Gundu's
+*              script -- see 30 Aug 2007 below).
+*              Also recommends changing %i to %li for long ints.
+*
+* 05 Jul 2007: Joel Erickson <Joel.Erickson@twosigma.com> reports bug in
+*              resid_tol and resid_tol_mach used in the stopping rules:
+*                 output->mat_resid_norm   should be
+*                 output->frob_mat_norm    in both.
+*
+* 30 Aug 2007: Krishna Mohan Gundu <gkmohan@gmail.com> also reports
+*              long strings split across lines, and provides perl script
+*              fix_newlines.pl to change to the form "..." "...".
+*              The script is included in the sol/software/lsqr/c/ directory.
+*              It has been used to create this version of lsqr.c.
+*
+*              Krishna also questioned the fact that the test program
+*              seems to fail on the last few rectangular cases.
+*              Don't worry!  ||A'r|| is in fact quite small.
+*              atol and btol have been reduced from 1e-10 to 1e-15
+*              to allow more iterations and hence "success".
+*
+* 02 Sep 2007  Macro "sqr" now defined to be "lsqr_sqr".
+*/
+
+#include "lsqr.h"
+
+/*-------------------------------------------------------------------------*/
+/*                                                                         */
+/*  Define the data type allocation and deallocation functions.            */
+/*                                                                         */
+/*-------------------------------------------------------------------------*/
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the error handling function for LSQR and its          */
+     /*  associated functions.                                        */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+void lsqr_error( char  *msg,
+                 int   code  )
+{
+  fprintf(stderr, "\t%s\n", msg);
+  exit(code);
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the allocation function for a long vector with        */ 
+     /*  subscript range x[0..n-1].                                   */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+lvec *alloc_lvec( long  lvec_size )
+{
+  lvec *lng_vec;
+
+  lng_vec = (lvec *) malloc( sizeof(lvec) );
+  if (!lng_vec) lsqr_error("lsqr: vector allocation failure in function "
+"alloc_lvec()", -1);
+
+  lng_vec->elements = (long *) malloc( (lvec_size) * sizeof(long) );
+  if (!lng_vec->elements) lsqr_error("lsqr: element allocation failure in "
+"function alloc_lvec()", -1);
+
+  lng_vec->length = lvec_size;
+
+  return lng_vec;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the deallocation function for the vector              */ 
+     /*  created with the function 'alloc_lvec()'.                    */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+void free_lvec( lvec  *lng_vec )
+{
+  free((double *) (lng_vec->elements));
+  free((lvec *) (lng_vec));
+  return;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the allocation function for a double vector with      */ 
+     /*  subscript range x[0..n-1].                                   */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+dvec *alloc_dvec( long  dvec_size )
+{
+  dvec *dbl_vec;
+
+  dbl_vec = (dvec *) malloc( sizeof(dvec) );
+  if (!dbl_vec) lsqr_error("lsqr: vector allocation failure in function "
+"alloc_dvec()", -1);
+
+  dbl_vec->elements = (double *) malloc( (dvec_size) * sizeof(double) );
+  if (!dbl_vec->elements) lsqr_error("lsqr: element allocation failure in "
+"function alloc_dvec()", -1);
+
+  dbl_vec->length = dvec_size;
+
+  return dbl_vec;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the deallocation function for the vector              */ 
+     /*  created with the function 'alloc_dvec()'.                    */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+void free_dvec( dvec  *dbl_vec )
+{
+  free((double *) (dbl_vec->elements));
+  free((dvec *) (dbl_vec));
+  return;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the allocation function for all of the structures     */ 
+     /*  used by the function 'lsqr()'.                               */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+void alloc_lsqr_mem( lsqr_input   **in_struct,
+                     lsqr_output  **out_struct,
+                     lsqr_work    **wrk_struct,
+                     lsqr_func    **fnc_struct,
+                     long         max_num_rows,
+                     long         max_num_cols )
+{
+  *in_struct = (lsqr_input *) alloc_lsqr_in( max_num_rows, max_num_cols );
+  if (!in_struct) lsqr_error("lsqr: input structure allocation failure in "
+"function alloc_lsqr_in()", -1);
+
+  *out_struct = (lsqr_output *) alloc_lsqr_out( max_num_rows, max_num_cols );
+  if (!out_struct) lsqr_error("lsqr: output structure allocation failure in "
+"function alloc_lsqr_out()", -1); 
+
+  (*out_struct)->sol_vec = (dvec *) (*in_struct)->sol_vec;
+
+  *wrk_struct = (lsqr_work *) alloc_lsqr_wrk( max_num_rows, max_num_cols ); 
+  if (!wrk_struct) lsqr_error("lsqr: work structure allocation failure in "
+"function alloc_lsqr_wrk()", -1);
+
+  *fnc_struct = (lsqr_func *) alloc_lsqr_fnc( );
+  if (!fnc_struct) lsqr_error("lsqr: work structure allocation failure in "
+"function alloc_lsqr_fnc()", -1);
+
+  return;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the deallocation function for the structure           */ 
+     /*  created with the function 'alloc_lsqr_mem()'.                */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+void free_lsqr_mem( lsqr_input   *in_struct,
+                    lsqr_output  *out_struct,
+                    lsqr_work    *wrk_struct,
+                    lsqr_func    *fnc_struct  )
+{
+  free_lsqr_in(in_struct);
+  free_lsqr_out(out_struct);
+  free_lsqr_wrk(wrk_struct);
+  free_lsqr_fnc(fnc_struct);
+  return;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the allocation function for the structure of          */ 
+     /*  type 'lsqr_input'.                                           */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+lsqr_input *alloc_lsqr_in( long  max_num_rows,
+                           long  max_num_cols  )
+{
+  lsqr_input *in_struct;
+
+  in_struct = (lsqr_input *) malloc( sizeof(lsqr_input) );
+  if (!in_struct) lsqr_error("lsqr: input structure allocation failure in "
+"function alloc_lsqr_in()", -1);
+
+  in_struct->rhs_vec = (dvec *) alloc_dvec( max_num_rows );
+  if (!in_struct->rhs_vec) lsqr_error("lsqr: right hand side vector \'b\' "
+"allocation failure in function alloc_lsqr_in()", -1);
+
+  in_struct->sol_vec = (dvec *) alloc_dvec( max_num_cols );
+  if (!in_struct->sol_vec) lsqr_error("lsqr: solution vector \'x\' allocation "
+"failure in function alloc_lsqr_in()", -1);
+
+  return in_struct;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the deallocation function for the structure           */ 
+     /*  created with the function 'alloc_lsqr_in()'.                 */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+void free_lsqr_in( lsqr_input  *in_struct )
+{
+  free_dvec(in_struct->rhs_vec);
+  free_dvec(in_struct->sol_vec);
+  free((lsqr_input *) (in_struct));
+  return;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the allocation function for the structure of          */ 
+     /*  type 'lsqr_output'.                                          */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+lsqr_output *alloc_lsqr_out( long  max_num_rows,
+                             long  max_num_cols  )
+{
+  lsqr_output *out_struct;
+
+  out_struct = (lsqr_output *) malloc( sizeof(lsqr_output) );
+  if (!out_struct) lsqr_error("lsqr: output structure allocation failure in "
+"function alloc_lsqr_out()", -1);
+
+  out_struct->std_err_vec = (dvec *) alloc_dvec( max_num_cols );
+  if (!out_struct->std_err_vec) lsqr_error("lsqr: standard error vector \'e\' "
+"allocation failure in function alloc_lsqr_out()", -1);
+
+  return out_struct;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the deallocation function for the structure           */ 
+     /*  created with the function 'alloc_lsqr_out()'.                */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+void free_lsqr_out( lsqr_output  *out_struct )
+{
+  free_dvec(out_struct->std_err_vec);
+  free((lsqr_output *) (out_struct));
+  return;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the allocation function for the structure of          */ 
+     /*  type 'lsqr_work'.                                            */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+lsqr_work *alloc_lsqr_wrk( long  max_num_rows,
+                           long  max_num_cols )
+{
+  lsqr_work *wrk_struct;
+
+  wrk_struct = (lsqr_work *) malloc( sizeof(lsqr_work) );
+  if (!wrk_struct) lsqr_error("lsqr: work structure allocation failure in "
+"function alloc_lsqr_wrk()", -1);
+
+  wrk_struct->bidiag_wrk_vec = (dvec *) alloc_dvec( max_num_cols );
+  if (!wrk_struct->bidiag_wrk_vec) lsqr_error("lsqr: bidiagonalization work "
+"vector \'v\' allocation failure in function alloc_lsqr_wrk()", -1);
+
+  wrk_struct->srch_dir_vec = (dvec *) alloc_dvec( max_num_cols );
+  if (!wrk_struct->srch_dir_vec)
+     lsqr_error("lsqr: search direction vector \'w\' "
+"allocation failure in function alloc_lsqr_wrk()", -1);
+
+  return wrk_struct;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the deallocation function for the structure           */ 
+     /*  created with the function 'alloc_lsqr_wrk()'.                */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+void free_lsqr_wrk( lsqr_work  *wrk_struct )
+{
+  free_dvec(wrk_struct->bidiag_wrk_vec);
+  free_dvec(wrk_struct->srch_dir_vec);
+  free((lsqr_work *) (wrk_struct));
+  return;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the allocation function for the structure of          */ 
+     /*  type 'lsqr_func'.                                            */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+lsqr_func *alloc_lsqr_fnc( )
+{
+  lsqr_func *fnc_struct;
+
+  fnc_struct = (lsqr_func *) malloc( sizeof(lsqr_func) );
+  if (!fnc_struct) lsqr_error("lsqr: function structure allocation failure in "
+"function alloc_lsqr_fnc()", -1);
+
+  return fnc_struct;
+}
+
+     /*---------------------------------------------------------------*/
+     /*                                                               */
+     /*  Define the deallocation function for the structure           */ 
+     /*  created with the function 'alloc_lsqr_fnc()'.                */ 
+     /*                                                               */
+     /*---------------------------------------------------------------*/
+
+void free_lsqr_fnc( lsqr_func  *fnc_struct )
+{
+  free((lsqr_func *) (fnc_struct));
+  return;
+}
+
+/*-------------------------------------------------------------------------*/
+/*                                                                         */
+/*  Define the LSQR function.                                              */
+/*                                                                         */
+/*-------------------------------------------------------------------------*/
+
+/*
+*------------------------------------------------------------------------------
+*
+*     LSQR  finds a solution x to the following problems:
+*
+*     1. Unsymmetric equations --    solve  A*x = b
+*
+*     2. Linear least squares  --    solve  A*x = b
+*                                    in the least-squares sense
+*
+*     3. Damped least squares  --    solve  (   A    )*x = ( b )
+*                                           ( damp*I )     ( 0 )
+*                                    in the least-squares sense
+*
+*     where 'A' is a matrix with 'm' rows and 'n' columns, 'b' is an
+*     'm'-vector, and 'damp' is a scalar.  (All quantities are real.)
+*     The matrix 'A' is intended to be large and sparse.  
+*
+*
+*     Notation
+*     --------
+*
+*     The following quantities are used in discussing the subroutine
+*     parameters:
+*
+*     'Abar'   =  (   A    ),          'bbar'  =  ( b )
+*                 ( damp*I )                      ( 0 )
+*
+*     'r'      =  b  -  A*x,           'rbar'  =  bbar  -  Abar*x
+*
+*     'rnorm'  =  sqrt( norm(r)**2  +  damp**2 * norm(x)**2 )
+*              =  norm( rbar )
+*
+*     'rel_prec'  =  the relative precision of floating-point arithmetic
+*                    on the machine being used.  Typically 2.22e-16
+*                    with 64-bit arithmetic.
+*
+*     LSQR  minimizes the function 'rnorm' with respect to 'x'.
+*
+*
+*     References
+*     ----------
+*
+*     C.C. Paige and M.A. Saunders,  LSQR: An algorithm for sparse
+*          linear equations and sparse least squares,
+*          ACM Transactions on Mathematical Software 8, 1 (March 1982),
+*          pp. 43-71.
+*
+*     C.C. Paige and M.A. Saunders,  Algorithm 583, LSQR: Sparse
+*          linear equations and least-squares problems,
+*          ACM Transactions on Mathematical Software 8, 2 (June 1982),
+*          pp. 195-209.
+*
+*     C.L. Lawson, R.J. Hanson, D.R. Kincaid and F.T. Krogh,
+*          Basic linear algebra subprograms for Fortran usage,
+*          ACM Transactions on Mathematical Software 5, 3 (Sept 1979),
+*          pp. 308-323 and 324-325.
+*
+*------------------------------------------------------------------------------
+*/
+void lsqr( lsqr_input *input, lsqr_output *output, lsqr_work *work,
+           lsqr_func *func,
+           void *prod )
+{
+  double  dvec_norm2( dvec * );
+
+  long    indx,
+          term_iter,
+          term_iter_max;
+  
+  double  alpha, 
+          beta,
+          rhobar,
+          phibar,
+          bnorm,
+          bbnorm,
+          cs1,
+          sn1,
+          psi,
+          rho,
+          cs,
+          sn,
+          theta,
+          phi,
+          tau,
+          ddnorm,
+          delta,
+          gammabar,
+          zetabar,
+          gamma,
+          cs2,
+          sn2,
+          zeta,
+          xxnorm,
+          res,
+          resid_tol,
+          cond_tol,
+          resid_tol_mach,
+          temp,
+          stop_crit_1,
+          stop_crit_2,
+          stop_crit_3;
+
+  static char term_msg[8][80] = 
+  {
+    "The exact solution is x = x0",
+    "The residual Ax - b is small enough, given ATOL and BTOL",
+    "The least squares error is small enough, given ATOL",
+    "The estimated condition number has exceeded CONLIM",
+    "The residual Ax - b is small enough, given machine precision",
+    "The least squares error is small enough, given machine precision",
+    "The estimated condition number has exceeded machine precision",
+    "The iteration limit has been reached"
+  };
+
+  if( input->lsqr_fp_out != NULL )
+    fprintf( input->lsqr_fp_out, "  Least Squares Solution of A*x = b\n"
+"        The matrix A has %7li rows and %7li columns\n"
+"        The damping parameter is\tDAMP = %10.2e\n"
+"        ATOL = %10.2e\t\tCONDLIM = %10.2e\n"
+"        BTOL = %10.2e\t\tITERLIM = %10li\n\n",
+        input->num_rows, input->num_cols, input->damp_val, input->rel_mat_err,
+        input->cond_lim, input->rel_rhs_err, input->max_iter );
+  
+  output->term_flag = 0;
+  term_iter = 0;
+
+  output->num_iters = 0;
+
+  output->frob_mat_norm = 0.0;
+  output->mat_cond_num = 0.0;
+  output->sol_norm = 0.0;
+  
+  for(indx = 0; indx < input->num_cols; indx++)
+    {
+      work->bidiag_wrk_vec->elements[indx] = 0.0;
+      work->srch_dir_vec->elements[indx] = 0.0;
+      output->std_err_vec->elements[indx] = 0.0;
+      output->sol_vec->elements[indx] = 0.0;
+    }
+
+  bbnorm = 0.0;
+  ddnorm = 0.0;
+  xxnorm = 0.0;
+
+  cs2 = -1.0;
+  sn2 = 0.0;
+  zeta = 0.0;
+  res = 0.0;
+
+  if( input->cond_lim > 0.0 )
+    cond_tol = 1.0 / input->cond_lim;
+  else
+    cond_tol = DBL_EPSILON;
+
+  alpha = 0.0;
+  beta = 0.0;
+/*
+*  Set up the initial vectors u and v for bidiagonalization.  These satisfy 
+*  the relations
+*             BETA*u = b - A*x0 
+*             ALPHA*v = A^T*u
+*/
+    /* Compute b - A*x0 and store in vector u which initially held vector b */
+  dvec_scale( (-1.0), input->rhs_vec );
+  func->mat_vec_prod( 0, input->sol_vec, input->rhs_vec, prod );
+  dvec_scale( (-1.0), input->rhs_vec );
+  
+    /* compute Euclidean length of u and store as BETA */
+  beta = dvec_norm2( input->rhs_vec );
+  
+  if( beta > 0.0 )
+    {
+        /* scale vector u by the inverse of BETA */
+      dvec_scale( (1.0 / beta), input->rhs_vec );
+
+        /* Compute matrix-vector product A^T*u and store it in vector v */
+      func->mat_vec_prod( 1, work->bidiag_wrk_vec, input->rhs_vec, prod );
+      
+        /* compute Euclidean length of v and store as ALPHA */
+      alpha = dvec_norm2( work->bidiag_wrk_vec );      
+    }
+  
+  if( alpha > 0.0 )
+    {
+        /* scale vector v by the inverse of ALPHA */
+      dvec_scale( (1.0 / alpha), work->bidiag_wrk_vec );
+
+        /* copy vector v to vector w */
+      dvec_copy( work->bidiag_wrk_vec, work->srch_dir_vec );
+    }    
+
+  output->mat_resid_norm = alpha * beta;
+  output->resid_norm = beta;
+  bnorm = beta;
+/*
+*  If the norm || A^T r || is zero, then the initial guess is the exact
+*  solution.  Exit and report this.
+*/
+  if( (output->mat_resid_norm == 0.0) && (input->lsqr_fp_out != NULL) )
+    {
+      fprintf( input->lsqr_fp_out, "\tISTOP = %3li\t\t\tITER = %9li\n"
+"        || A ||_F = %13.5e\tcond( A ) = %13.5e\n"
+"        || r ||_2 = %13.5e\t|| A^T r ||_2 = %13.5e\n"
+"        || b ||_2 = %13.5e\t|| x - x0 ||_2 = %13.5e\n\n", 
+        output->term_flag, output->num_iters, output->frob_mat_norm, 
+        output->mat_cond_num, output->resid_norm, output->mat_resid_norm,
+        bnorm, output->sol_norm );
+      
+      fprintf( input->lsqr_fp_out, "  %s\n\n", term_msg[output->term_flag]);
+      
+      return;
+    }
+
+  rhobar = alpha;
+  phibar = beta;
+/*
+*  If statistics are printed at each iteration, print a header and the initial
+*  values for each quantity.
+*/
+  if( input->lsqr_fp_out != NULL )
+    {
+      fprintf( input->lsqr_fp_out,
+"  ITER     || r ||    Compatible  "
+"||A^T r|| / ||A|| ||r||  || A ||    cond( A )\n\n" );
+
+      stop_crit_1 = 1.0;
+      stop_crit_2 = alpha / beta;
+
+      fprintf( input->lsqr_fp_out,
+      "%6li %13.5e %10.2e \t%10.2e \t%10.2e  %10.2e\n",
+               output->num_iters, output->resid_norm, stop_crit_1, stop_crit_2,
+               output->frob_mat_norm, output->mat_cond_num);
+    }
+ 
+/*
+*  The main iteration loop is continued as long as no stopping criteria
+*  are satisfied and the number of total iterations is less than some upper
+*  bound.
+*/
+  while( output->term_flag == 0 )
+    {
+      output->num_iters++;
+/*      
+*     Perform the next step of the bidiagonalization to obtain
+*     the next vectors u and v, and the scalars ALPHA and BETA.
+*     These satisfy the relations
+*                BETA*u  =  A*v  -  ALPHA*u,
+*                ALFA*v  =  A^T*u  -  BETA*v.
+*/      
+         /* scale vector u by the negative of ALPHA */
+      dvec_scale( (-alpha), input->rhs_vec );
+
+        /* compute A*v - ALPHA*u and store in vector u */
+      func->mat_vec_prod( 0, work->bidiag_wrk_vec, input->rhs_vec, prod );
+
+        /* compute Euclidean length of u and store as BETA */
+      beta = dvec_norm2( input->rhs_vec );
+
+        /* accumulate this quantity to estimate Frobenius norm of matrix A */
+   /* bbnorm += sqr(alpha) + sqr(beta) + sqr(input->damp_val);*/
+      bbnorm += alpha*alpha + beta*beta
+                + input->damp_val*input->damp_val;
+
+      if( beta > 0.0 )
+        {
+            /* scale vector u by the inverse of BETA */
+          dvec_scale( (1.0 / beta), input->rhs_vec );
+
+            /* scale vector v by the negative of BETA */
+          dvec_scale( (-beta), work->bidiag_wrk_vec );
+
+            /* compute A^T*u - BETA*v and store in vector v */
+          func->mat_vec_prod( 1, work->bidiag_wrk_vec, input->rhs_vec, prod );
+      
+            /* compute Euclidean length of v and store as ALPHA */
+          alpha = dvec_norm2( work->bidiag_wrk_vec );
+
+          if( alpha > 0.0 )
+              /* scale vector v by the inverse of ALPHA */
+            dvec_scale( (1.0 / alpha), work->bidiag_wrk_vec );
+        }
+/*
+*     Use a plane rotation to eliminate the damping parameter.
+*     This alters the diagonal (RHOBAR) of the lower-bidiagonal matrix.
+*/
+      cs1 = rhobar / sqrt( lsqr_sqr(rhobar) + lsqr_sqr(input->damp_val) );
+      sn1 = input->damp_val
+                   / sqrt( lsqr_sqr(rhobar) + lsqr_sqr(input->damp_val) );
+      
+      psi = sn1 * phibar;
+      phibar = cs1 * phibar;
+/*      
+*     Use a plane rotation to eliminate the subdiagonal element (BETA)
+*     of the lower-bidiagonal matrix, giving an upper-bidiagonal matrix.
+*/
+      rho = sqrt( lsqr_sqr(rhobar) + lsqr_sqr(input->damp_val)
+                                   + lsqr_sqr(beta) );
+      cs  = sqrt( lsqr_sqr(rhobar) + lsqr_sqr(input->damp_val) ) / rho;
+      sn  = beta / rho;
+
+      theta = sn * alpha;
+      rhobar = -cs * alpha;
+      phi = cs * phibar;
+      phibar = sn * phibar;
+      tau = sn * phi;
+/*
+*     Update the solution vector x, the search direction vector w, and the 
+*     standard error estimates vector se.
+*/     
+      for(indx = 0; indx < input->num_cols; indx++)
+        {
+            /* update the solution vector x */
+          output->sol_vec->elements[indx] += (phi / rho) * 
+            work->srch_dir_vec->elements[indx];
+
+            /* update the standard error estimates vector se */
+          output->std_err_vec->elements[indx] += lsqr_sqr( (1.0 / rho) * 
+            work->srch_dir_vec->elements[indx] );
+
+            /* accumulate this quantity to estimate condition number of A 
+*/
+          ddnorm += lsqr_sqr( (1.0 / rho) *
+            work->srch_dir_vec->elements[indx] );
+
+            /* update the search direction vector w */
+          work->srch_dir_vec->elements[indx] = 
+          work->bidiag_wrk_vec->elements[indx] -
+            (theta / rho) * work->srch_dir_vec->elements[indx];
+        }
+/*
+*     Use a plane rotation on the right to eliminate the super-diagonal element
+*     (THETA) of the upper-bidiagonal matrix.  Then use the result to estimate 
+*     the solution norm || x ||.
+*/
+      delta = sn2 * rho;
+      gammabar = -cs2 * rho;
+      zetabar = (phi - delta * zeta) / gammabar;
+
+        /* compute an estimate of the solution norm || x || */
+      output->sol_norm = sqrt( xxnorm + lsqr_sqr(zetabar) );
+
+      gamma = sqrt( lsqr_sqr(gammabar) + lsqr_sqr(theta) );
+      cs2 = gammabar / gamma;
+      sn2 = theta / gamma;
+      zeta = (phi - delta * zeta) / gamma;
+
+        /* accumulate this quantity to estimate solution norm || x || */
+      xxnorm += lsqr_sqr(zeta);
+/*
+*     Estimate the Frobenius norm and condition of the matrix A, and the 
+*     Euclidean norms of the vectors r and A^T*r.
+*/
+      output->frob_mat_norm = sqrt( bbnorm );
+      output->mat_cond_num = output->frob_mat_norm * sqrt( ddnorm );
+
+      res += lsqr_sqr(psi);
+      output->resid_norm = sqrt( lsqr_sqr(phibar) + res );
+
+      output->mat_resid_norm = alpha * fabs( tau );
+/*
+*     Use these norms to estimate the values of the three stopping criteria.
+*/
+      stop_crit_1 = output->resid_norm / bnorm;
+
+      stop_crit_2 = 0.0;
+      if( output->resid_norm > 0.0 )
+        stop_crit_2 = output->mat_resid_norm / ( output->frob_mat_norm * 
+          output->resid_norm );
+
+      stop_crit_3 = 1.0 / output->mat_cond_num;
+
+/*    05 Jul 2007: Bug reported by Joel Erickson <Joel.Erickson@twosigma.com>.
+*/
+      resid_tol = input->rel_rhs_err + input->rel_mat_err * 
+        output->frob_mat_norm *         /* (not output->mat_resid_norm *) */
+        output->sol_norm / bnorm;
+
+      resid_tol_mach = DBL_EPSILON + DBL_EPSILON *
+        output->frob_mat_norm *         /* (not output->mat_resid_norm *) */
+        output->sol_norm / bnorm;
+/*
+*     Check to see if any of the stopping criteria are satisfied.
+*     First compare the computed criteria to the machine precision.
+*     Second compare the computed criteria to the the user specified precision.
+*/
+        /* iteration limit reached */
+      if( output->num_iters >= input->max_iter )
+        output->term_flag = 7;
+
+        /* condition number greater than machine precision */
+      if( stop_crit_3 <= DBL_EPSILON )
+        output->term_flag = 6;
+        /* least squares error less than machine precision */
+      if( stop_crit_2 <= DBL_EPSILON )
+        output->term_flag = 5;
+        /* residual less than a function of machine precision */
+      if( stop_crit_1 <= resid_tol_mach )
+        output->term_flag = 4;
+
+        /* condition number greater than CONLIM */
+      if( stop_crit_3 <= cond_tol )
+        output->term_flag = 3;
+        /* least squares error less than ATOL */
+      if( stop_crit_2 <= input->rel_mat_err )
+        output->term_flag = 2;
+        /* residual less than a function of ATOL and BTOL */
+      if( stop_crit_1 <= resid_tol )
+        output->term_flag = 1;
+/*
+*  If statistics are printed at each iteration, print a header and the initial
+*  values for each quantity.
+*/
+      if( input->lsqr_fp_out != NULL )
+        {
+          fprintf( input->lsqr_fp_out,
+          "%6li %13.5e %10.2e \t%10.2e \t%10.2e %10.2e\n",
+                   output->num_iters, output->resid_norm, stop_crit_1, 
+                   stop_crit_2,
+                   output->frob_mat_norm, output->mat_cond_num);
+        }
+/*
+*     The convergence criteria are required to be met on NCONV consecutive 
+*     iterations, where NCONV is set below.  Suggested values are 1, 2, or 3.
+*/
+      if( output->term_flag == 0 )
+        term_iter = -1;
+
+      term_iter_max = 1;
+      term_iter++;
+
+      if( (term_iter < term_iter_max) &&
+          (output->num_iters < input->max_iter) )
+        output->term_flag = 0;
+    } /* end while loop */
+/*
+*  Finish computing the standard error estimates vector se.
+*/
+  temp = 1.0;
+
+  if( input->num_rows > input->num_cols )
+    temp = ( double ) ( input->num_rows - input->num_cols );
+
+  if( lsqr_sqr(input->damp_val) > 0.0 )
+    temp = ( double ) ( input->num_rows );
+  
+  temp = output->resid_norm / sqrt( temp );
+  
+  for(indx = 0; indx < input->num_cols; indx++)
+      /* update the standard error estimates vector se */
+    output->std_err_vec->elements[indx] = temp * 
+      sqrt( output->std_err_vec->elements[indx] );
+/*
+*  If statistics are printed at each iteration, print the statistics for the
+*  stopping condition.
+*/
+  if( input->lsqr_fp_out != NULL )
+    {
+      fprintf( input->lsqr_fp_out, "\n\tISTOP = %3li\t\t\tITER = %9li\n"
+"        || A ||_F = %13.5e\tcond( A ) = %13.5e\n"
+"        || r ||_2 = %13.5e\t|| A^T r ||_2 = %13.5e\n"
+"        || b ||_2 = %13.5e\t|| x - x0 ||_2 = %13.5e\n\n", 
+        output->term_flag, output->num_iters, output->frob_mat_norm, 
+        output->mat_cond_num, output->resid_norm, output->mat_resid_norm,
+        bnorm, output->sol_norm );
+
+      fprintf( input->lsqr_fp_out, "  %s\n\n", term_msg[output->term_flag]);
+      
+    }
+
+  return;
+}
+
+/*-------------------------------------------------------------------------*/
+/*                                                                         */
+/*  Define the function 'dvec_norm2()'.  This function takes a vector      */
+/*  arguement and computes the Euclidean or L2 norm of this vector.  Note  */
+/*  that this is a version of the BLAS function 'dnrm2()' rewritten to     */
+/*  use the current data structures.                                       */
+/*                                                                         */
+/*-------------------------------------------------------------------------*/
+
+double dvec_norm2(dvec *vec)
+{
+  long   indx;
+  double norm;
+  
+  norm = 0.0;
+
+  for(indx = 0; indx < vec->length; indx++)
+    norm += lsqr_sqr(vec->elements[indx]);
+
+  return sqrt(norm);
+}
+
+
+/*-------------------------------------------------------------------------*/
+/*                                                                         */
+/*  Define the function 'dvec_scale()'.  This function takes a vector      */
+/*  and a scalar as arguments and multiplies each element of the vector    */
+/*  by the scalar.  Note  that this is a version of the BLAS function      */
+/*  'dscal()' rewritten to use the current data structures.                */
+/*                                                                         */
+/*-------------------------------------------------------------------------*/
+
+void dvec_scale(double scal, dvec *vec)
+{
+  long   indx;
+
+  for(indx = 0; indx < vec->length; indx++)
+    vec->elements[indx] *= scal;
+
+  return;
+}
+
+/*-------------------------------------------------------------------------*/
+/*                                                                         */
+/*  Define the function 'dvec_copy()'.  This function takes two vectors    */
+/*  as arguements and copies the contents of the first into the second.    */
+/*  Note  that this is a version of the BLAS function 'dcopy()' rewritten  */
+/*  to use the current data structures.                                    */
+/*                                                                         */
+/*-------------------------------------------------------------------------*/
+
+void dvec_copy(dvec *orig, dvec *copy)
+{
+  long   indx;
+
+  for(indx = 0; indx < orig->length; indx++)
+    copy->elements[indx] = orig->elements[indx];
+
+  return;
+}
diff --git a/clsqr/lsqr.h b/clsqr/lsqr.h
new file mode 100644
index 0000000000000..1f1d59a10ff88
--- /dev/null
+++ b/clsqr/lsqr.h
@@ -0,0 +1,400 @@
+/*
+* lsqr.h
+* Contains auxiliary functions, data type definitions, and function 
+* prototypes for the iterative linear solver LSQR. 
+*
+* 08 Sep 1999: First version from James W. Howse <jhowse@lanl.gov>
+* 02 Sep 2007: Dima Sorkin <dima.sorkin@gmail.com> advises that
+*              in C++ the use of macros is strongly deprecated.
+*              Originally, sqr, max, min, round, TRUE, FALSE, PI
+*              were defined here.  Now,
+*                 min, round, TRUE, FALSE are gone (never used);
+*                 PI is defined explicitly in test_lstp.c;
+*                 max is changed to lsqr_max in test_lsqr.c;
+*/
+
+/*
+*------------------------------------------------------------------------------
+*
+*     LSQR  finds a solution x to the following problems:
+*
+*     1. Unsymmetric equations --    solve  A*x = b
+*
+*     2. Linear least squares  --    solve  A*x = b
+*                                    in the least-squares sense
+*
+*     3. Damped least squares  --    solve  (   A    )*x = ( b )
+*                                           ( damp*I )     ( 0 )
+*                                    in the least-squares sense
+*
+*     where 'A' is a matrix with 'm' rows and 'n' columns, 'b' is an
+*     'm'-vector, and 'damp' is a scalar.  (All quantities are real.)
+*     The matrix 'A' is intended to be large and sparse.  
+*
+*
+*     Notation
+*     --------
+*
+*     The following quantities are used in discussing the subroutine
+*     parameters:
+*
+*     'Abar'   =  (   A    ),          'bbar'  =  ( b )
+*                 ( damp*I )                      ( 0 )
+*
+*     'r'      =  b  -  A*x,           'rbar'  =  bbar  -  Abar*x
+*
+*     'rnorm'  =  sqrt( norm(r)**2  +  damp**2 * norm(x)**2 )
+*              =  norm( rbar )
+*
+*     'rel_prec'  =  the relative precision of floating-point arithmetic
+*                    on the machine being used.  For example, on the IBM 370,
+*                    'rel_prec' is about 1.0E-6 and 1.0D-16 in single and double
+*                    precision respectively.
+*
+*     LSQR  minimizes the function 'rnorm' with respect to 'x'.
+*
+*------------------------------------------------------------------------------
+*/
+
+/*---------------*/
+/* Include files */
+/*---------------*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <limits.h>
+#include <float.h>
+
+/* 02 Sep 2007: The following 7 macros
+                   sqr, max, min, round, TRUE, FALSE, PI
+                are no longer defined here.
+                (min, round, TRUE, FALSE were never used anyway.)
+                "sqr" has been changed to lsqr_sqr.
+*/
+
+/*------------------------*/
+/* User-defined functions */
+/*------------------------*/
+
+#define lsqr_sqr(a)	( (a) * (a) )
+/*
+#define max(a,b)	( (a) < (b) ? (b) : (a) )
+#define min(a,b)	( (a) < (b) ? (a) : (b) )
+#define round(a)        ( (a) > 0.0 ? (int)((a) + 0.5) : (int)((a) - 0.5) )
+*/
+
+/*----------------------*/
+/* Default declarations */
+/*----------------------*/
+
+/*
+#define TRUE	(1)
+#define FALSE	(0)
+#define PI      (4.0 * atan(1.0))
+*/
+
+/*------------------*/
+/* Type definitions */
+/*------------------*/
+
+typedef struct LONG_VECTOR {
+  long     length;
+  long     *elements;
+} lvec;
+
+typedef struct DOUBLE_VECTOR {
+  long     length;
+  double   *elements;
+} dvec;
+
+/*
+*------------------------------------------------------------------------------
+*
+*     Input Quantities
+*     ----------------
+*
+*     num_rows     input  The number of rows (e.g., 'm') in the matrix A.
+*
+*     num_cols     input  The number of columns (e.g., 'n') in the matrix A.
+*
+*     damp_val     input  The damping parameter for problem 3 above.
+*                         ('damp_val' should be 0.0 for problems 1 and 2.)
+*                         If the system A*x = b is incompatible, values
+*                         of 'damp_val' in the range
+*                            0 to sqrt('rel_prec')*norm(A)
+*                         will probably have a negligible effect.
+*                         Larger values of 'damp_val' will tend to decrease
+*                         the norm of x and reduce the number of
+*                         iterations required by LSQR.
+*
+*                         The work per iteration and the storage needed
+*                         by LSQR are the same for all values of 'damp_val'.
+*
+*     rel_mat_err  input  An estimate of the relative error in the data
+*                         defining the matrix 'A'.  For example,
+*                         if 'A' is accurate to about 6 digits, set
+*                         rel_mat_err = 1.0e-6 .
+*
+*     rel_rhs_err  input  An extimate of the relative error in the data
+*                         defining the right hand side (rhs) vector 'b'.  For 
+*                         example, if 'b' is accurate to about 6 digits, set
+*                         rel_rhs_err = 1.0e-6 .
+*
+*     cond_lim     input  An upper limit on cond('Abar'), the apparent
+*                         condition number of the matrix 'Abar'.
+*                         Iterations will be terminated if a computed
+*                         estimate of cond('Abar') exceeds 'cond_lim'.
+*                         This is intended to prevent certain small or
+*                         zero singular values of 'A' or 'Abar' from
+*                         coming into effect and causing unwanted growth
+*                         in the computed solution.
+*
+*                         'cond_lim' and 'damp_val' may be used separately or
+*                         together to regularize ill-conditioned systems.
+*
+*                         Normally, 'cond_lim' should be in the range
+*                         1000 to 1/rel_prec.
+*                         Suggested value:
+*                         cond_lim = 1/(100*rel_prec)  for compatible systems,
+*                         cond_lim = 1/(10*sqrt(rel_prec)) for least squares.
+*
+*             Note:  If the user is not concerned about the parameters
+*             'rel_mat_err', 'rel_rhs_err' and 'cond_lim', any or all of them 
+*             may be set to zero.  The effect will be the same as the values
+*             'rel_prec', 'rel_prec' and 1/rel_prec respectively.
+*
+*     max_iter     input  An upper limit on the number of iterations.
+*                         Suggested value:
+*                         max_iter = n/2   for well-conditioned systems
+*                                          with clustered singular values,
+*                         max_iter = 4*n   otherwise.
+*
+*     lsqr_fp_out  input  Pointer to the file for sending printed output.  If  
+*                         not NULL, a summary will be printed to the file that 
+*                         'lsqr_fp_out' points to.
+*
+*     rhs_vec      input  The right hand side (rhs) vector 'b'.  This vector
+*                         has a length of 'm' (i.e., 'num_rows').  The routine 
+*                         LSQR is designed to over-write 'rhs_vec'.
+*
+*     sol_vec      input  The initial guess for the solution vector 'x'.  This 
+*                         vector has a length of 'n' (i.e., 'num_cols').  The  
+*                         routine LSQR is designed to over-write 'sol_vec'.
+*
+*------------------------------------------------------------------------------
+*/
+
+typedef struct LSQR_INPUTS {
+  long     num_rows;
+  long     num_cols;
+  double   damp_val;
+  double   rel_mat_err;
+  double   rel_rhs_err;
+  double   cond_lim;
+  long     max_iter;
+  FILE     *lsqr_fp_out;
+  dvec     *rhs_vec;
+  dvec     *sol_vec;
+} lsqr_input;
+
+/*
+*------------------------------------------------------------------------------
+*
+*     Output Quantities
+*     -----------------
+*
+*     term_flag       output  An integer giving the reason for termination:
+*
+*                     0       x = x0  is the exact solution.
+*                             No iterations were performed.
+*
+*                     1       The equations A*x = b are probably compatible.  
+*                             Norm(A*x - b) is sufficiently small, given the 
+*                             values of 'rel_mat_err' and 'rel_rhs_err'.
+*
+*                     2       The system A*x = b is probably not
+*                             compatible.  A least-squares solution has
+*                             been obtained that is sufficiently accurate,
+*                             given the value of 'rel_mat_err'.
+*
+*                     3       An estimate of cond('Abar') has exceeded
+*                             'cond_lim'.  The system A*x = b appears to be
+*                             ill-conditioned.  Otherwise, there could be an
+*                             error in subroutine APROD.
+*
+*                     4       The equations A*x = b are probably
+*                             compatible.  Norm(A*x - b) is as small as
+*                             seems reasonable on this machine.
+*
+*                     5       The system A*x = b is probably not
+*                             compatible.  A least-squares solution has
+*                             been obtained that is as accurate as seems
+*                             reasonable on this machine.
+*
+*                     6       Cond('Abar') seems to be so large that there is
+*                             no point in doing further iterations,
+*                             given the precision of this machine.
+*                             There could be an error in subroutine APROD.
+*
+*                     7       The iteration limit 'max_iter' was reached.
+*  
+*     num_iters       output  The number of iterations performed.
+*
+*     frob_mat_norm   output  An estimate of the Frobenius norm of 'Abar'.
+*                             This is the square-root of the sum of squares
+*                             of the elements of 'Abar'.
+*                             If 'damp_val' is small and if the columns of 'A'
+*                             have all been scaled to have length 1.0,
+*                             'frob_mat_norm' should increase to roughly
+*                             sqrt('n').
+*                             A radically different value for 'frob_mat_norm' 
+*                             may indicate an error in subroutine APROD (there
+*                             may be an inconsistency between modes 1 and 2).
+*
+*     mat_cond_num    output  An estimate of cond('Abar'), the condition
+*                             number of 'Abar'.  A very high value of 
+*                             'mat_cond_num'
+*                             may again indicate an error in APROD.
+*
+*     resid_norm      output  An estimate of the final value of norm('rbar'),
+*                             the function being minimized (see notation
+*                             above).  This will be small if A*x = b has
+*                             a solution.
+*
+*     mat_resid_norm  output  An estimate of the final value of
+*                             norm( Abar(transpose)*rbar ), the norm of
+*                             the residual for the usual normal equations.
+*                             This should be small in all cases.  
+*                             ('mat_resid_norm'
+*                             will often be smaller than the true value
+*                             computed from the output vector 'x'.)
+*
+*     sol_norm        output  An estimate of the norm of the final
+*                             solution vector 'x'.
+*
+*     sol_vec         output  The vector which returns the computed solution 
+*                             'x'.
+*                             This vector has a length of 'n' (i.e., 
+*                             'num_cols').
+*
+*     std_err_vec     output  The vector which returns the standard error 
+*                             estimates  for the components of 'x'.
+*                             This vector has a length of 'n'
+*                             (i.e., 'num_cols').  For each i, std_err_vec(i) 
+*                             is set to the value
+*                             'resid_norm' * sqrt( sigma(i,i) / T ),
+*                             where sigma(i,i) is an estimate of the i-th
+*                             diagonal of the inverse of Abar(transpose)*Abar
+*                             and  T = 1      if  m <= n,
+*                                  T = m - n  if  m > n  and  damp_val = 0,
+*                                  T = m      if  damp_val != 0.
+*
+*------------------------------------------------------------------------------
+*/
+
+typedef struct LSQR_OUTPUTS {
+  long     term_flag;
+  long     num_iters;
+  double   frob_mat_norm;
+  double   mat_cond_num;
+  double   resid_norm;
+  double   mat_resid_norm;
+  double   sol_norm;
+  dvec     *sol_vec;
+  dvec     *std_err_vec;
+} lsqr_output;
+
+/*
+*------------------------------------------------------------------------------
+*
+*     Workspace Quantities
+*     --------------------
+*
+*     bidiag_wrk_vec  workspace  This float vector is a workspace for the 
+*                                current iteration of the
+*                                Lanczos bidiagonalization.  
+*                                This vector has length 'n' (i.e., 'num_cols').
+*
+*     srch_dir_vec    workspace  This float vector contains the search direction 
+*                                at the current iteration.  This vector has a 
+*                                length of 'n' (i.e., 'num_cols').
+*
+*------------------------------------------------------------------------------
+*/
+
+typedef struct LSQR_WORK {
+  dvec     *bidiag_wrk_vec;
+  dvec     *srch_dir_vec;
+} lsqr_work;
+
+/*
+*------------------------------------------------------------------------------
+*
+*     Functions Called
+*     ----------------
+*
+*     mat_vec_prod       functions  A pointer to a function that performs the 
+*                                   matrix-vector multiplication.  The function
+*                                   has the calling parameters:
+*
+*                                       APROD ( mode, x, y, prod_data ),
+*
+*                                   and it must perform the following functions:
+*
+*                                       If MODE = 0, compute  y = y + A*x,
+*                                       If MODE = 1, compute  x = x + A^T*y.
+*
+*                                   The vectors x and y are input parameters in 
+*                                   both cases.
+*                                   If mode = 0, y should be altered without
+*                                   changing x.
+*                                   If mode = 2, x should be altered without
+*                                   changing y.
+*                                   The argument prod_data is a pointer to a
+*                                   user-defined structure that contains
+*                                   any data need by the function APROD that is
+*                                   not used by LSQR.  If no additional data is 
+*                                   needed by APROD, then prod_data should be
+*                                   the NULL pointer.
+*------------------------------------------------------------------------------
+*/
+
+typedef struct LSQR_FUNC {
+  void     (*mat_vec_prod) (long, dvec *, dvec *, void *);
+} lsqr_func;
+
+/*---------------------*/
+/* Function prototypes */
+/*---------------------*/
+
+void lsqr_error( char *, int );
+
+lvec *alloc_lvec( long );
+void free_lvec( lvec * );
+
+dvec *alloc_dvec( long );
+void free_dvec( dvec * );
+
+void alloc_lsqr_mem( lsqr_input **, lsqr_output **, lsqr_work **, lsqr_func **, 
+		     long, long ); 
+void free_lsqr_mem( lsqr_input *, lsqr_output *, lsqr_work *, lsqr_func * );
+
+lsqr_input *alloc_lsqr_in( long, long );
+void free_lsqr_in( lsqr_input * );
+
+lsqr_output *alloc_lsqr_out( long, long );
+void free_lsqr_out( lsqr_output * );
+
+lsqr_work *alloc_lsqr_wrk( long, long );
+void free_lsqr_wrk( lsqr_work * );
+
+lsqr_func *alloc_lsqr_fnc( );
+void free_lsqr_fnc( lsqr_func * );
+
+void lsqr( lsqr_input *, lsqr_output *, lsqr_work *, lsqr_func *, void * );
+
+double dvec_norm2( dvec * );
+void dvec_scale( double, dvec * );
+void dvec_copy( dvec *, dvec * );
+
diff --git a/clsqr/lsqr_c.README b/clsqr/lsqr_c.README
new file mode 100644
index 0000000000000..40e57b7589219
--- /dev/null
+++ b/clsqr/lsqr_c.README
@@ -0,0 +1,64 @@
+lsqr_c.README
+
+The software for LSQR (C version) is provided by SOL, Stanford University
+under the terms of the OSI Common Public License (CPL):
+http://www.opensource.org/licenses/cpl1.0.php
+
+
+08 Sep 1999: First set of files provided by James W. Howse <jhowse@lanl.gov>.
+
+11 Oct 2000: Files unpacked by Michael Saunders.
+             Lines longer than 80 characters had extraneous carriage returns.
+             Tried to repair them, but not sure if I/O formats will be correct.
+
+09 Dec 2005: Beware that this C version of LSQR allows x0 to be input.
+             I RECOMMEND THAT YOU ALWAYS USE x0 = 0 (an n-vector of zeros).
+
+             Otherwise, you must be conscious of the fact that LSQR will be
+             solving A dx = r0 = b - A x0 and adding parts of dx to x0
+             AS IT GOES.  If x0 is very close to a solution (that is, if dx
+             is very small compared to x0), many digits of accuracy in dx
+             will be lost.  It is safer to solve A dx = r0 (with x0 = 0)
+             and then compute x = x0 + dx yourself at the end.
+
+             In either case, providing x0 will generally not save work
+             (will not reduce the number of LSQR iterations) unless you
+             use larger tolerances atol and btol than you would for Ax = b.
+
+31 Aug 2007: Some bugs fixed (see lsqr.c).
+             Note that some of the rectangular test problems
+             at the end of test_prog.c are really too ill-conditioned
+             for any least-squares method to give an accurate solution.
+             Ignore the "LSQR seems to have failed" messages if ||A'r||
+             is quite small.
+
+31 Aug 2007: Simplified C version of LSQR contributed by
+             Michael Friedlander.  Added as clsqr2 to
+             http://www.stanford.edu/group/SOL/lsqr.html
+
+02 Aug 2007: Dima Sorkin <dima.sorkin@gmail.com> recommends changes
+             to improve portability.  Also the following compile command:
+                gcc -ansi -pedantic -Wall -c lsqr.c
+
+Please send comments to Michael Saunders, SOL, Stanford University
+                        saunders@stanford.edu  650-723-1875
+-----------------------------------------------------------------------------
+
+The C version of LSQR involves the following files:
+
+   lsqr.c
+   lsqr.h
+   makefile
+   test_lsqr.c
+   test_lsqr.h
+   test_lstp.c
+   test_mult.c
+   test_prog.c
+
+test_lstp.c   contains lstp, a least-square test problem generator.
+              It constructs data for a matrix A and a vector b.
+test_mult.c   computes products Ax and A'y.
+test_lsqr.c   calls lstp to generate a problem,
+              then calls lsqr to solve it.
+test_prog.c   calls test_lsqr several times to generate and solve
+              various problems.
diff --git a/clsqr/test_lsqr.c b/clsqr/test_lsqr.c
new file mode 100644
index 0000000000000..4b3e8b090f0f8
--- /dev/null
+++ b/clsqr/test_lsqr.c
@@ -0,0 +1,240 @@
+/*
+* test_lsqr.c
+* For testing C version of LSQR.
+*
+* 08 Sep 1999: First version from James W. Howse <jhowse@lanl.gov>
+* 02 Sep 2007: "max" redefined here to be "lsqr_max".
+*              test_in->rel_mat_err changed from 1e-10 to 1e-15 and
+*              test_in->rel_rhs_err changed from 1e-10 to 1e-15
+*              to ensure "success" on the ill-conditioned rectangular
+*              test cases.
+*/
+
+#include "lsqr.h"
+
+#include "test_lsqr.h"
+
+# define lsqr_max(a,b) ( (a) < (b) ? (b) : (a) )
+
+void test_lsqr( long    num_rows,
+		long    num_cols,
+		long    duplicate_param,
+		long    power_param,
+		double  damp_param )
+/*     
+*     ------------------------------------------------------------------
+*     This is an example driver routine for running LSQR.
+*     It generates a test problem, solves it, and examines the results.
+*     Note that subroutine APROD must be declared 'extern' if it is 
+*     used only in the call to LSQR.
+*     ------------------------------------------------------------------
+*/
+{
+  double       dvec_norm2( dvec * );
+
+  long         col_indx;
+  
+  double       unorm,
+               enorm,
+               etol,
+               act_mat_cond_num,
+               act_resid_norm;
+  
+  dvec         *act_rhs_vec,
+               *act_sol_vec;
+
+  lsqr_input   *test_in;
+  lsqr_output  *test_out;
+  lsqr_work    *test_work;
+  lsqr_func    *test_func;
+  prod_data    *test_prod;
+
+  extern void  test_mult( long, dvec *, dvec *, void * );
+  extern void  lstp_mult( dvec *, dvec *, dvec * );
+  
+  alloc_lsqr_mem( &test_in, &test_out, &test_work, &test_func, num_rows, 
+                  num_cols );
+
+  test_func->mat_vec_prod = test_mult;
+  test_in->lsqr_fp_out = stdout;
+/*
+*     Allocate the the data structure of type 'prod_data'.
+*/
+  test_prod = (prod_data *) malloc( sizeof(prod_data) );
+
+  test_prod->d_vec = (dvec *) alloc_dvec( num_cols );
+  if (!test_prod->d_vec) lsqr_error("test_prog: vector \'d\' allocation failure "
+"in function lsqr_test()", -1);
+
+  test_prod->hy_vec = (dvec *) alloc_dvec( num_rows );
+  if (!test_prod->hy_vec) lsqr_error("test_prog: vector \'hy\' allocation "
+"failure in function lsqr_test()", -1);
+
+  test_prod->hz_vec = (dvec *) alloc_dvec( num_cols );
+  if (!test_prod->hz_vec) lsqr_error("test_prog: vector \'hz\' allocation "
+"failure in function lsqr_test()", -1);
+
+  test_prod->work_vec = (dvec *) alloc_dvec( lsqr_max( num_rows, num_cols ) );
+  if (!test_prod->work_vec) lsqr_error("test_prog: vector \'work\' allocation "
+"failure in function lsqr_test()", -1);
+/*
+*     Allocate the other needed vectors.
+*/
+  act_rhs_vec = (dvec *) alloc_dvec( num_rows );
+  if (!act_rhs_vec) lsqr_error("test_prog: vector \'b\' allocation failure in "
+"function lsqr_test()", -1);
+
+  act_sol_vec = (dvec *) alloc_dvec( num_cols );
+  if (!act_sol_vec) lsqr_error("test_prog: vector \'x_ans\' allocation failure "
+"in function lsqr_test()", -1);
+/*
+*     Set the true solution for the test problem.
+*/
+  for( col_indx = 0; col_indx < num_cols; col_indx++ )
+    act_sol_vec->elements[col_indx] = (double)( num_cols - (col_indx + 1) );
+/*
+*     Call the routine LSTP which generates the least squares test problem.
+*/
+  lstp( duplicate_param, power_param, damp_param, act_sol_vec, act_rhs_vec, 
+        test_prod, &act_mat_cond_num, &act_resid_norm );
+/*
+*     Copy the right-hand side vector generated for the test problem by LSTP 
+*     into the right-hand side vector for LSQR. 
+*/
+  dvec_copy( act_rhs_vec, test_in->rhs_vec );
+/*
+*  Set the initial guess for LSQR.
+*/
+  for( col_indx = 0; col_indx < num_cols; col_indx++ )
+    test_in->sol_vec->elements[col_indx] = 0.0;
+/*
+*     Print information about the test problem.
+*/
+  fprintf( test_in->lsqr_fp_out,
+	   
+"--------------------------------------------------------------------\n" );
+  fprintf( test_in->lsqr_fp_out,
+    "Least-Squares Test Problem      P( %5li %5li %5li "
+    "%5li %12.2e )\n\n",
+	   num_rows, num_cols, duplicate_param, power_param, damp_param );
+  fprintf( test_in->lsqr_fp_out,
+    "Condition No. = %12.4e     Residual Function = "
+    "%17.9e\n",
+	   act_mat_cond_num, act_resid_norm );
+  fprintf( test_in->lsqr_fp_out,
+	   
+"--------------------------------------------------------------------\n\n" );
+/*
+*     Set the input parameters for LSQR.
+*/
+  test_in->num_rows = num_rows;
+  test_in->num_cols = num_cols;
+  test_in->damp_val = damp_param;
+  test_in->rel_mat_err = 1.0e-15;  /* Previously 1.0e-10; */
+  test_in->rel_rhs_err = 1.0e-15;  /* Previously 1.0e-10; */
+  test_in->cond_lim = 10.0 * act_mat_cond_num;
+  test_in->max_iter = num_rows + num_cols + 50;
+/*
+*     Solve the test problem generated by LTSP by calling the routine LSQR.
+*/
+  lsqr( test_in, test_out, test_work, test_func, test_prod );
+
+/*
+*     Examine the results.
+*     Print the residual norms RNORM and ARNORM given by LSQR, and then compute 
+*     their true values in terms of the solution X obtained by  LSQR.
+*     At least one of these norms should be small.
+*/
+  fprintf( test_in->lsqr_fp_out, "\t\t\tResidual Norm       Residual Norm       "
+"Solution Norm\n" );
+  fprintf( test_in->lsqr_fp_out, "\t\t\t||A x - b||_2   ||A^T A x - A^T b||_2   "
+"||x - x0||_2\n\n" );
+  fprintf( test_in->lsqr_fp_out, "Estimated by LSQR  %17.5e   %17.5e   "
+"%17.5e\n",
+	   test_out->resid_norm, test_out->mat_resid_norm, test_out->sol_norm );
+/*
+*     Compute  U = A*x - b.
+*     This is the negative of the usual residual vector.  It will be close to 
+*     zero only if b is a compatible rhs and x is close to a solution.
+*/
+  dvec_copy( act_rhs_vec, test_in->rhs_vec );
+  dvec_scale( (-1.0), test_in->rhs_vec );
+  test_mult( 0, test_out->sol_vec, test_in->rhs_vec, test_prod );
+/*
+*     Compute  v = A^T*u  +  DAMP**2 * x.
+*     This will be close to zero in all cases if x is close to a solution.
+*/
+  dvec_copy( test_out->sol_vec, test_work->bidiag_wrk_vec );
+  dvec_scale( ( lsqr_sqr(damp_param) ), test_work->bidiag_wrk_vec );
+  test_mult( 1, test_work->bidiag_wrk_vec, test_in->rhs_vec, test_prod );
+/*
+*     Compute the norms associated with  x, u, v.
+*/
+  test_out->sol_norm = dvec_norm2( test_out->sol_vec );
+  unorm = dvec_norm2( test_in->rhs_vec );
+  test_out->resid_norm = sqrt( lsqr_sqr( unorm ) + lsqr_sqr( damp_param ) * 
+			       lsqr_sqr( test_out->sol_norm ) ); 
+  test_out->mat_resid_norm = dvec_norm2( test_work->bidiag_wrk_vec );
+  fprintf( test_in->lsqr_fp_out, "Computed from X    %17.5e   %17.5e   "
+"%17.5e\n\n",
+	   test_out->resid_norm, test_out->mat_resid_norm, test_out->sol_norm );
+/*
+*     Print the solution and standard error estimates from LSQR.
+*/
+  fprintf( test_in->lsqr_fp_out, "Solution X\n" );
+  for( col_indx = 0; col_indx < num_cols; col_indx++ )
+    {
+      fprintf( test_in->lsqr_fp_out, "%6li %14.6g", col_indx, 
+	       test_out->sol_vec->elements[col_indx] );
+      if( ( (col_indx + 1) % 4 ) == 0 )
+	fprintf( test_in->lsqr_fp_out, "\n" );
+    }
+
+  fprintf( test_in->lsqr_fp_out, "\n\n" );
+
+  fprintf( test_in->lsqr_fp_out, "Standard Errors SE\n" );
+  for( col_indx = 0; col_indx < num_cols; col_indx++ )
+    {
+      fprintf( test_in->lsqr_fp_out, "%6li %14.6g", col_indx, 
+	       test_out->std_err_vec->elements[col_indx] );
+      if( ( (col_indx + 1) % 4 ) == 0 )
+	fprintf( test_in->lsqr_fp_out, "\n" );
+    }
+  fprintf( test_in->lsqr_fp_out, "\n\n" );
+/*
+*     Print information about the accuracy of the LSQR solution.
+*/
+  for( col_indx = 0; col_indx < num_cols; col_indx++ )
+    test_work->srch_dir_vec->elements[col_indx] = 
+test_out->sol_vec->elements[col_indx]
+      - act_sol_vec->elements[col_indx];
+
+  enorm = dvec_norm2( test_work->srch_dir_vec ) /
+                      ( 1.0 + dvec_norm2( act_sol_vec ) );
+  etol = 1.0e-5;
+  
+  if( enorm <= etol )
+    fprintf( test_in->lsqr_fp_out, "LSQR appears to be successful.    Relative "
+"error in X = %10.2e\n\n",
+	     enorm );
+  if( enorm > etol )
+    fprintf( test_in->lsqr_fp_out, "LSQR appears to have failed.      Relative "
+"error in X = %10.2e\n\n",
+	     enorm );
+/*
+*     Free the memory allocated for LSQR.
+*/
+  free_lsqr_mem( test_in, test_out, test_work, test_func );
+
+  free_dvec( test_prod->d_vec );
+  free_dvec( test_prod->hy_vec );
+  free_dvec( test_prod->hz_vec );
+  free_dvec( test_prod->work_vec );
+  free( (prod_data *) (test_prod) );
+
+  free_dvec( act_rhs_vec );
+  free_dvec( act_sol_vec );
+
+  return;
+}
+
diff --git a/clsqr/test_lsqr.h b/clsqr/test_lsqr.h
new file mode 100644
index 0000000000000..684e8bf5f2972
--- /dev/null
+++ b/clsqr/test_lsqr.h
@@ -0,0 +1,26 @@
+/*
+* test_lsqr.h
+*
+* 08 Sep 1999: First version from James W. Howse <jhowse@lanl.gov>
+*/
+
+/*------------------*/
+/* Type definitions */
+/*------------------*/
+
+typedef struct PROD_DATA {
+  dvec  *d_vec;
+  dvec  *hy_vec;
+  dvec  *hz_vec;
+  dvec  *work_vec;
+} prod_data;
+
+/*---------------------*/
+/* Function prototypes */
+/*---------------------*/
+
+void test_lsqr( long, long, long, long, double );
+void lstp( long, long, double, dvec *, dvec *, prod_data *,
+           double *, double * );
+void test_mult( long, dvec *, dvec *, void * );
+void lstp_mult( dvec *, dvec *, dvec * );
diff --git a/clsqr/test_lstp.c b/clsqr/test_lstp.c
new file mode 100644
index 0000000000000..d8a585b4a292b
--- /dev/null
+++ b/clsqr/test_lstp.c
@@ -0,0 +1,119 @@
+/*
+* test_lstp.c
+* For testing lstp, the LSQR test-problem generator.
+*
+* 08 Sep 1999: First version from James W. Howse <jhowse@lanl.gov>
+* 02 Sep 2007: Changed 4.0 * PI to 12.566370614359172.
+*/
+
+#include "lsqr.h"
+
+#include "test_lsqr.h"
+
+void lstp( long       duplicate_param,
+	   long       power_param,
+	   double     damp_param,
+	   dvec       *act_sol_vec,
+	   dvec       *act_rhs_vec, 
+           prod_data  *test_prod, 
+	   double     *act_mat_cond_num, 
+	   double     *act_resid_norm )
+/*     
+*     ------------------------------------------------------------------
+*     LSTP  generates a sparse least-squares test problem of the form
+*
+*                (   A    )*X = ( B ) 
+*                ( DAMP*I )     ( 0 )
+*
+*     having a specified solution X.  The matrix A is constructed
+*     in the form  A = HY*D*HZ,  where D is an 'num_rows' by 'num_cols'
+*     diagonal matrix, and HY and HZ are Householder transformations.
+*
+*     The first 4 parameters are input to LSTP.  The remainder are
+*     output.  LSTP is intended for use with 'num_rows' >= 'num_cols'.
+*     ------------------------------------------------------------------
+*/
+{
+  double   dvec_norm2( dvec * );
+
+  long     num_rows,
+	   num_cols,
+	   row_indx,
+           col_indx,
+           lwork;
+
+  double   mnorm,
+           nnorm,
+           dwork;
+
+  num_rows = act_rhs_vec->length;
+  num_cols = act_sol_vec->length;
+/*  
+*     Make two vectors of norm 1.0 for the Householder transformations.
+* 02 Sep 2007: Changed 4.0 * PI to 12.566370614359172.
+*/
+  for( row_indx = 0; row_indx < num_rows; row_indx++ )
+    test_prod->hy_vec->elements[row_indx] =
+       sin( (double)(row_indx + 1) * 12.566370614359172
+        / ( (double)(num_rows) ) );
+  
+  for( col_indx = 0; col_indx < num_cols; col_indx++ )
+    test_prod->hz_vec->elements[col_indx] =
+       cos( (double)(col_indx + 1) * 12.566370614359172
+        / ( (double)(num_cols) ) );
+  
+  mnorm = dvec_norm2( test_prod->hy_vec );
+  nnorm = dvec_norm2( test_prod->hz_vec );
+  dvec_scale( (-1.0 / mnorm), test_prod->hy_vec );
+  dvec_scale( (-1.0 / nnorm), test_prod->hz_vec );
+/*  
+*     Set the diagonal matrix D.  These are the singular values of A.
+*/
+  for( col_indx = 0; col_indx < num_cols; col_indx++ )
+    {
+      lwork = ( col_indx + duplicate_param ) / duplicate_param;
+      dwork = (double) lwork * (double) duplicate_param;
+      dwork /= (double) num_cols;
+      test_prod->d_vec->elements[col_indx] = pow( dwork, ( (double) power_param 
+) );
+    }
+
+  (*act_mat_cond_num) =
+     sqrt((lsqr_sqr(test_prod->d_vec->elements[num_cols - 1]) +
+           lsqr_sqr(damp_param)) / 
+          (lsqr_sqr(test_prod->d_vec->elements[0])            +
+           lsqr_sqr(damp_param)));
+/*  
+*     Compute the residual vector, storing it in  B.
+*     It takes the form  HY*( S )
+*                           ( T )
+*     where S is obtained from  D*S = DAMP**2 * HZ * X and T can be anything.
+*/
+  lstp_mult( test_prod->hz_vec, act_sol_vec, act_rhs_vec );
+  
+  for( col_indx = 0; col_indx < num_cols; col_indx++ )
+    act_rhs_vec->elements[col_indx] *= lsqr_sqr( damp_param ) /
+      test_prod->d_vec->elements[col_indx];
+  
+  dwork = 1.0;
+  for( row_indx = num_cols; row_indx < num_rows; row_indx++ )
+    {
+      lwork = row_indx - (num_cols - 1);
+      act_rhs_vec->elements[row_indx] = ( dwork * (double)(lwork) ) /
+	( (double)(num_rows) );
+      dwork = -dwork;
+    }
+  
+  lstp_mult( test_prod->hy_vec, act_rhs_vec, act_rhs_vec );
+/*
+*     Now compute the true B = RESIDUAL + A*X.
+*/
+  mnorm = dvec_norm2( act_rhs_vec );
+  nnorm = dvec_norm2( act_sol_vec );
+  (*act_resid_norm) = sqrt( lsqr_sqr( mnorm ) +
+                            lsqr_sqr( damp_param ) * lsqr_sqr( nnorm ) );
+  
+  test_mult( 0, act_sol_vec, act_rhs_vec, test_prod );
+
+  return;
+}
diff --git a/clsqr/test_mult.c b/clsqr/test_mult.c
new file mode 100644
index 0000000000000..a2cb6371ed9fc
--- /dev/null
+++ b/clsqr/test_mult.c
@@ -0,0 +1,100 @@
+/*
+* test_mult.c
+* For testing the matrix-vector product routine for the LSQR test problems.
+*
+* 08 Sep 1999: First version from James W. Howse <jhowse@lanl.gov>
+*/
+
+#include "lsqr.h"
+
+#include "test_lsqr.h"
+
+void test_mult( long  mode,
+	        dvec  *x, 
+                dvec  *y,
+                void  *prod )
+/*
+*     ------------------------------------------------------------------
+*     This is the matrix-vector product routine required by LSQR
+*     for a test matrix of the form  A = HY*D*HZ.  The quantities
+*     defining D, HY, HZ and the work array W are in the structure data.
+*     ------------------------------------------------------------------
+*/
+{
+  long       vec_indx,
+             num_rows,
+             num_cols;
+
+  prod_data  *data;
+  
+  data = (prod_data *) prod;  
+
+  num_cols = x->length;
+  num_rows = y->length;
+/*  
+*     Compute  Y = Y + A*X
+*/
+  if( mode == 0 )
+    {
+      lstp_mult( data->hz_vec, x, data->work_vec );
+      
+      for( vec_indx = 0; vec_indx < num_cols; vec_indx++ )
+	data->work_vec->elements[vec_indx] *= data->d_vec->elements[vec_indx];
+      
+      for( vec_indx = num_cols; vec_indx < num_rows; vec_indx++ )
+	data->work_vec->elements[vec_indx] = 0.0;
+      
+      lstp_mult( data->hy_vec, data->work_vec, data->work_vec );
+
+      for( vec_indx = 0; vec_indx < num_rows; vec_indx++ )
+	y->elements[vec_indx] += data->work_vec->elements[vec_indx];
+    }
+/*  
+*     Compute  X = X + A^T*Y
+*/  
+  if( mode == 1 )
+    {
+      lstp_mult( data->hy_vec, y, data->work_vec );
+      
+      for( vec_indx = 0; vec_indx < num_cols; vec_indx++ )
+	data->work_vec->elements[vec_indx] *= data->d_vec->elements[vec_indx];
+      
+      lstp_mult( data->hz_vec, data->work_vec, data->work_vec );
+
+      for( vec_indx = 0; vec_indx < num_cols; vec_indx++ )
+	x->elements[vec_indx] += data->work_vec->elements[vec_indx];
+    }
+  
+  return;
+}
+
+
+void lstp_mult( dvec  *h,
+	        dvec  *x, 
+                dvec  *y )
+/*
+*     ------------------------------------------------------------------
+*     HPROD  applies a Householder transformation stored in H to get
+*     Y = ( I - 2*HZ*HZ(transpose) ) * X.
+*     ------------------------------------------------------------------
+*/
+{
+  long    vec_indx,
+          vec_length;
+
+  double  dwork;
+  
+  vec_length = h->length;
+  dwork = 0.0;
+
+  for( vec_indx = 0; vec_indx < vec_length; vec_indx++ )
+    dwork += h->elements[vec_indx] * x->elements[vec_indx];
+  
+  dwork += dwork;
+  
+  for( vec_indx = 0; vec_indx < vec_length; vec_indx++ )
+    y->elements[vec_indx] = x->elements[vec_indx] - dwork * 
+h->elements[vec_indx];
+  
+  return;
+}
diff --git a/clsqr/test_prog.c b/clsqr/test_prog.c
new file mode 100644
index 0000000000000..d3aa32cee51fb
--- /dev/null
+++ b/clsqr/test_prog.c
@@ -0,0 +1,36 @@
+/*
+* test_prog.c
+* Tests C version of LSQR.
+*
+* 08 Sep 1999: First version from James W. Howse <jhowse@lanl.gov>
+*/
+
+#include "lsqr.h"
+
+#include "test_lsqr.h"
+
+void main()
+{
+  double  damp1,
+          damp2,
+          damp3,
+          damp4;
+  
+  damp1 = 0.1;
+  damp2 = 0.01;
+  damp3 = 0.001;
+  damp4 = 0.0001;
+  
+  test_lsqr(  1,  1, 1, 1, 0.0 );
+  test_lsqr(  2,  1, 1, 1, 0.0 );
+  test_lsqr( 40, 40, 4, 4, 0.0 );
+  test_lsqr( 40, 40, 4, 4, damp2 );
+  test_lsqr( 80, 40, 4, 4, damp2 );
+
+  test_lsqr( 10, 10, 1, 8, 0.0 );
+  test_lsqr( 40, 40, 4, 7, 0.0 );
+  test_lsqr( 80, 40, 4, 6, 0.0 );
+  test_lsqr( 20, 10, 1, 6, 0.0 );
+  test_lsqr( 20, 10, 1, 8, 0.0 );
+
+} /*  end main  */
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index 380442bd3a2bc..a164eb9fa7620 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -20,6 +20,8 @@ ${PCRE_INCLUDES}
 ${ZLIB_INCLUDE_DIR}
 ${SSL_INCLUDE_DIRS}
 ${CMAKE_BINARY_DIR}/sql
+${CMAKE_SOURCE_DIR}/clsqr
+${CMAKE_BINARY_DIR}/clsqr
 )
 
 SET(GEN_SOURCES
@@ -110,7 +112,7 @@ ADD_LIBRARY(sql STATIC ${SQL_SOURCE})
 ADD_DEPENDENCIES(sql GenServerSource)
 DTRACE_INSTRUMENT(sql)
 TARGET_LINK_LIBRARIES(sql ${MYSQLD_STATIC_PLUGIN_LIBS} 
-  mysys mysys_ssl dbug strings vio pcre ${LIBJEMALLOC}
+  mysys mysys_ssl dbug strings vio pcre clsqr ${LIBJEMALLOC}
   ${LIBWRAP} ${LIBCRYPT} ${LIBDL} ${CMAKE_THREAD_LIBS_INIT}
   ${SSL_LIBRARIES})
 
diff --git a/sql/opt_costmodel.h b/sql/opt_costmodel.h
index b855ca28b3930..ced39771cbccb 100644
--- a/sql/opt_costmodel.h
+++ b/sql/opt_costmodel.h
@@ -5,6 +5,7 @@
 
 class handler;
 
+#define MAX_EQUATIONS 20
 #define MAX_CONSTANTS 130
 #define MAX_GLOBAL_CONSTANTS 2
 #define TIME_FOR_COMPARE 0
diff --git a/sql/sp_head.cc b/sql/sp_head.cc
index 8a9e8ddc816f2..1ac9efb49656b 100644
--- a/sql/sp_head.cc
+++ b/sql/sp_head.cc
@@ -1322,6 +1322,7 @@ sp_head::execute(THD *thd, bool merge_da_on_success)
     thd->profiling.finish_current_query();
     thd->profiling.start_new_query("continuing inside routine");
 #endif
+    thd->utime_before_query= 0;
 
     /* get_instr returns NULL when we're done. */
     i = get_instr(ip);
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 924605f18ed95..a8df98bdb340d 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -68,6 +68,7 @@
 
 #include <fstream>
 
+#include "lsqr.h"
 /*
   The following is used to initialise Table_ident with a internal
   table name
@@ -899,7 +900,8 @@ THD::THD()
     main_da(0, false, false),
    m_stmt_da(&main_da),
    thd_cost_factors(cost_factors),
-   total_time(0)
+   equation_no(0),
+   utime_before_query(0)
 {
   ulong tmp;
 
@@ -6434,21 +6436,74 @@ bool Discrete_intervals_list::append(Discrete_interval *new_interval)
 
 void THD::build_equation()
 {
-  total_time= utime_after_query - utime_before_query;
-  solve_equation();
+  if(utime_before_query != 0)
+  {
+    coefficients[equation_no][MAX_CONSTANTS].value= utime_after_query - utime_before_query;
+    equation_no++;
+  }
+  if(equation_no == MAX_EQUATIONS)
+  {
+    solve_equation();
+    equation_no= 0;
+  }
 }
 
-void THD::solve_equation()
+// Helper function, TODO:move it in lsqr
+void mult(long mode, dvec *x, dvec *y, void *prod)
 {
+  long num_cols = x->length;
+  long num_rows = y->length;
+  long i, j;
+  dvec *A = (dvec *)prod;
   /*
-     Currently just dump all the coefficients to a file
-  */
-  std::ofstream datafile;
-  char file_name[100];
-  my_snprintf(file_name, 100, "/tmp/mariadb_cost_coefficients_%lu.txt", thread_id);
-  datafile.open(file_name, std::ios::app);
-  for(int i=0; i < MAX_CONSTANTS; i++)
-    datafile << coefficients[i].value << " ";
-  datafile << total_time << "\n";
-  datafile.close();
+   * Compute  Y = Y + A*X
+   */
+  if(mode == 0)
+  {
+    for(i=0; i < num_rows; i++)
+    {
+      for(j=0; j < num_cols; j++)
+        y->elements[i] += A->elements[i*num_cols+j]*x->elements[j];
+    }
+  }
+  /*
+   * Compute  X = X + A^T*Y
+   */
+  if(mode == 1)
+  {
+    for(i=0; i < num_cols; i++)
+    {
+      for(j=0; j < num_rows; j++)
+        x->elements[i] += A->elements[j*num_cols+i]*y->elements[j];
+    }
+  }
+}
+
+void THD::solve_equation()
+{
+  lsqr_input   *in;
+  lsqr_output  *out;
+  lsqr_work    *work;
+  lsqr_func    *func;
+  dvec         *prod;
+  long num_rows= equation_no;
+  long num_cols= MAX_CONSTANTS;
+  long row_indx, col_indx;
+  alloc_lsqr_mem( &in, &out, &work, &func, num_rows, num_cols );
+  func->mat_vec_prod= mult;
+  prod= (dvec *) alloc_dvec(num_rows*num_cols);
+  for(row_indx= 0; row_indx < num_rows; row_indx++)
+    for(col_indx= 0; col_indx < num_cols; col_indx++)
+      prod->elements[row_indx*num_cols+col_indx]= coefficients[row_indx][col_indx].value;
+  for(row_indx=0; row_indx < num_rows; row_indx++)
+    in->rhs_vec->elements[row_indx]= coefficients[row_indx][MAX_CONSTANTS].value;
+  in->num_rows = num_rows;
+  in->num_cols = num_cols;
+  in->rel_mat_err = 1.0e-15;
+  in->rel_rhs_err = 1.0e-15;
+  in->max_iter = num_rows + num_cols + 50;
+  lsqr(in, out, work, func, prod);
+  // Solution is in out->sol_vec->elements
+  free_lsqr_mem(in, out, work, func);
+  free_dvec(prod);
 }
diff --git a/sql/sql_class.h b/sql/sql_class.h
index e8d6491c89163..1523f72681080 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -3731,13 +3731,13 @@ class THD :public Statement,
   }
 private:
   Cost_factors thd_cost_factors;
-  eq_coefficient coefficients[MAX_CONSTANTS];
-  ulonglong total_time;
+  eq_coefficient coefficients[MAX_EQUATIONS][MAX_CONSTANTS+1];
+  uint equation_no;
 public:
   ulonglong utime_before_query;
   inline void inc_coefficient(uint factor, int engine_index= -1)
   {
-    coefficients[get_factor_index(factor, engine_index)].value++;
+    coefficients[equation_no][get_factor_index(factor, engine_index)].value++;
   }
   void build_equation();
   void solve_equation();
diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
index 71c01afffc375..ccbf47930fcca 100644
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -757,7 +757,7 @@ static void handle_bootstrap_impl(THD *thd)
     thd->profiling.start_new_query();
     thd->profiling.set_query_source(thd->query(), length);
 #endif
-
+    thd->utime_before_query= 0;
     /*
       We don't need to obtain LOCK_thread_count here because in bootstrap
       mode we have only one thread.
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index 8832083ceccd9..33dfde54df6b0 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -2356,6 +2356,7 @@ void JOIN::save_explain_data(Explain_query *output, bool can_overwrite,
 
 void JOIN::exec()
 {
+  //DBUG_ASSERT(thd->utime_before_query == 0);
   thd->utime_before_query= microsecond_interval_timer();
   DBUG_EXECUTE_IF("show_explain_probe_join_exec_start", 
                   if (dbug_user_var_equals_int(thd,