In [1]:
%pylab inline 

import numpy as np
import numpy
import pandas as pd
import re
import random as rnd
from sklearn.metrics import accuracy_score
import math
import operator
from tqdm import tqdm_notebook as tqdm
import random
import warnings
warnings.filterwarnings("ignore")

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

In [3]:
# read the arrays
# File layout as consumed below: the first number is the column count,
# every following number is a matrix value in row-major order.
# The `with` block closes the file handle as soon as it has been read.
with open('titanic_cpp_x_old.txt') as fh:
    a = [float(x) for x in fh.read().split()]
cols = int(a[0])
bdx = np.array(a[1:]).reshape(-1, cols)
bdx.shape

(891, 8)

In [4]:
# Read the test feature matrix; same layout as the training file:
# the first number is the column count, the rest are row-major values.
# The `with` block closes the file handle as soon as it has been read.
with open('titanic_cpp_test_x_old.txt') as fh:
    a = [float(x) for x in fh.read().split()]
cols = int(a[0])
bdx_test = np.array(a[1:]).reshape(-1, cols)
bdx_test.shape

(418, 8)

In [5]:
# Read the training labels: one whitespace-separated number per sample.
# The `with` block closes the file handle as soon as it has been read.
with open('titanic_cpp_y.txt') as fh:
    bdy = np.array([float(x) for x in fh.read().split()])
bdy.shape

(891,)

In [6]:
# Read the test labels: one whitespace-separated number per sample.
# The `with` block closes the file handle as soon as it has been read.
with open('titanic_cpp_test_y.txt') as fh:
    bdy_test = np.array([float(x) for x in fh.read().split()])
bdy_test.shape

(418,)

In [7]:
# Display the training matrix shape again as (rows, columns).
bdx.shape

(891, 8)

In [8]:
# data organization



In [9]:
# Number of opcodes the `compute` kernel defines (NOP=0 ... TANH=18).
# Used as the exclusive upper bound when drawing random opcodes.
num_insts = 19

In [10]:
# Row capacity of the data/result buffers (gdata, gdatay, gresult);
# also used as the thread count along Y.
num_rows = 1024
# One feature per column of the training matrix.
num_features = bdx.shape[1]

In [11]:
# Threads along X: the kernel uses the X index to pick a program,
# so this is also the population size.
num_threads_x = 16384
# Threads along Y: the kernel uses the Y index to pick a data row.
num_threads_y = num_rows

In [12]:
# 1. The programs: prog_len floats per X-thread, stored back to back in one flat row.
#    The kernel locates the program of X-thread t at offset t * prog_len.
prog_len = 16
gprogs = np.zeros((1, prog_len*num_threads_x), dtype=np.float32)

In [13]:
# 2. The data is a single array of 8 features, many rows, and each thread on the Y axis takes an individual row
#    Both buffers are zero-initialised and have num_rows rows.
gdata = np.zeros((num_rows, num_features), dtype=np.float32)
# Labels: one value per data row, stored as a column.
gdatay = np.zeros((num_rows, 1), dtype=np.float32)

In [14]:
# 3. The results - for each thread, single column of results
#    gresult: one row per data row, one column per X-thread (program).
gresult = np.zeros((num_rows, 1*num_threads_x), dtype=np.float32)
# gscores: one score per X-thread (program).
gscores = np.zeros((1, 1*num_threads_x), dtype=np.float32)

In [15]:
# Allocate the buffers in GPU memory
# Each device buffer is sized from the byte size of its host-side array.
gprogs_gpu = cuda.mem_alloc(gprogs.nbytes)

gdata_gpu = cuda.mem_alloc(gdata.nbytes)
gdatay_gpu = cuda.mem_alloc(gdatay.nbytes)
gresult_gpu = cuda.mem_alloc(gresult.nbytes)
gscores_gpu = cuda.mem_alloc(gscores.nbytes)

In [16]:
# The GP kernel
#
# `compute` runs one stack-machine program on one data row per thread:
#   * the X thread index selects the program (PROGLEN floats, laid out as
#     (opcode, operand) pairs, i.e. PROGLEN / 2 instructions per program),
#   * the Y thread index selects the data row (NUM_FEATURES floats).
# The value left on top of the stack is passed through a sigmoid and
# thresholded at 0.5; the resulting 0/1 is written to
# gresult[row * NUM_THREADS_X + program].
#
# NOTE: the source string goes through Python %-formatting, so a literal
# percent sign inside the CUDA code would have to be doubled.
mod = SourceModule("""

#define PROGLEN %d
#define NUM_FEATURES %d
#define NUM_THREADS_X %d

#define NOP 0
#define PUSHV 1
#define PUSHC 2
#define ADD 3
#define MUL 4
#define DIV 5
#define NEG 6
#define MIN 7
#define MAX 8
#define GREATER 9
#define LESS 10
#define EQUAL 11
#define SIN 12
#define COS 13
#define EXP 14
#define LOG 15
#define SQR 16
#define SQRT 17
#define TANH 18

  __global__ void compute(int num_real_rows, float *gprogs, float *gdata, float *gresult)
  {
    // X index selects the program, Y index selects the data row.
    const int th_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int th_y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads without a program or without a real data row have nothing to do.
    if (th_x >= NUM_THREADS_X || th_y >= num_real_rows)
        return;

    const float *prog = gprogs + th_x * PROGLEN;
    const float *row = gdata + th_y * NUM_FEATURES;

    // sp counts the values currently on the stack:
    // stack[sp - 1] is the top element, stack[sp] is the next free slot.
    // At most PROGLEN / 2 pushes follow the initial one, so this never overflows.
    float stack[PROGLEN + 4];
    int sp = 0;

    // Seed the stack with a 0 so there is always a value to read at the end.
    stack[sp++] = 0.0f;

    /////////////////////////////////
    // Evaluate program on data row.
    // A program occupies exactly PROGLEN floats, so the loop must stop there;
    // reading further would run into the next thread's program.
    for (int ip = 0; ip + 1 < PROGLEN; ip += 2)
    {
        // fetch instruction and operand
        const int op = (int)prog[ip];
        const float operand = prog[ip + 1];

        switch (op)
        {
        case PUSHV:
        {
            // Push a feature of the current row; an out-of-range feature
            // index is ignored instead of reading outside the row.
            const int var = (int)operand;
            if (var >= 0 && var < NUM_FEATURES)
                stack[sp++] = row[var];
            break;
        }

        case PUSHC:
            // Push the operand itself as a constant.
            stack[sp++] = operand;
            break;

        case ADD:
        case MUL:
        case DIV:
        case MIN:
        case MAX:
        case GREATER:
        case LESS:
        case EQUAL:
        {
            // Binary operators need two values; otherwise they are skipped.
            if (sp < 2)
                break;

            // x is the former top of the stack, y the value below it.
            const float x = stack[--sp];
            const float y = stack[--sp];
            float r = 0.0f;

            switch (op)
            {
            case ADD:     r = x + y; break;
            case MUL:     r = x * y; break;
            case DIV:     r = x / y; break;
            case MIN:     r = (x > y) ? y : x; break;
            case MAX:     r = (x > y) ? x : y; break;
            case GREATER: r = (float)(x > y); break;
            case LESS:    r = (float)(x < y); break;
            case EQUAL:   r = (float)(x == y); break;
            }

            stack[sp++] = r;
            break;
        }

        case NEG:
        case SIN:
        case COS:
        case EXP:
        case LOG:
        case SQR:
        case SQRT:
        case TANH:
        {
            // Unary operators rewrite the top of the stack in place.
            if (sp < 1)
                break;

            float v = stack[sp - 1];

            switch (op)
            {
            case NEG:  v = -v; break;
            case SIN:  v = sinf(v); break;
            case COS:  v = cosf(v); break;
            case EXP:  v = expf(v); break;
            case LOG:  v = logf(v); break;
            case SQR:  v = v * v; break;
            case SQRT: v = sqrtf(v); break;
            case TANH: v = tanhf(v); break;
            }

            stack[sp - 1] = v;
            break;
        }

        default:
            // NOP and unknown opcodes do nothing.
            break;
        }
    }

    // store result: sigmoid of the top of the stack, thresholded to 0 / 1
    const float f = 1.0f / (1.0f + expf(-stack[sp - 1]));
    gresult[NUM_THREADS_X * th_y + th_x] = (f > 0.5f) ? 1.0f : 0.0f;
  }
  """ % (prog_len, num_features, num_threads_x))

In [17]:
# Callable handle to the compiled `compute` kernel.
func = mod.get_function("compute")

In [18]:
# The scoring kernel
#
# `scores` uses one thread per program (X index only). Each thread walks the
# first num_real_rows rows, compares the label gdatay[i] with that program's
# prediction gresult[i * NUM_THREADS_X + th_x], counts the exact matches and
# stores matches / num_real_rows in gscores[th_x].
# The kernel has no bounds check on th_x, so the launch must not create more
# X threads than gscores / gresult have columns.
modsc = SourceModule("""

#define NUM_THREADS_X %d

  __global__ void scores(int num_real_rows, float *gdatay, float *gresult, float *gscores)
  {
      int th_x = blockIdx.x * blockDim.x + threadIdx.x;
      float sum = 0;
      
      for(int i=0; i<num_real_rows; i++)
      {
          float y1 = gdatay[i];
          float y2 = *(gresult + i*NUM_THREADS_X + th_x);
          if (y1 == y2)
          {
              sum ++;
          }
      }
      
      sum /= num_real_rows;
      gscores[th_x] = sum;
  }
  """ % (num_threads_x))

In [19]:
# Callable handle to the compiled `scores` kernel.
funcsc = modsc.get_function("scores")

In [20]:
# Copy the training features into the first rows of the buffer;
# rows beyond the training set keep their initial zeros.
gdata[0 : bdx.shape[0], :] = bdx

In [21]:
# Copy the training labels, as a column, into the first rows of the label buffer.
gdatay[0 : bdx.shape[0], :] = bdy.reshape(-1, 1)

In [None]:
# this will hold the result
# Re-create the host-side result array as zeros and display it with its shape.
# NOTE(review): no cell visible here copies this host array to or from the GPU.
gresult = np.zeros((num_rows, 1*num_threads_x), dtype=np.float32)
gresult, gresult.shape

In [None]:
# this will hold the scores
# Re-create the host-side score array as zeros (one entry per program) and display it.
gscores = np.zeros((1, 1*num_threads_x), dtype=np.float32)
gscores, gscores.shape

In [None]:
# copy arrays to GPU
# Host -> device: the programs first, then the features and labels.
cuda.memcpy_htod(gprogs_gpu, gprogs)

cuda.memcpy_htod(gdata_gpu, gdata)
cuda.memcpy_htod(gdatay_gpu, gdatay)

In [None]:
# Launch geometry: blocks have threads_per_block threads per used dimension.
# Integer division: any remainder would leave the trailing threads unlaunched,
# so the thread counts are expected to be multiples of threads_per_block.
threads_per_block = 16
blocks_x = num_threads_x // threads_per_block
blocks_y = num_threads_y // threads_per_block

In [None]:
# Number of real training rows as a 32-bit NumPy integer, matching the
# kernels' `int num_real_rows` parameter.
num_real_rows = np.int32(bdx.shape[0])

In [None]:
# execute kernel
# 2D launch: X covers the programs, Y covers the data rows.
func(num_real_rows, gprogs_gpu, gdata_gpu, gresult_gpu, 
     block=(threads_per_block, threads_per_block, 1), grid=(blocks_x, blocks_y, 1))

In [None]:
# compute the scores now
# 1D launch: one thread per program, reading the predictions left in gresult_gpu.
funcsc(num_real_rows, gdatay_gpu, gresult_gpu, gscores_gpu, 
     block=(threads_per_block, 1, 1), grid=(blocks_x, 1, 1))

In [None]:
# get back the data and display it
# Device -> host copy of the per-program scores, shown as a flat vector.
cuda.memcpy_dtoh(gscores, gscores_gpu)
gscores.reshape(-1)

In [None]:
# sort the arrays by score
# hs_idx lists the program indices from highest to lowest score.
hs_idx = np.argsort(gscores.reshape(-1))[::-1]
# gprogs may still be the flat (1, N) buffer, which cannot be indexed by
# program; view it as one row per program before reordering.
gprogs = gprogs.reshape(len(hs_idx), -1)[hs_idx]


In [None]:
# choose parent indices
# One parent per population slot, drawn uniformly (with repetition) from the
# first 20% of indices - the best programs when gprogs is sorted by score.
pr_idx = np.random.randint(0, int(len(hs_idx)*0.2), size=(len(hs_idx),))
pr_idx

In [None]:
# replace pop with the new individuals, but flatten them
# Indexing with pr_idx copies the chosen parent rows; reshape(-1) makes it 1-D.
gprogs = gprogs[pr_idx].reshape(-1)

In [None]:
# mutate the pop randomly
# Expects gprogs to be the flat 1-D population at this point.
numtomut = int( len(pr_idx)*0.3 )

# Draw positions across the WHOLE flat population and keep them even, so only
# opcode slots are overwritten (programs are (opcode, operand) pairs and
# prog_len is even, so even flat offsets are exactly the opcode slots).
mt_idx = 2 * np.random.randint(0, gprogs.size // 2, size=(numtomut,))

gprogs[mt_idx] = np.random.randint(0, num_insts, size=(numtomut,)).astype(dtype=np.float32)

In [None]:
# Back to one row per program.
gprogs = gprogs.reshape(len(hs_idx),-1)

In [None]:
# Fresh random population: one row per program, all zeros to start with.
gprogs = np.zeros((num_threads_x, prog_len), dtype=np.float32)
# Even columns are the opcode slots; odd columns (operands) stay 0.
inst_idx = np.arange(0, prog_len, 2)
# Random opcode in [0, num_insts) for every opcode slot.
gprogs[:,inst_idx] = np.random.randint(0, num_insts, size=(num_threads_x, prog_len//2))


In [None]:
# %%timeit 

# gresult = np.zeros((num_rows, 1*num_threads_x), dtype=np.float32)

# cuda.memcpy_htod(gops_gpu, gops)
# cuda.memcpy_htod(gvars_gpu, gvars)
# cuda.memcpy_htod(gconsts_gpu, gconsts)
# cuda.memcpy_htod(gis_var_gpu, gis_var)
# cuda.memcpy_htod(gdata_gpu, gdata)

# func(num_real_rows, gops_gpu, gvars_gpu, gconsts_gpu, gis_var_gpu, gdata_gpu, gresult_gpu, 
#      block=(threads_per_block, threads_per_block, 1), grid=(blocks_x, blocks_y, 1))

# funcsc(num_real_rows, gdatay_gpu, gresult_gpu, gscores_gpu, 
#      block=(threads_per_block, 1, 1), grid=(blocks_x, 1, 1))

# cuda.memcpy_dtoh(gscores, gscores_gpu)

# # new generation

In [None]:
# Histogram of the per-program scores, 50 bins (trailing ';' hides the return value).
hist(gscores.reshape(-1), 50);

In [None]:
# Upload the features and labels to the GPU again before the run below.
cuda.memcpy_htod(gdata_gpu, gdata)
cuda.memcpy_htod(gdatay_gpu, gdatay)

In [None]:
# initial random population 
# One row per program; even columns hold a random opcode in [0, num_insts),
# odd columns (operands) stay 0.

gprogs = np.zeros((num_threads_x, prog_len), dtype=np.float32)
inst_idx = np.arange(0, prog_len, 2)
gprogs[:,inst_idx] = np.random.randint(0, num_insts, size=(num_threads_x, prog_len//2))


In [None]:
# Evolution loop: evaluate the population on the GPU, remember the best
# program seen so far, then breed the next generation on the host.
best_ever = 0
best_prog = None  # copy of the program that achieved best_ever

# Same proportions as the selection / mutation cells above.
num_parents = max(1, int(num_threads_x * 0.2))   # breed from the top 20%
num_mutations = int(num_threads_x * 0.3)         # opcode slots rewritten per generation

# The kernels expect float32 programs, one contiguous row per program.
gprogs = np.ascontiguousarray(gprogs, dtype=np.float32).reshape(num_threads_x, prog_len)

for i in tqdm(range(10000000)):

    # Upload the current generation; without this the GPU keeps evaluating
    # whatever was copied to gprogs_gpu last.
    cuda.memcpy_htod(gprogs_gpu, gprogs)

    func(num_real_rows, gprogs_gpu, gdata_gpu, gresult_gpu,
         block=(threads_per_block, threads_per_block, 1), grid=(blocks_x, blocks_y, 1))

    funcsc(num_real_rows, gdatay_gpu, gresult_gpu, gscores_gpu,
           block=(threads_per_block, 1, 1), grid=(blocks_x, 1, 1))

    cuda.memcpy_dtoh(gscores, gscores_gpu)
    accs = gscores.reshape(-1)
    best = np.max(accs)
    if best > best_ever:
        best_ever = best
        best_prog = gprogs[np.argmax(accs)].copy()
        print('new best', best)

    # --- new generation ---
    # Rank programs from best to worst score.
    ranked = gprogs[np.argsort(accs)[::-1]]

    # Every slot of the next generation is a copy of a random top-ranked parent.
    parents = np.random.randint(0, num_parents, size=num_threads_x)
    gprogs = ranked[parents]

    # Mutate random opcode slots (even columns) anywhere in the population.
    mut_rows = np.random.randint(0, num_threads_x, size=num_mutations)
    mut_cols = 2 * np.random.randint(0, prog_len // 2, size=num_mutations)
    gprogs[mut_rows, mut_cols] = np.random.randint(0, num_insts, size=num_mutations)
        
   