In [2]:
#make-training-data.ipynb
#
#by Joe Hahn
#joe.hahn@oracle.com
#3 July 2018
#
#generate training data

In [4]:
import pandas as pd
import numpy as np
def make_xo_data(N_dots, initial_id, x_half_width, radius, box_half_width, jitter, rn_seed, debug):
    """Generate a dataframe of randomly placed dots labelled X, O, or B (background).

    Dots are scattered uniformly over a square of half-width box_half_width.
    A dot within x_half_width of either axis is class 'X'; otherwise a dot
    closer than radius to the origin is class 'O'; every other dot is class
    'B'. The coordinates are then rotated by 45 degrees, and gaussian noise
    of scale jitter is added to a copy of the rotated coordinates.

    Parameters:
        N_dots         - number of records to generate
        initial_id     - id of the first record; ids increase by 1 per record
        x_half_width   - half-thickness of the arms of the X
        radius         - radius of the O
        box_half_width - half-width of the square box, before rotation
        jitter         - scale (standard deviation) of the gaussian noise
        rn_seed        - seed for the random number generator; calls that use
                         the same seed return identical data
        debug          - when true, print the first 5 rows of the working
                         and returned dataframes

    Returns a dataframe having columns:
        id                     - record id
        ran_num                - uniform random number in [0, 1)
        class                  - 'X', 'O', or 'B'
        Xscore, Oscore, Bscore - one-hot class scores (1.0 for the record's class, else 0.0)
        x0, y0                 - rotated noise-free coordinates
        x, y                   - rotated coordinates with gaussian noise added
    """

    #generate dataframe and initialize records' ids
    df = pd.DataFrame()
    df.index.name = 'idx'
    df['id'] = initial_id + np.arange(N_dots)

    #all random draws below come from this seeded generator, rather than from numpy's
    #global generator, so that rn_seed actually controls the output
    rn_state = np.random.RandomState(seed=rn_seed)

    #generate random x,y positions
    df['x'] = rn_state.uniform(low=-box_half_width, high=box_half_width, size=N_dots)
    df['y'] = rn_state.uniform(low=-box_half_width, high=box_half_width, size=N_dots)
    df['r'] = np.sqrt(df.x**2 + df.y**2)

    #classify dots as members of X, O, or B=background classes;
    #X is assigned last so that it overrides O where the two overlap
    df['class'] = 'B'
    idx = df['r'] < radius
    df.loc[idx, 'class'] = 'O'
    idx = (df.x.abs() < x_half_width) | (df.y.abs() < x_half_width)
    df.loc[idx, 'class'] = 'X'

    #class scores: 1.0 in the column matching the record's class, 0.0 in the other two
    df['Xscore'] = (df['class'] == 'X').astype(float)
    df['Oscore'] = (df['class'] == 'O').astype(float)
    df['Bscore'] = (df['class'] == 'B').astype(float)

    #rotate coordinate system by 45 degrees = pi/4 radians
    phi = np.pi/4.0
    c = np.cos(phi)
    s = np.sin(phi)
    df['xr'] =  df.x*c + df.y*s
    df['yr'] = -df.x*s + df.y*c

    #add gaussian noise/jitter to dots' (x,y) positions:
    df['xrn'] = df.xr + rn_state.normal(scale=jitter, size=N_dots)
    df['yrn'] = df.yr + rn_state.normal(scale=jitter, size=N_dots)

    #add a column of random numbers
    df['ran_num'] = rn_state.uniform(size=N_dots)
    if (debug):
        print(df.head(5))

    #return selected columns, renaming the rotated coordinates
    cols = ['id', 'ran_num', 'class', 'Xscore', 'Oscore', 'Bscore', 'xr', 'yr', 'xrn', 'yrn']
    df_select = df[cols].rename(columns={'xr':'x0', 'yr':'y0', 'xrn':'x', 'yrn':'y'})
    if (debug):
        print(df_select.head(5))
    return df_select

In [6]:
#number of dots in training dataset
#(currently a small value of 5; the commented-out 15000 is the alternate setting)
N_train = 5#15000

#half-thickness of the x
x_half_width = 0.5

#radius of the O
radius = 3.5

#box half-width before 45 degree rotation
box_half_width = 7.1

#set jitter=scale of the gaussian noise, to make the class boundaries slightly fuzzy
jitter = 0.0

#set seed for random number generator
rn_seed = 13

#set debug=True to see debugging output
debug =  False

#generate the xo dataset
#(print calls are written so that they emit the same text under python 2 and python 3)
print('generating xo data...')
initial_id = 0
train = make_xo_data(N_train, initial_id, x_half_width, radius, box_half_width, jitter, rn_seed, debug)
#two spaces after '=' reproduce the output of the original python 2 print statement
print('number of training records =  {}'.format(len(train)))

##save training data as csv file
#import pandas as pd
#pd.set_option('display.expand_frame_repr', False)
#print(train.head(5))
#train.to_csv('data/train.txt', sep='|', index=False, header=False)

generating xo data...
number of training records =  5


In [7]:
#display the training dataframe returned by make_xo_data
train

Unnamed: 0,id,ran_num,class,Xscore,Oscore,Bscore,x0,y0,x,y
0,0,0.405618,B,0.0,0.0,1.0,-0.317479,-5.17024,-0.317479,-5.17024
1,1,0.961104,O,0.0,1.0,0.0,1.790388,-0.131998,1.790388,-0.131998
2,2,0.580402,B,0.0,0.0,1.0,-0.619544,-8.18249,-0.619544,-8.18249
3,3,0.435426,B,0.0,0.0,1.0,-5.161139,-0.161297,-5.161139,-0.161297
4,4,0.449685,B,0.0,0.0,1.0,1.814697,4.840764,1.814697,4.840764
