In [0]:
import math
import random
import numpy as np
import matplotlib.pyplot as plt

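This function creates an (m x 2) array called `data` whose rows hold the feature pair (x_1, x_2) for our model, where x_1 is drawn uniformly at random and x_2 = w * x_1 + b + (-1)^c * n, with n being Gaussian noise. It also creates an (m x 1) array called `labels` that holds the label c (0 or 1) for the corresponding row of `data`.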

In [0]:
def get_random_data(w, b, mu, sigma, m):
  """Generate m labelled samples scattered around the line x_2 = w * x_1 + b.

  For every sample a label c is drawn from {0, 1}, the first feature x_1 is
  drawn uniformly between 0 and 1, and a noise value n is drawn from a normal
  distribution with mean mu and standard deviation sigma. The second feature
  is x_2 = w * x_1 + b + (-1)^c * n, so the label decides the sign of the
  offset applied to the line.

  Args:
    w: Slope of the line.
    b: Intercept of the line.
    mu: Mean of the noise.
    sigma: Standard deviation of the noise.
    m: Number of samples to generate.

  Returns:
    A tuple (data, labels). data is an (m, 2) array whose rows are
    (x_1, x_2); labels is an (m, 1) array holding c for the matching row.
  """
  features = np.empty((m, 2))
  classes = np.empty((m, 1))

  for row in range(m):
    # Keep the draw order (label, x_1, noise) so that results under a fixed
    # seed stay the same.
    label = random.randint(0, 1)
    first = np.random.uniform(0, 1)
    noise = np.random.normal(mu, sigma)

    # (-1)^label: +1.0 for label 0, -1.0 for label 1
    sign = -1.0 if label else 1.0

    # Every entry of both arrays is overwritten here, one row per iteration
    features[row] = (first, w * first + b + sign * noise)
    classes[row] = label

  return features, classes

This function plots the data generated by `get_random_data()` as a scatter plot, together with the line y = w * x + b.

In [0]:
def display_random_data(labels, data, w,  b):
  """Scatter-plot the labelled samples together with the line y = w * x + b.

  Args:
    labels: Array-like with one label per row of data. Rows whose label
      equals 0 are drawn in blue, all other rows in red.
    data: 2-D array; column 0 is used as x and column 1 as y.
    w: Slope of the reference line.
    b: Intercept of the reference line.
  """
  # The bare 'seaborn' style name was deprecated in matplotlib 3.6 in favour
  # of 'seaborn-v0_8' and is missing from newer releases, so pick whichever
  # name this installation actually provides (and skip styling if neither).
  for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
      plt.style.use(style_name)
      break

  x_values = data[:, 0]
  y_values = data[:, 1]

  # Boolean mask selecting the rows labelled 0; every other row counts as 1
  is_zero = np.asarray(labels).reshape(-1) == 0

  # Plot the scatterplots
  plt.scatter(x_values[is_zero], y_values[is_zero], label='0', c="blue", edgecolor="black", linewidth=1, alpha=0.75)
  plt.scatter(x_values[~is_zero], y_values[~is_zero], label='1', c="red", edgecolor="black", linewidth=1, alpha=0.75)
  plt.xlabel("x")
  plt.ylabel("y")
  plt.title("Randomly Generated Data")

  # Plot the function y = w * x + b. A straight line only needs its two end
  # points, so there is no reason to draw a segment through every sample.
  if len(x_values) > 0:
    line_x = np.array([x_values.min(), x_values.max()])
    plt.plot(line_x, w * line_x + b, label='y = w * x + b')

  # Without a legend the label= arguments above are never displayed
  plt.legend()

  plt.show()


Create the initial values for (w, b, mu, sigma, m) that will be used for our model and generate the corresponding data. Then split the data into a training set (the first 80% of the rows) and a held-out validation set (the remaining 20%, stored in `test_data`).

In [0]:
# get_random_data() draws from both `random` and `np.random`, so seed both
# generators to get the same data set on every Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Line parameters (slope w, intercept b), noise parameters (mean mu, standard
# deviation sigma) and the number of samples m
w = 0.5
b = 0.1
mu = 0.01
sigma = 0.001
m = 10000

data, labels = get_random_data(w, b, mu, sigma, m)
display_random_data(labels, data, w, b)

# Index of the first row of the held-out 20%. Integer arithmetic keeps the
# split exact instead of relying on float rounding of 80/100 * m.
data_split = (80 * m) // 100

# First 80% of the rows for training, remaining 20% held out
train_data, test_data = data[:data_split], data[data_split:]
assert len(train_data) + len(test_data) == m


