In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [12]:

def get_df_types(df, debug=False):
  # Create lists of column names by data type
  floats=[]
  ints=[]
  strings=[]
  other=[]

  for col in df.columns:
    if debug:
      print(f'## {col}:\t\t{df[col].dtype}')
    t = df[col].dtype.name
    if t.find('float') >= 0:
      floats.append(col)
    elif t.find('int') >= 0:
      ints.append(col)
    elif t.find('object') >= 0:
      strings.append(col)
    else:
      other.append(col)

  if debug:
    print(f'Types::\n\tInts: {ints}'),print(f'\tFloats: {floats}'),print(f'\tStrings: {strings}'),print(f'\tOther: {other}')

  return floats,ints,strings,other


def df_to_arrays(df, target_label, debug=False):
  """
  Given a DataFrame, convert into a 2D array of numerics.
  Target variable is returned as y.

  Returns a 2D ndarray as X, ndarray as y, and optional encoder for y
  if encoding was necessary.
  """
  target_encoder = None
  X = []
  y = []

  # Numericize non-numerics
  for alpha_col in alphas:
    if debug:
      print(f'Label encoding col: {alpha_col}')
    label_enc = LabelEncoder()
    enc_col = label_enc.fit_transform(df[alpha_col].values)
    if alpha_col == target_label:
      target_encoder = label_enc
      y.append(enc_col)
    else:
      X.append(enc_col)

  for numeric_col in numerics:
    if numeric_col == target_label:
      y.append(df[numeric_col].values)
    else:
      X.append(df[numeric_col].values)

  return np.array(X), np.array(y), target_encoder


In [14]:
# Unit Testing
if False:
  def get_test_df():
    df = pd.DataFrame({'angles': [0, 3, 4],
                      'degrees': [360.32, 180.31, 360.114],
                      'code':['A','B','C']},
                      index=['circle', 'triangle', 'rectangle'])
    print(f'DataFrame: \n{df}')
    return df

  print('--------------------------------------------')
  print('Case 1: get_df_types()')
  df = get_test_df()

  # Determine data types in given columns
  floats,ints,strings,other = get_df_types(df, True)

  numerics = set(floats).union(set(ints))
  alphas = set(strings).union(set(other))

  print(f'Numeric cols: {numerics}')
  print(f'Alpha cols: {alphas}')

  print('--------------------------------------------')
  print('Case 2: df_to_arrays() - alpha target')
  df = get_test_df()

  X, y, enc = df_to_arrays(df, 'code', debug=True)
  print(f'X: {X}')
  print(f'y: {y}')
  print(f'enc: {enc}')

  print('--------------------------------------------')
  print('Case 3: df_to_arrays() - alpha col, numer. target')
  df = get_test_df()

  X, y, enc = df_to_arrays(df, 'angles', debug=True)
  print(f'X: {X}')
  print(f'y: {y}')
  print(f'enc: {enc}')

--------------------------------------------
Case 1: get_df_types()
DataFrame: 
           angles  degrees code
circle          0  360.320    A
triangle        3  180.310    B
rectangle       4  360.114    C
## angles:		int64
## degrees:		float64
## code:		object
Types::
	Ints: ['angles']
	Floats: ['degrees']
	Strings: ['code']
	Other: []
Numeric cols: {'degrees', 'angles'}
Alpha cols: {'code'}
--------------------------------------------
Case 2: df_to_arrays() - alpha target
DataFrame: 
           angles  degrees code
circle          0  360.320    A
triangle        3  180.310    B
rectangle       4  360.114    C
Label encoding col: code
X: [[360.32  180.31  360.114]
 [  0.      3.      4.   ]]
y: [[0 1 2]]
enc: LabelEncoder()
--------------------------------------------
Case 3: df_to_arrays() - alpha col, numer. target
DataFrame: 
           angles  degrees code
circle          0  360.320    A
triangle        3  180.310    B
rectangle       4  360.114    C
Label encoding col: code
X: 