In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from pandas.tseries.offsets import Day, MonthBegin, YearBegin

In [None]:
def get_df_types(df, debug=False):
  """
  Group the column names of a DataFrame by data type.

  Columns are classified with pandas' dtype predicates rather than by
  substring-matching the dtype name. That way nullable extension dtypes
  ('Int64', 'Float64') and the dedicated string dtype land in the right list,
  and interval dtypes (whose name contains 'int') are not mistaken for
  integers.

  Args:
    df: DataFrame to inspect.
    debug: when True, print every column's dtype and the resulting lists.

  Returns:
    Tuple (floats, ints, strings, other) of lists of column names, each in
    DataFrame column order. 'strings' holds object and string dtype columns;
    'other' holds everything else (bool, datetime, category, ...).
  """
  type_checks = pd.api.types
  floats = []
  ints = []
  strings = []
  other = []

  # Walk df.dtypes instead of df[col]: with duplicated column names df[col]
  # is a DataFrame and has no single dtype.
  for col, dtype in df.dtypes.items():
    if debug:
      print(f'## {col}:\t\t{dtype}')
    if type_checks.is_float_dtype(dtype):
      floats.append(col)
    elif type_checks.is_integer_dtype(dtype):
      ints.append(col)
    elif type_checks.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
      strings.append(col)
    else:
      other.append(col)

  if debug:
    print(f'Types::\n\tInts: {ints}')
    print(f'\tFloats: {floats}')
    print(f'\tStrings: {strings}')
    print(f'\tOther: {other}')

  return floats, ints, strings, other


def df_to_arrays(df, target_label, debug=False):
  """
  Given a DataFrame, convert it into arrays of numerics.

  Float and integer columns are used as-is; every other column is
  label-encoded with its own LabelEncoder. The column classification is done
  here, from the DataFrame itself, so the function does not depend on
  module-level 'alphas' / 'numerics' variables having been set beforehand.

  Args:
    df: source DataFrame.
    target_label: name of the column to return as y; all other columns go to X.
    debug: when True, print the name of each column being label-encoded.

  Returns:
    Tuple (X, y, target_encoder):
    * X: ndarray built from one row per feature column, i.e. shape
      (n_feature_columns, n_rows). Encoded (non-numeric) columns come first,
      then numeric columns, each group in DataFrame column order.
      NOTE: this is the transpose of the (n_samples, n_features) layout; use
      X.T if that layout is needed.
    * y: ndarray of shape (1, n_rows) holding the target column, or an empty
      array when target_label is not a column of df.
    * target_encoder: the fitted LabelEncoder used for y, or None when the
      target was already numeric.
  """
  target_encoder = None
  X = []
  y = []

  # Split columns into numeric (float/int) and everything else
  is_float = pd.api.types.is_float_dtype
  is_integer = pd.api.types.is_integer_dtype
  alphas = []
  numerics = []
  for col, dtype in df.dtypes.items():
    if is_float(dtype) or is_integer(dtype):
      numerics.append(col)
    else:
      alphas.append(col)

  # Numericize non-numerics
  for alpha_col in alphas:
    if debug:
      print(f'Label encoding col: {alpha_col}')
    label_enc = LabelEncoder()
    enc_col = label_enc.fit_transform(df[alpha_col].values)
    if alpha_col == target_label:
      # Keep the encoder so callers can map predictions back to labels
      target_encoder = label_enc
      y.append(enc_col)
    else:
      X.append(enc_col)

  for numeric_col in numerics:
    if numeric_col == target_label:
      y.append(df[numeric_col].values)
    else:
      X.append(df[numeric_col].values)

  return np.array(X), np.array(y), target_encoder

def df_retain(df, columns, debug=False):
  """
  Given a DataFrame and a list of column names, retain only the listed columns.

  Args:
    df: source DataFrame; it is not modified. May be None.
    columns: collection of column names to keep. A single name given as a
      bare string is treated as a one-element list.
    debug: when True, print the names of the columns being dropped.

  Returns:
    A new DataFrame with all but the listed columns removed, or None when df
    is None. Names in 'columns' that are not in df are ignored.
  """
  if df is None:
    return None

  # Without this, 'col not in columns' on a string is a substring test, so
  # retaining 'abc' would also keep columns named 'a' or 'ab'.
  if isinstance(columns, str):
    columns = [columns]

  drop_cols = [col for col in df.columns if col not in columns]

  if debug:
    print(f'Dropping columns: {drop_cols}')

  return df.drop(columns=drop_cols)

def create_timeindexed_df(start_date, end_date, freq='1M'):
  """
  Create an empty DataFrame with index set up with dates in a regular pattern
  for the given start/end dates and frequency.
  See https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases
  for the full frequency specification.
  Default creates an index point at every first day of the month in the given range.

  Month and year frequencies generate period-END dates, which are then shifted
  back to the first day of that period. Plain month/year aliases, optionally
  prefixed with a positive count ('M', '1M', 'ME', '2ME', 'Y', '1Y', 'YE', ...),
  are translated to offset objects, so both the legacy spellings ('M', 'Y')
  and the newer ones ('ME', 'YE') work regardless of the installed pandas
  version and all yield first-of-period dates.

  Args:
    start_date: first date of the range (anything pd.date_range accepts).
    end_date: last date of the range, inclusive.
    freq: offset alias string, or an offset/timedelta object. Objects are
      passed to pd.date_range unchanged and no shift is applied.

  Returns:
    DataFrame with no columns and a DatetimeIndex of the generated dates.

  Raises:
    AssertionError: if start_date or end_date is missing.
  """
  if (not start_date or not end_date):
    raise AssertionError('Time range required')

  # See if we need an offset
  offset = Day(0)  # Days do not, unless you're looking to back off Time units to midnight
  if isinstance(freq, str):
    translated = _period_end_freq(freq)
    if translated is not None:
      freq, offset = translated
    elif (freq[-1] == 'M'):
      # Other aliases ending in 'M' (e.g. 'BM') keep the original handling
      offset = MonthBegin(1)
    elif (freq[-1] == 'Y'):
      offset = YearBegin(1)

  # Generate index
  dates = pd.date_range(start_date, end_date, freq=freq) - offset

  # Create empty df
  return pd.DataFrame(index=dates)


def _period_end_freq(freq):
  """
  Translate a plain month-end / year-end alias into offset objects.

  Returns (period_end_offset, shift_back_offset) for aliases of the form
  '<count>M', '<count>ME', '<count>Y' or '<count>YE' (count optional and
  positive), or None for any other string.
  """
  alias_table = (
    (('ME', 'M'), pd.offsets.MonthEnd, MonthBegin),
    (('YE', 'Y'), pd.offsets.YearEnd, YearBegin),
  )
  for aliases, end_offset, begin_offset in alias_table:
    for alias in aliases:
      if not freq.endswith(alias):
        continue
      count = freq[:-len(alias)]
      # Only a bare alias or a positive integer multiple qualifies; prefixed
      # aliases such as 'BM' or 'SM' are left to the caller.
      if count == '' or (count.isdecimal() and int(count) > 0):
        return end_offset(int(count or 1)), begin_offset(1)
  return None

def clean_df(df, purge_suffixes=None):
  """
  Tidy the column labels of the given df. The df is modified IN PLACE and
  also returned.

  Performs the following:
  * drop any column whose (string) name ends in one of purge_suffixes
  * rename any column whose label is a tuple to the tuple's first entry,
    e.g. ('vals', 'mean') becomes 'vals'

  Column labels that are neither strings nor tuples (e.g. integers) are left
  untouched. Tuple labels are never matched against purge_suffixes.

  Args:
    df: DataFrame to clean.
    purge_suffixes: iterable of suffix strings; a single suffix may be given
      as a bare string. None (the default) or an empty iterable drops nothing.

  Returns:
    The same DataFrame object, after the drops and renames.
  """
  if purge_suffixes is None:
    suffixes = ()
  elif isinstance(purge_suffixes, str):
    # A bare string would otherwise be iterated character by character
    suffixes = (purge_suffixes,)
  else:
    suffixes = tuple(purge_suffixes)

  drop_list = []
  tuple_list = {}

  for col in df.columns:
    if isinstance(col, tuple):
      # rename tuple to just first entry
      tuple_list[col] = col[0]
    elif isinstance(col, str) and col.endswith(suffixes):
      # str.endswith accepts a tuple and is False for an empty one
      drop_list.append(col)

  # Drop columns w/ suffixes from list
  df.drop(columns=drop_list, inplace=True)

  # Rename tuples
  df.rename(columns=tuple_list, inplace=True)
  return df

---

**Unit Tests**

---

In [None]:
# Switch for the unit-test cells in this notebook: each one is wrapped in
# `if PU_UNIT_TEST:` and is skipped while this is False.
PU_UNIT_TEST = False

In [None]:
# Unit Testing: get_df_types(), df_to_arrays() and df_retain()
if PU_UNIT_TEST:
  def get_test_df():
    """Build (and print) a small frame with one int, one float and one string column."""
    df = pd.DataFrame({'angles': [0, 3, 4],
                      'degrees': [360.32, 180.31, 360.114],
                      'code':['A','B','C']},
                      index=['circle', 'triangle', 'rectangle'])
    print(f'DataFrame: \n{df}')
    return df

  print('--------------------------------------------')
  print('Case 1: get_df_types()')
  df = get_test_df()

  # Determine data types in given columns
  floats,ints,strings,other = get_df_types(df, True)

  # Combine the four lists into numeric and non-numeric groups
  numerics = set(floats).union(set(ints))
  alphas = set(strings).union(set(other))

  print(f'Numeric cols: {numerics}')
  print(f'Alpha cols: {alphas}')

  print('--------------------------------------------')
  print('Case 2: df_to_arrays() - alpha target')
  df = get_test_df()

  X, y, enc = df_to_arrays(df, 'code', debug=True)
  print(f'X: {X}')
  print(f'y: {y}')
  print(f'enc: {enc}')

  print('--------------------------------------------')
  print('Case 3: df_to_arrays() - alpha col, numer. target')
  df = get_test_df()

  X, y, enc = df_to_arrays(df, 'angles', debug=True)
  print(f'X: {X}')
  print(f'y: {y}')
  print(f'enc: {enc}')

  print('--------------------------------------------')
  print('Case 4: df_retain() - retain only "angles"')
  df = get_test_df()

  # df_retain() takes a LIST of column names; pass one explicitly rather than
  # a bare string.
  df = df_retain(df, ['angles'], debug=True)
  print(f'df: {df.columns}')



In [None]:
# `error` is the same class exposed publicly as re.error; sre_constants is an
# internal module that is deprecated since Python 3.11.
from re import error
if PU_UNIT_TEST:
  import pandas as pd
  from datetime import datetime as dt
  import datetime

  # Start including data from this date
  START_DATE =  pd.to_datetime(dt.fromisoformat('1999-01-01'))
  # Stop including data after this date
  END_DATE = pd.to_datetime(dt.fromisoformat('2002-12-31'))

  print('--------------------------------------------')
  print('Case 5a: create_timeindexed_df() - no date')
  try:
    # Error expected
    df = create_timeindexed_df(START_DATE, None, freq='1M')
  except AssertionError as e:
    print (f'Caught expected error: {e}')
  else:
    # Make a missing error visible instead of passing silently
    print ('ERROR: expected AssertionError was not raised')

  print('--------------------------------------------')
  print('Case 5b: create_timeindexed_df() - 1M')
  df = create_timeindexed_df(START_DATE, END_DATE, freq='1M')
  print(df.index)

  print('--------------------------------------------')
  print('Case 5c: create_timeindexed_df() - 1D')
  df = create_timeindexed_df(START_DATE, END_DATE, freq='1D')
  print(df.index)

  print('--------------------------------------------')
  print('Case 5d: create_timeindexed_df() - 1Y')
  df = create_timeindexed_df(START_DATE, END_DATE, freq='1Y')
  print(df.index)


In [None]:
if PU_UNIT_TEST:
  print('--------------------------------------------')
  print('Case 6: () - ')
  # Build the time-indexed frame, then mirror its index into a regular column
  # so the dates can be broken down into parts.
  df = create_timeindexed_df(START_DATE, END_DATE, freq='1M')
  df['date'] = df.index
  print(df.columns)
  print(df.head())
  # Calendar parts: integer year and zero-padded, two-character month string
  df['year'] = df['date'].map(lambda stamp: stamp.year)
  df['month'] = df['date'].map(lambda stamp: f'{stamp.month:02}')

In [None]:
if PU_UNIT_TEST:
  # Demo frame for clean_df(): two string labels, one of which ends in 'df',
  # plus one tuple label.
  df = pd.DataFrame(
    data={
      'days': 'Mon Tues Wed Thurs Fri'.split(),
      ('vals', 'mean'): list(range(1, 6)),
      'valdf': range(5),
    },
    index=range(5),
  )
  print(df)
  # Expect 'valdf' to be dropped and the tuple label renamed to 'vals'
  df = clean_df(df, purge_suffixes=['df'])
  print(df)

    days  (vals, mean)  valdf
0    Mon             1      0
1   Tues             2      1
2    Wed             3      2
3  Thurs             4      3
4    Fri             5      4
    days  vals
0    Mon     1
1   Tues     2
2    Wed     3
3  Thurs     4
4    Fri     5
