[Dealing with categorical features in machine learning](https://medium.com/hugo-ferreiras-blog/dealing-with-categorical-features-in-machine-learning-1bb70f07262d)

In [51]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

# Show every column when a wide frame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', 999)

# Seed for the synthetic numeric column below, so a fresh-kernel re-run
# reproduces exactly the same values.
SEED = 42

# names of columns, as per description
cols_names = [
  'class',
  'age',
  'menopause',
  'tumor-size', 
  'inv-nodes',
  'node-caps',
  'deg-malig',
  'breast', 
  'breast-quad',
  'irradiat'
]


# The file is read without a header row, so the names are supplied explicitly;
# cells holding '?' are turned into NaN so pandas treats them as missing.
df = pd.read_csv('breast-cancer.data', header=None, names=cols_names).replace({'?': np.nan})

# Extra continuous column of uniform draws in [0, 1). A dedicated, seeded
# generator is used so the global NumPy random state is left untouched and the
# column is identical on every run.
df['float'] = np.random.default_rng(SEED).random(len(df))
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,float
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no,0.472275
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no,0.223552
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no,0.80029
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no,0.468268
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.653099


In [104]:
def describe(df):
    """Build a one-row-per-column summary of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Column labels are used as lookup keys, so they
        are expected to be unique.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in the same order) with these columns:

        * ``feature`` - column label
        * ``dtype`` - dtype of the column
        * ``null_count`` / ``non-null_count`` - number of missing / present cells
        * ``unique_count`` - number of distinct non-null values
        * ``max``, ``min``, ``mean``, ``median``, ``std`` - statistics for
          columns of the listed int/float dtypes, NaN for every other column
        * ``top5_values`` - up to five most frequent values (NaN counted as a
          value), most frequent first, joined with ``', '``
        * ``top_value_ratio`` - share of rows taken by the most frequent
          value; NaN when ``df`` has no rows
    """
    nrows = len(df)
    df_ret = pd.DataFrame()
    df_ret['feature'] = df.columns
    df_ret['dtype'] = df.dtypes.values
    df_ret['null_count'] = df.isnull().sum().values
    df_ret['non-null_count'] = df.notnull().sum().values
    df_ret['unique_count'] = df.nunique().values

    # numeric features: each reduction yields a Series indexed by column label,
    # so mapping the feature names through it leaves NaN for non-numeric columns
    df_numeric = df.select_dtypes(['int16', 'int32', 'int64', 'float16', 'float32', 'float64'])
    for stat in ('max', 'min', 'mean', 'median', 'std'):
        df_ret[stat] = df_ret['feature'].map(getattr(df_numeric, stat)())

    # unique values: collect per-column results in lists and assign them once,
    # instead of a boolean-mask .loc write for every column
    top5_values = []
    top_value_ratio = []
    for col in df.columns:
        # value_counts sorts by descending frequency; dropna=False keeps NaN
        # as a candidate for "most frequent value"
        val_counts = df[col].value_counts(dropna=False)
        top5_values.append(', '.join(map(str, val_counts.index[:5].tolist())))
        # a zero-row frame has no most frequent value: report NaN instead of
        # failing on an empty value_counts / dividing by zero
        top_value_ratio.append(val_counts.values[0] / nrows if nrows else float('nan'))

    # explicit dtypes keep the result schema the same for any input, including
    # one where the loop above never runs
    df_ret['top5_values'] = pd.Series(top5_values, index=df_ret.index, dtype=object)
    df_ret['top_value_ratio'] = pd.Series(top_value_ratio, index=df_ret.index, dtype='float64')

    return df_ret
  
# Display the per-column summary of the frame loaded from 'breast-cancer.data'.
describe(df)

Unnamed: 0,feature,dtype,null_count,non-null_count,unique_count,max,min,mean,median,std,top5_values,top_value_ratio
0,class,object,0,286,2,,,,,,"no-recurrence-events, recurrence-events",0.702797
1,age,object,0,286,6,,,,,,"50-59, 40-49, 60-69, 30-39, 70-79",0.335664
2,menopause,object,0,286,3,,,,,,"premeno, ge40, lt40",0.524476
3,tumor-size,object,0,286,11,,,,,,"30-34, 25-29, 20-24, 15-19, 10-14",0.20979
4,inv-nodes,object,0,286,7,,,,,,"0-2, 3-5, 6-8, 9-11, 15-17",0.744755
5,node-caps,object,8,278,2,,,,,,"no, yes, nan",0.776224
6,deg-malig,int64,0,286,3,3.0,1.0,2.048951,2.0,0.738217,"2, 3, 1",0.454545
7,breast,object,0,286,2,,,,,,"left, right",0.531469
8,breast-quad,object,1,285,5,,,,,,"left_low, left_up, right_up, right_low, central",0.384615
9,irradiat,object,0,286,2,,,,,,"no, yes",0.762238
