In [1]:
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import seaborn as sns
from IPython.display import display_html
import pandas as pd
import numpy as np
import os
import re

In [2]:
# https://www.kaggle.com/code/kimtaehun/breif-eda-and-xgb-baseline-with-full-dataset
def summary(df):
    """
    Print the shape of `df` and display a per-column overview table.

    The table has one row per column of `df` and reports the dtype, the
    number and percentage of missing values, the number of unique values,
    the min/max taken from `df.describe(include='all')` (or 'N/A' when
    describe() yields no such statistic), and the values found in the
    first three rows (or 'N/A' when `df` has too few rows).

    Parameters:
        df (pd.DataFrame): The frame to summarise.

    Returns:
        None. The table is shown through `display_html`, not returned.
    """
    print(f'data shape: {df.shape}')

    missing_counts = df.isnull().sum().values
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = missing_counts
    summ['%missing'] = missing_counts / len(df) * 100
    summ['#unique'] = df.nunique().values

    # describe() is transposed so that each row corresponds to a column of df.
    desc = pd.DataFrame(df.describe(include='all').transpose())
    for stat in ('min', 'max'):
        summ[stat] = desc[stat].values if stat in desc.columns else 'N/A'

    # Sample values from the first three rows, when those rows exist.
    n_rows = len(df)
    for position, label in enumerate(('first value', 'second value', 'third value')):
        summ[label] = df.iloc[position].values if n_rows > position else 'N/A'

    display_html(summ)
    print("\n")

In [3]:
# From ChatGPT, then edited by me
def ohe_categorical(df, categorical_cols, threshold=10, keep_col = True):
    """
    One-hot encode the low-cardinality columns among `categorical_cols`.

    Parameters:
        df (pd.DataFrame): The input DataFrame. It is not modified; a new
                           frame is built whenever a column gets encoded.
        categorical_cols (list): Column names that are candidates for encoding.
        threshold (int): A column is encoded only when its number of unique
                         values (as counted by `nunique`) is at most this.
        keep_col (bool): When True, the original column is re-attached after
                         its dummy columns; when False it is replaced by them.

    Returns:
        pd.DataFrame: The DataFrame with the one-hot encoded columns added.
        list: Names of the columns that were skipped (too many unique values).
    """
    skipped = []

    for name in categorical_cols:
        # Too many distinct values: leave the column alone and report it.
        if df[name].nunique() > threshold:
            skipped.append(name)
            continue

        # get_dummies drops the source column, so hold on to it if requested.
        original_values = df[name] if keep_col else None
        df = pd.get_dummies(df, columns=[name])
        if keep_col:
            df[name] = original_values

    return df, skipped

In [4]:
def get_columns_by_type(df, data_type):
    """
    List the columns of `df` whose dtype compares equal to `data_type`.

    Parameters:
        df (pd.DataFrame): The frame to inspect.
        data_type: Anything that can be compared with a column dtype using
                   `==`, e.g. 'int64' or np.float64.

    Returns:
        list: Matching column names, in the order they appear in `df`.
    """
    return [name for name in df.columns if df[name].dtype == data_type]

In [5]:
# https://www.kaggle.com/code/andradaolteanu/rsna-fracture-detection-dicom-images-explore
def df_info(df, name="Default"):
    """
    Print a short, colour-highlighted overview of `df` and show its tail.

    Printed, in order: a banner with `name`, the row and column counts
    (thousands-separated), the total number of missing values, and the
    list of column names. The last rows of `df` are then displayed.

    Parameters:
        df (pd.DataFrame): The frame to describe.
        name (str): Label used in the banner line.

    Returns:
        None
    """
    def heading(text):
        # Wrap a label in the start/end escape codes defined on `clr`.
        return clr.S + text + clr.E

    print(heading(f"=== {name} ==="))
    print(heading("Shape:"), format(df.shape[0], ","), format(df.shape[1], ","))
    print(
        heading("Missing Values:"),
        format(df.isna().sum().sum(), ","),
        "total missing datapoints.",
    )
    print(heading("Columns:"), list(df.columns), "\n")

    display_html(df.tail())
    print("\n")

class clr:
    """ANSI escape sequences used to highlight labels in printed output."""
    # Start of a highlighted span: bold (1) followed by bright blue (94).
    S = '\033[1m\033[94m'
    # End of a highlighted span: reset all attributes.
    E = '\033[0m'
    
# Ten hex colours, kept in the order they are handed to the colormap.
my_colors = [
    "#5EAFD9", "#449DD1", "#3977BB", "#2D51A5", "#5C4C8F",
    "#8B4679", "#C53D4C", "#E23836", "#FF4633", "#FF5746",
]
# Discrete matplotlib colormap built from the palette above.
CMAP1 = ListedColormap(my_colors)

In [6]:
# These helper and data cleaning functions are from the old fast.ai course
# The repository is here: https://github.com/fastai/fastai/tree/master/old
def display_all(df):
    """
    Display `df` with the pandas row and column display limits raised to 1000.

    The limits are changed only for the duration of the call.
    """
    wide_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_limits):
        display(df)
        
def make_date(df, date_field:str):
    """
    Make sure `df[date_field]` is of a datetime dtype, converting it if needed.

    Parameters:
        df (pd.DataFrame): Frame holding the column; the column is replaced
                           in place when a conversion is required.
        date_field (str): Name of the column to check.

    Returns:
        None
    """
    # is_datetime64_any_dtype accepts both naive and tz-aware datetimes and,
    # unlike np.issubdtype, does not raise TypeError when the column uses a
    # pandas extension dtype (string, category, ...).
    if not pd.api.types.is_datetime64_any_dtype(df[date_field]):
        # infer_datetime_format is deprecated since pandas 2.0, where format
        # inference became the default, so it is no longer passed.
        df[date_field] = pd.to_datetime(df[date_field])
        


def add_datepart(df, fldnames, drop=True, time=False, errors="raise"):
    """
    Expand one or more date columns of `df` into separate date-part columns.

    For every field the new columns are named `<prefix><Part>`, where
    `<prefix>` is the field name with a trailing 'date'/'Date' removed and
    `<Part>` is one of Year, Month, Day, Dayofweek, Dayofyear,
    Is_month_end, Is_month_start, Is_quarter_end, Is_quarter_start,
    Is_year_end, Is_year_start (plus Hour, Minute, Second when `time`).

    Parameters:
        df (pd.DataFrame): The input frame.
        fldnames (str or list): Name, or list of names, of the date columns.
        drop (bool): Drop each source column once its parts were added.
        time (bool): Also add the Hour, Minute and Second parts.
        errors (str): Passed to `pd.to_datetime` when a column has to be
                      converted ('raise', 'coerce', ...).

    Returns:
        pd.DataFrame: The frame with the date-part columns. Always use this
        return value: new columns are written into the frame passed in, but
        dropping a source column produces a new frame, so the caller's
        original object may end up only partially updated.
    """
    if isinstance(fldnames, str):
        fldnames = [fldnames]

    for fldname in fldnames:
        fld = df[fldname]

        # is_datetime64_any_dtype covers naive and tz-aware datetimes and does
        # not raise on pandas extension dtypes the way np.issubdtype does.
        if not pd.api.types.is_datetime64_any_dtype(fld):
            # infer_datetime_format is deprecated since pandas 2.0 (inference
            # is the default), so it is no longer passed.
            df[fldname] = fld = pd.to_datetime(fld, errors=errors)

        targ_pre = re.sub('[Dd]ate$', '', fldname)
        attr = ['Year', 'Month', 'Day', 'Dayofweek', 'Dayofyear',
                'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
                'Is_year_end', 'Is_year_start']

        if time:
            attr = attr + ['Hour', 'Minute', 'Second']

        # Each part name, lower-cased, is an attribute of the .dt accessor.
        for n in attr:
            df[targ_pre + n] = getattr(fld.dt, n.lower())

        if drop:
            df = df.drop(fldname, axis=1)

    return df

        
        
def ifnone(a, b):
    """Return `a` unless it is None, in which case return `b`."""
    return a if a is not None else b

def train_cats(df):
    """
    Convert every string column of `df` to an ordered categorical, in place.

    Parameters:
        df (pd.DataFrame): The frame to modify. Columns that are not of a
                           string dtype are left untouched.

    Returns:
        None
    """
    for n, c in df.items():
        # Qualified through pd.api.types: the bare name is_string_dtype is
        # not imported anywhere in this notebook's import cell.
        if pd.api.types.is_string_dtype(c):
            df[n] = c.astype('category').cat.as_ordered()

def apply_cats(df, trn):
    """
    Give the columns of `df` the same ordered categories as those in `trn`.

    For every column of `df` that also exists in `trn` with a 'category'
    dtype, the column is converted to an ordered categorical using exactly
    the categories of `trn`. Values absent from those categories become
    missing.

    Parameters:
        df (pd.DataFrame): The frame to modify in place.
        trn (pd.DataFrame): The reference frame whose categorical columns
                            define the categories.

    Returns:
        None
    """
    for n, c in df.items():
        if (n in trn.columns) and (trn[n].dtype.name == 'category'):
            # set_categories no longer accepts inplace=True (removed in
            # pandas 2.0), so the re-categorised column is assigned back.
            df[n] = c.astype('category').cat.set_categories(
                trn[n].cat.categories, ordered=True)

def numericalize(df, col, name, max_n_cat):
    """
    Replace a non-numeric column by its integer category codes plus one.

    The codes come from `pd.Categorical(col).codes`, so missing values
    (code -1) are stored as 0 and real categories start at 1.

    Parameters:
        df (pd.DataFrame): The frame that receives the result, in place.
        col (pd.Series): The column to encode.
        name (str): Column name under which the codes are stored in `df`.
        max_n_cat (int or None): When None, every non-numeric column is
            encoded. Otherwise the column is encoded only if it has more
            than `max_n_cat` categories; this check reads `col.cat`, so
            `col` must then already be categorical.

    Returns:
        None
    """
    # Qualified through pd.api.types: the bare name is_numeric_dtype is not
    # imported anywhere in this notebook's import cell.
    if not pd.api.types.is_numeric_dtype(col) and (
            max_n_cat is None or len(col.cat.categories) > max_n_cat):
        df[name] = pd.Categorical(col).codes + 1

def rf_feat_importance(m, df):
    """
    Pair each column of `df` with the matching importance reported by `m`.

    Parameters:
        m: A fitted model exposing a `feature_importances_` attribute.
        df (pd.DataFrame): The frame whose columns correspond to those
                           importances, in the same order.

    Returns:
        pd.DataFrame: Columns 'cols' and 'imp', sorted by 'imp' descending.
    """
    importances = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importances.sort_values(by='imp', ascending=False)

############################
# End fast.ai functions...
############################

# This function I believe came from this guy: https://www.kaggle.com/siavrez

def reduce_mem_usage(df, verbose=True):
    """
    Downcast the numeric columns of `df` to the smallest dtype that fits.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are considered. For each one the column minimum and maximum are compared
    with the representable range of the candidate dtypes, smallest first.

    Parameters:
        df (pd.DataFrame): The frame to shrink; its columns are replaced
                           in place.
        verbose (bool): Print the final memory usage and the reduction.

    Returns:
        pd.DataFrame: The same `df` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly -128..127 fits
            # in int8. If min/max are NaN (empty column) nothing matches and
            # the column is left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE(review): float16/float32 are chosen on range alone, so the
            # downcast can round values; confirm that is acceptable for the
            # data before relying on the shrunken columns.
            target = np.float64  # fallback, also used for NaN/inf extremes
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df