In [None]:
def data_quality_report_cont(dataframe, list_of_features):
    """
    Print a data quality report for continuous features of a DataFrame.

    For every feature in ``list_of_features`` the first table lists:
    - Count: number of non-missing values
    - Missing: number of missing values
    - % missing: missing values as a percentage of ALL rows
      (missing + non-missing), rounded to two decimals
    - Cardinality: ``len(Series.unique())``; pandas includes a missing
      value in ``unique()``, so NaN counts as one distinct value

    The second table is ``DataFrame.describe()`` for the same features,
    rounded to two decimals and transposed so that each feature is a row.

    Parameters:
    -----------
    dataframe : pandas.DataFrame
        The DataFrame containing the data to be analyzed.
    list_of_features : list of str
        Column names (features) to include in the report.

    Returns:
    --------
    None
        The report is printed to the console.
    """
    round_to = 2
    # Denominator for the missing percentage: every row, not only the
    # non-missing ones (dividing by the non-missing count overstates the
    # percentage and divides by zero for an all-missing column).
    total_rows = len(dataframe)

    rows = []
    for feature in list_of_features:
        column = dataframe[feature]
        total_missing = column.isnull().sum()

        if total_rows:
            percent_missing = total_missing / total_rows * 100
        else:
            # Empty frame: nothing is missing, avoid 0 / 0.
            percent_missing = 0.0

        rows.append({
            "Feature": feature,
            "Count": column.count(),
            "Missing": total_missing,
            "% missing": np.round(percent_missing, round_to),
            "Cardinality": len(column.unique()),
        })

    # Explicit column list keeps the header stable even with no features.
    df = pd.DataFrame(
        rows,
        columns=["Feature", "Count", "Missing", "% missing", "Cardinality"],
    )

    # Descriptive statistics, one row per feature
    stats = np.round(dataframe[list_of_features].describe(), round_to)
    transposed_stats = stats.T

    # Print results
    print("Data Quality for Continous Features")
    print(f'Total Features: {len(list_of_features)}')
    print(df)

    print("\n")
    print("Descriptive Stats")
    print(transposed_stats)

In [None]:
def _cat_mode_summary(value_counts, position, total_count, round_to):
    """Return (value, frequency, percent) of the mode at ``position``.

    ``value_counts`` is the result of ``Series.value_counts()`` (most
    frequent first). When the feature has fewer than ``position + 1``
    distinct non-missing values the mode does not exist and the
    placeholder ``(None, 0, 0.0)`` is returned instead of raising.
    """
    if position >= len(value_counts):
        return None, 0, 0.0
    frequency = value_counts.iloc[position]
    # A mode exists only if there is at least one non-missing value,
    # so total_count is guaranteed to be non-zero here.
    percent = np.round((frequency / total_count) * 100, round_to)
    return value_counts.index[position], frequency, percent


def data_quality_report_cat(dataframe, list_of_features):
    """
    Print a data quality report for categorical features of a DataFrame.

    For every feature in ``list_of_features`` the report lists:
    - Count: number of non-missing values
    - Missing: number of missing values
    - % Missing: missing values as a percentage of ALL rows
      (missing + non-missing), rounded to two decimals
    - Cardinality: ``len(Series.unique())``; pandas includes a missing
      value in ``unique()``, so NaN counts as one distinct value
    - Mode 1: most frequent value, its frequency, and its share of the
      non-missing values in percent
    - Mode 2: second most frequent value, with the same two figures
    - ``DataFrame.describe(include='object')`` transposed, one row per
      feature

    A feature with no second (or no first) mode is reported with the
    placeholder value ``None``, frequency 0 and percentage 0.0.

    Parameters:
    -----------
    dataframe : pandas.DataFrame
        The DataFrame containing the data to be analyzed.
    list_of_features : list of str
        Column names (features) to include in the report.

    Returns:
    --------
    None
        The report is printed to the console.
    """
    round_to = 2
    # Denominator for the missing percentage: every row, not only the
    # non-missing ones.
    total_rows = len(dataframe)

    list_feature_name = []
    list_count = []
    list_missing = []
    list_percent = []
    list_cardinality = []
    list_mode1 = []
    list_mode1_freq = []
    list_mode1_perc = []
    list_mode2 = []
    list_mode2_freq = []
    list_mode2_perc = []

    for feature in list_of_features:
        column = dataframe[feature]
        total_count = column.count()
        total_missing = column.isnull().sum()
        if total_rows:
            percent_missing = np.round(total_missing / total_rows * 100, round_to)
        else:
            # Empty frame: nothing is missing, avoid 0 / 0.
            percent_missing = 0.0
        cardinality = len(column.unique())

        # value_counts() is sorted by frequency and skips missing values,
        # so positions 0 and 1 are the first and second mode (if present).
        results = column.value_counts()
        mode1_name, mode1_count, mode1_percent = _cat_mode_summary(
            results, 0, total_count, round_to)
        mode2_name, mode2_count, mode2_percent = _cat_mode_summary(
            results, 1, total_count, round_to)

        # Append results to lists
        list_feature_name.append(feature)
        list_count.append(total_count)
        list_missing.append(total_missing)
        list_percent.append(percent_missing)
        list_cardinality.append(cardinality)
        list_mode1.append(mode1_name)
        list_mode1_freq.append(mode1_count)
        list_mode1_perc.append(mode1_percent)
        list_mode2.append(mode2_name)
        list_mode2_freq.append(mode2_count)
        list_mode2_perc.append(mode2_percent)

    # Create dataframes
    data = {
        "Feature": list_feature_name,
        "Count": list_count,
        "Missing": list_missing,
        "% Missing": list_percent,
        "Cardinality": list_cardinality,
    }

    data_mode1 = {
        "Feature": list_feature_name,
        "Mode 1": list_mode1,
        "Mode 1 Freq.": list_mode1_freq,
        "Mode 1 %": list_mode1_perc,
    }

    data_mode2 = {
        "Feature": list_feature_name,
        "Mode 2": list_mode2,
        "Mode 2 Freq.": list_mode2_freq,
        "Mode 2 %": list_mode2_perc,
    }

    df = pd.DataFrame(data)
    df1 = pd.DataFrame(data_mode1)
    df2 = pd.DataFrame(data_mode2)

    # Get descriptive statistics and transpose
    stats = dataframe[list_of_features].describe(include='object')
    transposed_stats = stats.T

    # Print results
    print('Data Quality Report for Categorical Features')
    print('============================================')
    print('Stats')
    print('-----')
    print(df)

    print('\n')
    print('Mode 1')
    print('------')
    print(df1)

    print('\n')
    print('Mode 2')
    print('------')
    print(df2)

    print('\n')
    print('Descriptive Stats')
    print('-----------------')
    print(transposed_stats)

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator != 0 else 0.0


def multiclass_cm_metrics(cm, rnd=5):
    """
    Calculate one-vs-rest performance metrics from a confusion matrix.

    Row ``i`` of ``cm`` is read as the actual class and column ``i`` as
    the predicted class: for class ``i``, TP is ``cm[i, i]``, FN is the
    rest of row ``i``, FP is the rest of column ``i`` and TN is
    everything else. The confusion matrix is also printed to the console.

    Any ratio whose denominator is zero is reported as 0 (for an all-zero
    matrix this includes both accuracy and error rate).

    Parameters:
    cm (array-like): Square confusion matrix of any size; a numpy array
        or anything ``numpy.asarray`` accepts (e.g. a nested list).
    rnd (int, optional): Number of decimal places to round the
        performance metrics. Default is 5.

    Returns:
    pandas.DataFrame: One row per metric and one column per class
                      ('Class 0', 'Class 1', ...):
                      - 'Accuracy': (TP + TN) / total
                      - 'Error rate': 1 - accuracy
                      - 'Sensitivity (Recall)': TP / (TP + FN)
                      - 'Specificity': TN / (TN + FP)
                      - 'Precision': TP / (TP + FP)
                      - 'F1': harmonic mean of precision and recall
                      - 'F2': F-beta with beta = 2 (recall weighted higher)
                      - 'F0.5': F-beta with beta = 0.5 (precision weighted higher)

    Raises:
    ValueError: If ``cm`` is not a square two-dimensional matrix.

    Example:
    >>> import numpy as np
    >>> cm = np.array([[50, 10, 5],
    ...                [5, 35, 5],
    ...                [5, 10, 40]])
    >>> metrics = multiclass_cm_metrics(cm)
    """
    # Accept nested lists as well as arrays.
    cm = np.asarray(cm)
    if cm.ndim != 2 or cm.shape[0] != cm.shape[1]:
        raise ValueError(
            f"cm must be a square 2-D confusion matrix, got shape {cm.shape}"
        )

    classes = cm.shape[0]
    total = np.sum(cm)
    metrics = ['Accuracy', 'Error rate', 'Sensitivity (Recall)',
               'Specificity', 'Precision', 'F1', 'F2', 'F0.5']
    performance_dict = {metric: [] for metric in metrics}

    # Calculate metrics for each class (one-vs-rest)
    for i in range(classes):
        TP = cm[i, i]
        FN = np.sum(cm[i, :]) - TP
        FP = np.sum(cm[:, i]) - TP
        TN = total - (TP + FP + FN)

        if total != 0:
            accuracy = (TP + TN) / total
            error_rate = 1 - accuracy
        else:
            # No observations at all: report 0 like every other ratio.
            accuracy = error_rate = 0.0

        sensitivity_recall = _safe_ratio(TP, TP + FN)
        specificity = _safe_ratio(TN, TN + FP)
        precision = _safe_ratio(TP, TP + FP)

        # F-beta = (1 + beta^2) * P * R / (beta^2 * P + R)
        f1 = _safe_ratio(2 * precision * sensitivity_recall,
                         precision + sensitivity_recall)
        f2 = _safe_ratio(5 * precision * sensitivity_recall,
                         (4 * precision) + sensitivity_recall)
        f05 = _safe_ratio(1.25 * precision * sensitivity_recall,
                          (0.25 * precision) + sensitivity_recall)

        performance_dict['Accuracy'].append(accuracy)
        performance_dict['Error rate'].append(error_rate)
        performance_dict['Sensitivity (Recall)'].append(sensitivity_recall)
        performance_dict['Specificity'].append(specificity)
        performance_dict['Precision'].append(precision)
        performance_dict['F1'].append(f1)
        performance_dict['F2'].append(f2)
        performance_dict['F0.5'].append(f05)

    # Build with classes as rows, then transpose so classes are columns
    performance_df = pd.DataFrame(performance_dict,
                                  index=[f'Class {i}' for i in range(classes)])
    performance_df = performance_df.T.round(rnd)  # Transpose and round

    print("Confusion Matrix:")
    print(cm)
    return performance_df