In [54]:
import numpy as np
from sklearn.utils.validation import check_array,check_consistent_length
from sklearn.utils.multiclass import type_of_target
from scipy import sparse as sp
from math import log
def contingency_matrix(
    labels_true, labels_pred, *, eps=None, sparse=False, dtype=np.int64
):
    """Build a contingency matrix describing the relationship between labels.

    Parameters
    ----------
    labels_true : array-like of shape (n_samples,)
        Ground truth class labels to be used as a reference.

    labels_pred : array-like of shape (n_samples,)
        Cluster labels to evaluate.

    eps : float, default=None
        If a float, that value is added to all values in the contingency
        matrix. This helps to stop NaN propagation.
        If ``None``, nothing is adjusted.

    sparse : bool, default=False
        If `True`, return a sparse CSR continency matrix. If `eps` is not
        `None` and `sparse` is `True` will raise ValueError.

        .. versionadded:: 0.18

    dtype : numeric type, default=np.int64
        Output dtype. Ignored if `eps` is not `None`.

        .. versionadded:: 0.24

    Returns
    -------
    contingency : {array-like, sparse}, shape=[n_classes_true, n_classes_pred]
        Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in
        true class :math:`i` and in predicted class :math:`j`. If
        ``eps is None``, the dtype of this array will be integer unless set
        otherwise with the ``dtype`` argument. If ``eps`` is given, the dtype
        will be float.
        Will be a ``sklearn.sparse.csr_matrix`` if ``sparse=True``.
    """

    if eps is not None and sparse:
        raise ValueError("Cannot set 'eps' when sparse=True")

    classes, class_idx = np.unique(labels_true, return_inverse=True)
    clusters, cluster_idx = np.unique(labels_pred, return_inverse=True)
    n_classes = classes.shape[0]
    n_clusters = clusters.shape[0]
    # Using coo_matrix to accelerate simple histogram calculation,
    # i.e. bins are consecutive integers
    # Currently, coo_matrix is faster than histogram2d for simple cases
    contingency = sp.coo_matrix(
        (np.ones(class_idx.shape[0]), (class_idx, cluster_idx)),
        shape=(n_classes, n_clusters),
        dtype=dtype,
    )
    if sparse:
        contingency = contingency.tocsr()
        contingency.sum_duplicates()
    else:
        contingency = contingency.toarray()
        if eps is not None:
            # don't use += as contingency is integer
            contingency = contingency + eps
    return contingency

def check_clusterings(labels_true, labels_pred):
    """Check that the labels arrays are 1D and of same dimension.

    Parameters
    ----------
    labels_true : array-like of shape (n_samples,)
        The true labels.

    labels_pred : array-like of shape (n_samples,)
        The predicted labels.
    """
    labels_true = check_array(
        labels_true,
        ensure_2d=False,
        ensure_min_samples=0,
        dtype=None,
    )

    labels_pred = check_array(
        labels_pred,
        ensure_2d=False,
        ensure_min_samples=0,
        dtype=None,
    )

    type_label = type_of_target(labels_true)
    type_pred = type_of_target(labels_pred)

    if "continuous" in (type_pred, type_label):
        msg = (
            "Clustering metrics expects discrete values but received"
            f" {type_label} values for label, and {type_pred} values "
            "for target"
        )
        warnings.warn(msg, UserWarning)

    # input checks
    if labels_true.ndim != 1:
        raise ValueError("labels_true must be 1D: shape is %r" % (labels_true.shape,))
    if labels_pred.ndim != 1:
        raise ValueError("labels_pred must be 1D: shape is %r" % (labels_pred.shape,))
    check_consistent_length(labels_true, labels_pred)

    return labels_true, labels_pred

def mutual_info_score(labels_true, labels_pred, *, contingency=None):
    """Mutual Information between two clusterings.

    The Mutual Information is a measure of the similarity between two labels
    of the same data. Where :math:`|U_i|` is the number of the samples
    in cluster :math:`U_i` and :math:`|V_j|` is the number of the
    samples in cluster :math:`V_j`, the Mutual Information
    between clusterings :math:`U` and :math:`V` is given as:

    .. math::

        MI(U,V)=\\sum_{i=1}^{|U|} \\sum_{j=1}^{|V|} \\frac{|U_i\\cap V_j|}{N}
        \\log\\frac{N|U_i \\cap V_j|}{|U_i||V_j|}

    This metric is independent of the absolute values of the labels:
    a permutation of the class or cluster label values won't change the
    score value in any way.

    This metric is furthermore symmetric: switching :math:`U` (i.e
    ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the
    same score value. This can be useful to measure the agreement of two
    independent label assignments strategies on the same dataset when the
    real ground truth is not known.

    Read more in the :ref:`User Guide <mutual_info_score>`.

    Parameters
    ----------
    labels_true : array-like of shape (n_samples,), dtype=integral
        A clustering of the data into disjoint subsets, called :math:`U` in
        the above formula.

    labels_pred : array-like of shape (n_samples,), dtype=integral
        A clustering of the data into disjoint subsets, called :math:`V` in
        the above formula.

    contingency : {array-like, sparse matrix} of shape \
            (n_classes_true, n_classes_pred), default=None
        A contingency matrix given by the
        :func:`~sklearn.metrics.cluster.contingency_matrix` function. If value
        is ``None``, it will be computed, otherwise the given value is used,
        with ``labels_true`` and ``labels_pred`` ignored.

    Returns
    -------
    mi : float
       Mutual information, a non-negative value, measured in nats using the
       natural logarithm.

    See Also
    --------
    adjusted_mutual_info_score : Adjusted against chance Mutual Information.
    normalized_mutual_info_score : Normalized Mutual Information.

    Notes
    -----
    The logarithm used is the natural logarithm (base-e).
    """
    if contingency is None:
        labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
        contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
    else:
        contingency = check_array(
            contingency,
            accept_sparse=["csr", "csc", "coo"],
            dtype=[int, np.int32, np.int64],
        )

    if isinstance(contingency, np.ndarray):
        # For an array
        nzx, nzy = np.nonzero(contingency)
        nz_val = contingency[nzx, nzy]
    else:
        # For a sparse matrix
        nzx, nzy, nz_val = sp.find(contingency)

    contingency_sum = contingency.sum()
    pi = np.ravel(contingency.sum(axis=1))
    pj = np.ravel(contingency.sum(axis=0))
    print(pi)
    print(pj)
    # Since MI <= min(H(X), H(Y)), any labelling with zero entropy, i.e. containing a
    # single cluster, implies MI = 0
    if pi.size == 1 or pj.size == 1:
        return 0.0

    log_contingency_nm = np.log(nz_val)
    contingency_nm = nz_val / contingency_sum
    # Don't need to calculate the full outer product, just for non-zeroes
    outer = pi.take(nzx).astype(np.int64, copy=False) * pj.take(nzy).astype(
        np.int64, copy=False
    )
    log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum())

    mi = (
        contingency_nm * (log_contingency_nm - log(contingency_sum))
        + contingency_nm * log_outer
    )
   
    mi = np.where(np.abs(mi) < np.finfo(mi.dtype).eps, 0.0, mi)
    return np.clip(mi.sum(), 0.0, None)


def normalized_mutual_info_score(
    labels_true, labels_pred, *, average_method="arithmetic"
):
    """Normalized Mutual Information between two clusterings.

    Normalized Mutual Information (NMI) is a normalization of the Mutual
    Information (MI) score to scale the results between 0 (no mutual
    information) and 1 (perfect correlation). In this function, mutual
    information is normalized by some generalized mean of ``H(labels_true)``
    and ``H(labels_pred))``, defined by the `average_method`.

    This measure is not adjusted for chance. Therefore
    :func:`adjusted_mutual_info_score` might be preferred.

    This metric is independent of the absolute values of the labels:
    a permutation of the class or cluster label values won't change the
    score value in any way.

    This metric is furthermore symmetric: switching ``label_true`` with
    ``label_pred`` will return the same score value. This can be useful to
    measure the agreement of two independent label assignments strategies
    on the same dataset when the real ground truth is not known.

    Read more in the :ref:`User Guide <mutual_info_score>`.

    Parameters
    ----------
    labels_true : int array-like of shape (n_samples,)
        A clustering of the data into disjoint subsets.

    labels_pred : int array-like of shape (n_samples,)
        A clustering of the data into disjoint subsets.

    average_method : {'min', 'geometric', 'arithmetic', 'max'}, default='arithmetic'
        How to compute the normalizer in the denominator.

        .. versionadded:: 0.20

        .. versionchanged:: 0.22
           The default value of ``average_method`` changed from 'geometric' to
           'arithmetic'.

    Returns
    -------
    nmi : float
       Score between 0.0 and 1.0 in normalized nats (based on the natural
       logarithm). 1.0 stands for perfectly complete labeling.

    See Also
    --------
    v_measure_score : V-Measure (NMI with arithmetic mean option).
    adjusted_rand_score : Adjusted Rand Index.
    adjusted_mutual_info_score : Adjusted Mutual Information (adjusted
        against chance).

    Examples
    --------

    Perfect labelings are both homogeneous and complete, hence have
    score 1.0::

      >>> from sklearn.metrics.cluster import normalized_mutual_info_score
      >>> normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])
      ... # doctest: +SKIP
      1.0
      >>> normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])
      ... # doctest: +SKIP
      1.0

    If classes members are completely split across different clusters,
    the assignment is totally in-complete, hence the NMI is null::

      >>> normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])
      ... # doctest: +SKIP
      0.0
    """
    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
    classes = np.unique(labels_true)
    clusters = np.unique(labels_pred)
    # Special limit cases: no clustering since the data is not split.
    # It corresponds to both labellings having zero entropy.
    # This is a perfect match hence return 1.0.
    if (
        classes.shape[0] == clusters.shape[0] == 1
        or classes.shape[0] == clusters.shape[0] == 0
    ):
        return 1.0

    contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
    contingency = contingency.astype(np.float64, copy=False)
    # Calculate the MI for the two clusterings
    mi = mutual_info_score(labels_true, labels_pred, contingency=contingency)
    #print(labels_true)
   # print(labels_pred)
    # At this point mi = 0 can't be a perfect match (the special case of a single
    # cluster has been dealt with before). Hence, if mi = 0, the nmi must be 0 whatever
    # the normalization.
   
    if mi == 0:
        return 0.0

    # Calculate entropy for each labeling
    h_true, h_pred = entropy(labels_true), entropy(labels_pred)

    normalizer = _generalized_average(h_true, h_pred, average_method)
    
    return mi / normalizer


In [56]:
def entropy2(labels):
    n_labels = len(labels)

    if n_labels <= 1:
        return 0

    counts = np.bincount(labels)
    probs = counts / n_labels
    n_classes = np.count_nonzero(probs)

    if n_classes <= 1:
        return 0

    ent = 0.

    # Compute standard entropy.
    for i in probs:
        ent -= i * log(i, base=n_classes)

    return ent


IndentationError: unexpected indent (2254884474.py, line 3)

In [55]:
a =[0,1,2,3,4,5]
b =[5,5,5,5,5,5]
c = normalized_mutual_info_score(a,b)
print(c)

[1 1 1 1 1 1]
[6]
0.0


In [33]:
import numpy as np
a =[0,1,2,3,4,5]
b =[5,5,5,5,5,5]

[0, 0, 0, 0]
[0, 1, 2, 3]

def shannon_entropy(A, mode="auto", verbose=False):
    """
    https://stackoverflow.com/questions/42683287/python-numpy-shannon-entropy-array
    """
    A = np.asarray(A)

    # Determine distribution type
    if mode == "auto":
        condition = np.all(A.astype(float) == A.astype(int))
        print(condition)
        if condition:
            mode = "discrete"
        else:
            mode = "continuous"
    if verbose:
        print(mode, file=sys.stderr)
    # Compute shannon entropy
    pA = A / A.sum()
    print(f"A.sum() {A.sum()}")
    # Remove zeros
    pA = pA[np.nonzero(pA)[0]]
    if mode == "continuous":
        return -np.sum(pA*np.log2(A))  
    if mode == "discrete":
        return -np.sum(pA*np.log2(pA))   

def mutual_information(x,y, mode="auto", normalized=False):
    """
    I(X, Y) = H(X) + H(Y) - H(X,Y)
    https://stackoverflow.com/questions/20491028/optimal-way-to-compute-pairwise-mutual-information-using-numpy
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # Determine distribution type
    print(mode)
    if mode == "auto":
        condition_1 = np.all(x.astype(float) == x.astype(int))
        condition_2 = np.all(y.astype(float) == y.astype(int))
        print(condition_1)
        print(condition_2)
        if all([condition_1, condition_2]):
            mode = "discrete"
        else:
            mode = "continuous"

    H_x = shannon_entropy(x, mode=mode)
    print(H_x)
    H_y = shannon_entropy(y, mode=mode)
    print(H_y)
    H_xy = shannon_entropy(np.concatenate([x,y]), mode=mode)

    # Mutual Information
    I_xy = H_x + H_y - H_xy
    if normalized:
        return I_xy/np.sqrt(H_x*H_y)
    else:
        return  I_xy
    
mutual_information([0, 0, 0, 0],[0, 1, 2, 3],normalized =True)

auto
True
True
A.sum() 0
nan
A.sum() 6
1.4591479170272448
A.sum() 6


  pA = A / A.sum()


nan

In [17]:
from sys import argv
from math import log
import os
print(os.getcwd())
S = 0

def read_cover(filename):
    cover = {}
    global S
    nodes = set()
    with open(filename) as f:
        for line in f:
            node, c = line.split()
            nodes.add(node)
            if c not in cover:
                cover[c] = set([node])
            else:
                cover[c].add(node)
    S = len(nodes)
    return cover, nodes

def mutual_info(c_A, c_B):
    print(c_A) 
    print(c_B)
    N_mA = len(c_A)
    N_mB = len(c_B)
    I_num = 0
    for i in c_A:
        for j in c_B:
            n_i = len(c_A[i])
            n_j = len(c_B[j])
            n_ij = len(c_A[i] & c_B[j])
            if n_ij == 0:
                continue
            log_term = log((n_ij * S) / (n_i * n_j))

            I_num += n_ij * log_term
    I_num *= -2

    I_den = 0
    for i in c_A:
        n_i = len(c_A[i])
        I_den += n_i * log(n_i / S)

    for j in c_B:
        n_j = len(c_B[j])
        I_den += n_j * log(n_j / S)

    I = I_num / I_den
    return I

def main():
    if len(argv) < 3:
        print('Enter the filename of the two covers as command line args')
        return

    c_A, nodes_A = read_cover("cover1")
    c_B, nodes_B = read_cover("cover2")
    
   #if nodes_A == nodes_B:
   #    print('Improper covers! Please check the inputs.')
   #    return
    I = mutual_info(c_A, c_B)
    print('The mutual information of the two covers is {}'.format(I))

if __name__ == '__main__':
    main()

C:\JupyterDoc\SOM
{'0': {'3', 'c', 'b', 'a'}, '1': {'6', 'd'}}
{'0': {'c', 'b', 'a'}, '1': {'3', 'd'}, '2': {'6'}}
The mutual information of the two covers is 0.4920936619047235


In [None]:
import pandas as pd
import numpy as np
import newSom 
import importlib
importlib.reload(newSom)
import experiment
import dataset_read
import researchpy as rp
import scipy.stats as stats
import matplotlib.pyplot as plt
import collections
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

In [None]:
pip install jupyter-cache

In [None]:
dataread = dataset_read.DATAREAD()

In [None]:
csv = pd.read_csv("CustomerSegmentation/Train.csv")
csv2 = pd.read_csv("CustomerSegmentation/Test.csv")
csv = csv.drop(columns=['ID','Segmentation'])
csv2 = csv2.drop(columns=['ID'])   

csv['Family_Size'] = pd.to_numeric(csv['Family_Size'])
csv2['Family_Size'] = pd.to_numeric(csv2['Family_Size'])
csv['Work_Experience'] = pd.to_numeric(csv['Work_Experience'])
csv2['Work_Experience'] = pd.to_numeric(csv2['Work_Experience'])

csv_original_encode1 = csv
csv_original_encode2 = csv2

# class label do not need to onehot encoding, label ercoding will be OK
dataread.label_encoding(csv_original_encode1,"Var_1")
#one hot encoding vs proposed
dataread.effect_encoding(csv_original_encode1,["Gender","Ever_Married","Graduated","Profession","Spending_Score","Family_Size","Work_Experience"])


csv_training_original_encoded = dataread.original_encoding_data.sample(int(dataread.original_encoding_data.shape[0]*0.5))


dataread.label_encoding(csv_original_encode2,"Var_1")
#one hot encoding vs proposed
dataread.effect_encoding(csv_original_encode2,["Gender","Ever_Married","Graduated","Profession","Spending_Score","Family_Size","Work_Experience"])

#dataread.original_encoding_data is udpated through dataread.label_encoding function
csv_test_original_encoded = dataread.original_encoding_data

dataread.label_encoding(csv,"Gender")
dataread.label_encoding(csv,"Ever_Married")
dataread.label_encoding(csv,"Graduated")
dataread.label_encoding(csv,"Profession")
dataread.label_encoding(csv,"Spending_Score")
dataread.label_encoding(csv,"Var_1")
dataread.label_encoding(csv,"Family_Size")
dataread.label_encoding(csv,"Work_Experience")



dataread.label_encoding(csv2,"Gender")
dataread.label_encoding(csv2,"Ever_Married")
dataread.label_encoding(csv2,"Graduated")
dataread.label_encoding(csv2,"Profession")
dataread.label_encoding(csv2,"Spending_Score")
dataread.label_encoding(csv2,"Var_1")
dataread.label_encoding(csv2,"Family_Size")
dataread.label_encoding(csv2,"Work_Experience")



csv_training = csv.sample(int(csv.shape[0]*0.5))
csv_test = csv2




dataread.initializedataset(csv,csv_training,csv_test,csv_training_original_encoded,csv_test_original_encoded,"Var_1")

In [None]:
#onhot encoding vs proposed
import experiment
unstable_repeat_num= 30
scope_num = 80
class_num = 13
dim_num = 7
best_num = 48
interval = 3


experiment = experiment.Experiment()
experiment.UTtest_Discrete_Continuous(dataread,False,class_num,best_num, scope_num,unstable_repeat_num,0,interval,1,3)