                                                      Fuzzy-C-Means


In [1]:
import pandas as pd
import numpy as np
import random
import operator
import math

                                  Implementing Fuzzy-C-Means On Creditcard Dataset

In [2]:
# Load the transactions from a CSV file addressed relative to the working directory.
data = pd.read_csv("creditcard.csv")

                                  Taking equal no. of Fraud & Non-Fraud Cases

In [3]:
# Partition the transactions by the value stored in the 'Class' column:
# rows with Class == 1 go to `fraud`, rows with Class == 0 to `non_fraud`.
fraud = data.loc[data['Class'] == 1]
non_fraud = data.loc[data['Class'] == 0]

In [4]:
non_fraud.shape  # (rows, columns) of the Class == 0 subset

(284315, 31)

In [5]:
 fraud.shape  # (rows, columns) of the Class == 1 subset

(492, 31)

In [6]:
# Down-sample the Class == 0 rows to the number of Class == 1 rows so both
# classes are equally represented. The fixed random_state makes the draw,
# and therefore every result computed from it, repeatable across kernel
# restarts.
non_fraud = non_fraud.sample(n=fraud.shape[0], random_state=42)
non_fraud.shape

(492, 31)

In [7]:
# Stack the fraud rows on top of the sampled non-fraud rows and renumber the
# index from 0. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
data = pd.concat([fraud, non_fraud], ignore_index=True)
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00,1
1,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00,1
2,4462.0,-2.303350,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.562320,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.087330,-0.156114,-0.542628,0.039566,-0.153029,239.93,1
3,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.00,1
4,7519.0,1.234235,3.019740,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,8137.0,1.158417,-0.082388,0.724182,1.006849,-0.654996,-0.265482,-0.409298,0.044077,1.868635,...,-0.196522,-0.209020,-0.083478,0.289390,0.477143,0.386090,-0.058688,-0.006674,19.99,0
980,2720.0,-1.304077,-4.952189,-1.584203,0.492046,-2.073699,-0.053731,1.502903,-0.479615,-1.174571,...,0.422547,-1.383065,-1.282866,0.129743,-0.069458,0.946466,-0.337225,0.226632,1399.20,0
981,150793.0,0.037481,0.807068,0.247838,-0.602129,0.413638,-1.013318,0.958786,-0.142544,0.051902,...,-0.265579,-0.611370,0.078266,-0.107755,-0.515247,0.149812,0.245750,0.094919,4.49,0
982,96018.0,2.047246,0.249497,-1.654144,0.742906,0.219339,-1.394131,0.105874,-0.388903,2.032949,...,0.031047,0.462933,-0.003227,-0.245795,0.187775,-0.129434,-0.032335,-0.038082,7.40,0


In [8]:
# Separate the frame into feature columns (everything but the last column)
# and the ground-truth labels (the last column).
columns = list(data.columns)
features = columns[:-1]
class_labels = list(data[columns[-1]])
df = data[features]

# Number of Attributes
# df already excludes the label column, so every one of its columns is an
# attribute; subtracting 1 here would under-count by one.
num_attr = len(df.columns)

# Number of Clusters
k = 2

# Maximum number of iterations
MAX_ITER = 100

# Number of data points
n = len(df)

# Fuzzy parameter (exponent applied to the membership values)
m = 2.00


In [9]:
# Plain-text rendering of the balanced frame.
print(data)

         Time        V1        V2        V3        V4        V5        V6  \
0       406.0 -2.312227  1.951992 -1.609851  3.997906 -0.522188 -1.426545   
1       472.0 -3.043541 -3.157307  1.088463  2.288644  1.359805 -1.064823   
2      4462.0 -2.303350  1.759247 -0.359745  2.330243 -0.821628 -0.075788   
3      6986.0 -4.397974  1.358367 -2.592844  2.679787 -1.128131 -1.706536   
4      7519.0  1.234235  3.019740 -4.304597  4.732795  3.624201 -1.357746   
..        ...       ...       ...       ...       ...       ...       ...   
979    8137.0  1.158417 -0.082388  0.724182  1.006849 -0.654996 -0.265482   
980    2720.0 -1.304077 -4.952189 -1.584203  0.492046 -2.073699 -0.053731   
981  150793.0  0.037481  0.807068  0.247838 -0.602129  0.413638 -1.013318   
982   96018.0  2.047246  0.249497 -1.654144  0.742906  0.219339 -1.394131   
983  152584.0 -0.514720  1.301689 -0.426040 -0.241031  0.770752 -1.424875   

           V7        V8        V9  ...       V21       V22       V23  \
0  

In [10]:
def accuracy(cluster_labels, class_labels):
    """Score hard cluster labels against the true binary class labels.

    Two readings of the labels are evaluated:

    * index 0 -- label 1 is the positive ("yes") outcome, label 0 negative;
    * index 1 -- label 0 is the positive ("yes") outcome, label 1 negative.

    In both readings cluster label ``c`` is matched with class label ``c``,
    so the two accuracy values are always equal; only precision and recall
    differ between the readings.

    Parameters
    ----------
    cluster_labels : sequence
        Predicted cluster index (0 or 1) for every data point.
    class_labels : sequence
        True class (0 or 1) for every data point, in the same order.
        Only these two sequences are consulted; pairs are formed up to the
        length of the shorter one. Pairs containing any other label value
        are not counted.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)``. Each element is a two-item list
        of percentages ordered as described above. A metric whose
        denominator is zero (for example precision when no point received
        the positive label) is reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    # Tally the four (cluster, class) combinations in a single pass.
    both_one = both_zero = one_vs_zero = zero_vs_one = 0
    for predicted, actual in zip(cluster_labels, class_labels):
        if predicted == 1 and actual == 1:
            both_one += 1
        elif predicted == 0 and actual == 0:
            both_zero += 1
        elif predicted == 1 and actual == 0:
            one_vs_zero += 1
        elif predicted == 0 and actual == 1:
            zero_vs_one += 1

    def percent(numerator, denominator):
        # Guard the empty-denominator case so a degenerate clustering
        # (e.g. every point in one cluster) still yields a report.
        if denominator == 0:
            return 0.0
        return float(numerator) / float(denominator) * 100

    # Index 0: label 1 is positive. Index 1: label 0 is positive, which
    # simply swaps the roles of the four counters.
    tp = [both_one, both_zero]
    tn = [both_zero, both_one]
    fp = [one_vs_zero, zero_vs_one]
    fn = [zero_vs_one, one_vs_zero]

    accuracies = [percent(tp[s] + tn[s], tp[s] + tn[s] + fn[s] + fp[s]) for s in range(2)]
    precisions = [percent(tp[s], tp[s] + fp[s]) for s in range(2)]
    recalls = [percent(tp[s], tp[s] + fn[s]) for s in range(2)]

    return accuracies, precisions, recalls


Function to create Membership Matrix:

In [11]:
def initializeMembershipMatrix():
    """Create a random starting membership matrix.

    Reads the module-level ``n`` (number of rows to produce) and ``k``
    (number of weights per row).

    Returns
    -------
    list of list of float
        ``n`` rows of ``k`` weights. Each row is built from ``k`` draws of
        ``random.random()`` divided by their total, so the weights of a row
        add up to 1 (up to floating-point rounding).
    """
    rows = []
    for _ in range(n):
        draws = [random.random() for _ in range(k)]
        total = sum(draws)
        # Normalise so the row forms a set of weights summing to one.
        rows.append([draw / total for draw in draws])
    return rows


Function for calculating the cluster centers:

In [12]:
def calculateClusterCenter(membership_mat):
    """Compute every cluster centre as a membership-weighted mean of the data.

    Reads the module-level ``df`` (data rows), ``n`` (number of rows used),
    ``k`` (number of clusters) and ``m`` (exponent applied to memberships).

    Parameters
    ----------
    membership_mat : list of list of float
        One row per data point, one membership value per cluster.

    Returns
    -------
    list of list of float
        ``k`` centres. Coordinate ``d`` of centre ``j`` is
        ``sum_i(u_ij ** m * x_id) / sum_i(u_ij ** m)``.
    """
    # The data rows are the same for every cluster, so pull them out of the
    # frame once instead of calling df.iloc[i] again inside the cluster loop.
    points = df.values

    # Transpose so that entry j holds the memberships of all points in cluster j.
    cluster_mem_val = list(zip(*membership_mat))
    cluster_centers = list()
    for j in range(k):
        xraised = [e ** m for e in cluster_mem_val[j]]
        denominator = sum(xraised)
        # Weight every coordinate of every point by its raised membership.
        temp_num = [[xraised[i] * val for val in points[i]] for i in range(n)]
        # Column-wise totals give the weighted coordinate sums.
        numerator = list(map(sum, zip(*temp_num)))
        cluster_centers.append([z / denominator for z in numerator])
    return cluster_centers


In [13]:
def updateMembershipValue(membership_mat, cluster_centers):
    """Recompute the membership of every data point from its centre distances.

    Reads the module-level ``df`` (data rows), ``n`` (number of rows used),
    ``k`` (number of clusters) and ``m`` (fuzzy parameter).

    Parameters
    ----------
    membership_mat : list of list of float
        Matrix to update. It is modified in place.
    cluster_centers : list of list of float
        One coordinate list per cluster.

    Returns
    -------
    list of list of float
        The same ``membership_mat`` object, where entry ``[i][j]`` is
        ``1 / sum_c((d_ij / d_ic) ** p)`` with ``p = 2 / (m - 1)`` and
        ``d_ij`` the Euclidean distance from point ``i`` to centre ``j``.
        If a point lies exactly on one or more centres, its membership is
        split equally among those centres and set to 0 for the rest.
    """
    p = float(2/(m-1))
    for i in range(n):
        x = list(df.iloc[i])
        distances = [np.linalg.norm(list(map(operator.sub, x, cluster_centers[j]))) for j in range(k)]

        # A zero distance would make the ratio formula evaluate 0/0, which
        # yields NaN and then contaminates every later centre computation.
        # Handle the coincident case explicitly instead.
        coincident = [j for j in range(k) if distances[j] == 0]
        if coincident:
            share = 1.0 / len(coincident)
            for j in range(k):
                membership_mat[i][j] = share if j in coincident else 0.0
            continue

        for j in range(k):
            den = sum([math.pow(float(distances[j]/distances[c]), p) for c in range(k)])
            membership_mat[i][j] = float(1/den)
    return membership_mat

Function for creating clusters:

In [14]:
def getClusters(membership_mat):
    """Turn a fuzzy membership matrix into hard cluster labels.

    Parameters
    ----------
    membership_mat : list of list of float
        One row per data point, one membership value per cluster.

    Returns
    -------
    list of int
        For every row, the index of its largest membership value. When
        several clusters share the largest value, the highest index wins.
        The number of labels follows the matrix that was passed in rather
        than any module-level counter.
    """
    cluster_labels = list()
    for memberships in membership_mat:
        # Comparing (value, index) tuples makes the larger index break ties.
        _, idx = max((val, idx) for (idx, val) in enumerate(memberships))
        cluster_labels.append(idx)
    return cluster_labels


Main Function module:

In [15]:
def fuzzyCMeansClustering():
    """Run Fuzzy C-Means for a fixed number of passes.

    Starts from a random membership matrix and, for exactly ``MAX_ITER``
    passes, recomputes the cluster centres from the memberships and then
    the memberships from the centres. The final membership matrix is
    printed before returning.

    Returns
    -------
    tuple
        ``(cluster_labels, cluster_centers)``: the hard label of every data
        point derived from the final memberships, and the centres computed
        in the last pass. ``cluster_centers`` is an empty list if
        ``MAX_ITER`` is zero or negative, because no pass is executed.
    """
    # Membership Matrix
    membership_mat = initializeMembershipMatrix()
    cluster_centers = []
    # range(MAX_ITER) performs exactly MAX_ITER passes; a `<=` comparison on
    # a counter starting at 0 would perform one more than the stated maximum.
    for _ in range(MAX_ITER):
        cluster_centers = calculateClusterCenter(membership_mat)
        membership_mat = updateMembershipValue(membership_mat, cluster_centers)
    # Hard labels are only needed for the final memberships, so derive them
    # once after the loop instead of on every pass.
    cluster_labels = getClusters(membership_mat)
    print(membership_mat)
    return cluster_labels, cluster_centers


In [16]:
# Cluster the data, show the hard labels and the centres, then score the
# labels against the true classes and report the three metrics.
labels, centers = fuzzyCMeansClustering()
print(labels)
print('\n')
print(centers)

a, p, r = accuracy(labels, class_labels)
print('\n\n')
print("Accuracy = {}".format(a))
print("Precision = {}".format(p))
print("Recall = {}".format(r))

[[0.11040758049471806, 0.8895924195052819], [0.11024208385405564, 0.8897579161459443], [0.0997238222137289, 0.9002761777862711], [0.09294152980185995, 0.9070584701981401], [0.09149824701873785, 0.9085017529812621], [0.09147925710525731, 0.9085207428947427], [0.09145484695242825, 0.9085451530475717], [0.0914331479441727, 0.9085668520558272], [0.09141144828656125, 0.9085885517134387], [0.09125138329834315, 0.9087486167016569], [0.09108313597797853, 0.9089168640220215], [0.09089854882015999, 0.9091014511798401], [0.0904884444897411, 0.9095115555102588], [0.08994753475657524, 0.9100524652434248], [0.08973266997976864, 0.9102673300202313], [0.08908214900895954, 0.9109178509910405], [0.08906308536640711, 0.9109369146335928], [0.08896503017947413, 0.9110349698205259], [0.08875525863191001, 0.91124474136809], [0.08852088339679724, 0.9114791166032027], [0.0881309725634519, 0.9118690274365482], [0.08799185778751575, 0.9120081422124844], [0.08780086472973246, 0.9121991352702675], [0.0877790330674