# Basic Modules


In [4]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
from scipy.stats import chi2
from mpl_toolkits.mplot3d import Axes3D
import itertools
%matplotlib inline

def Normalization(Data):
    Mean1 = np.mean(Data, axis = 0)
    Std1  = np.std(Data, axis = 0)
    return (Data-Mean1)/Std1

def ReturnDataFrame(path):
        return pd.read_csv(path, sep=',',skipinitialspace=True)  

def MahalonobisDetection(Data, alpha):
    Data = Data - np.mean(Data, axis = 0)
    n1,n2 = Data.shape
    Cov = (1/float(n1-1))*Data.T*Data
    M = np.zeros(n1)
    for i in range(0,n1):
        M[i] = Data[i,:]*np.linalg.inv(Cov)*Data.T[:,i]
    c = chi2.isf(alpha,n2) 
    return  M, c , Cov

    
def PCA(NData):
    NDataMean = NData - np.mean(NData,axis = 0)

    n1 , n2 = NDataMean.shape

    NCov = (NDataMean.T)*NDataMean
    NCov = (1/float(n1-1))*NCov
    NEigenvaluesc, NEigenvectorsc = np.linalg.eigh(NCov) 
    idx = NEigenvaluesc.argsort()[::-1]  
    NEigenvaluesc = NEigenvaluesc[idx]
    NEigenvectorsc  =  NEigenvectorsc [:,idx]
    return NEigenvaluesc, NEigenvectorsc

def SelectingBestSubset2class(Data, nfeat, fmask,mmask):
    
    t1 , t2 = Data.shape
    
    C1 = np.asmatrix(Data[fmask,:])
    C2 = np.asmatrix(Data[mmask,:])
    n1, dummy = C1.shape
    n2, dummy = C2.shape    
    
    P1 = float(n1)/float(t1)
    P2 = float(n2)/float(t1)
    
    Flag = True 
    
    L1   = range(t2)
    
    t2 = t2 - 1
    
    J = -100000.0
    
    while(Flag):
        
        #for i in range(t2)
        #    p1 = [ k for k in range(t2) : if i != k ] 
        p1 = list(itertools.combinations(L1,t2))
        print(len(p1))
        for j in p1:
            TData = Data[:,j]
            C1 = np.asmatrix(TData[fmask,:])
            C2 = np.asmatrix(TData[mmask,:]) 
            Cov1 = (1/float(n1-1))*np.dot(C1.T,C1)
            Cov2 = (1/float(n2-1))*np.dot(C2.T,C2)         
            Sw = P1*Cov1+P2*Cov2
            m1 = (1/float(n1))*np.sum(C1,axis = 0)
            m2 = (1/float(n2))*np.sum(C2,axis = 0)
            m0 = P1*m1+P2*m2
            Sm = (1/float(t1-1))*np.dot((TData - m0).T,(TData-m0))
            
            Jt = np.trace(Sm)/np.trace(Sw)
            
            if (Jt > J):
                print(L1)
                J = Jt
                L1 = j
                
        if (t2 == nfeat):
            Flag = False
            print('The selected features ')
            print(L1)
            print('J value for selection '+str(J))

        t2 = t2-1
         
    return L1, J

def kmeans(Data,centroids,error):
    lbelong = []
    x1,x2 = Data.shape
    y1,y2 = centroids.shape
    oldcentroids = np.matrix(np.random.random_sample((y1,y2)))
    # Loop for the epochs
    # This allows to control the error
    trace = [];
    while ( np.sqrt(np.sum(np.power(oldcentroids-centroids,2)))>error):
        # Loop for the Data
        for i in range(0,x2):
            dist = []
            point = Data[:,i]
            #loop for the centroids
            for j in range(0, y2):
                centroid = centroids[:,j]
                dist.append(np.sqrt(np.sum(np.power(point-centroid,2))))
            lbelong.append(dist.index(min(dist)))        
        oldcentroids = centroids
        trace.append(centroids)
        
        #Update centroids     
        for j in range(0, y2):
            indexc = [i for i,val in enumerate(lbelong) if val==(j)]
            Datac = Data[:,indexc]
            print(len(indexc))
            if (len(indexc)>0):
                centroids[:,j]= Datac.sum(axis=1)/len(indexc)
    return centroids, lbelong, trace

def LinearRegression(Class1, Class2):
    # Generate the X
    n1, dummy = Class1.shape
    n2, dummy = Class2.shape

    C1 = np.hstack((np.ones((n1,1)),Class1))
    C2 = np.hstack((np.ones((n2,1)),Class2))
    X = np.matrix(np.vstack((C1,C2)))
    # Get the label array
    y = np.matrix(np.vstack((np.ones((n1,1)),-np.ones((n2,1)))))

    # Finally get the w for the decision surface
    w = np.linalg.inv((np.transpose(X)*X))*np.transpose(X)*y    
    
    return X[0:n1,:]*w, X[n1:n1+n2,:]*w



if __name__ == '__main__':
  # Load CVS
  Path1 = 'voice.csv'
  DataMatrix = ReturnDataFrame(Path1)
  
  DataMatrix.replace({'male': -1.0, 'female': 1.0},
                      inplace=True)
  
  DataLabels = DataMatrix['label']
  
  DataMatrix.drop('label', axis=1, inplace=True)
  
  
  # Transform to an NP Array
  Data = DataMatrix.values
  Label = DataLabels.values
  
  fmask = (Label == 1.0)
  mmask = (Label == -1.0)
  
  # Normalize your Data # 
  NData = np.array(Normalization(Data))
  #NData = np.asmatrix(Data)
  
  
  # Select Best Features
  nfeat = 4
  L1  , J = SelectingBestSubset2class(NData, nfeat, fmask,mmask)
  
  # Select The Best 
  #BNData = NData[:,L1]
  
  ### Apply the Eigenvalues
  #Eigv, Eig = PCA(BNData)
  #
  #idx = Eigv.argsort()[::-1]   
  #
  #Eigv = Eigv[idx]
  #Eig = Eig[:,idx]
  
  #NP1 =  np.transpose(Eig)
  #
  #PBNData = (NP1*BNData.T).T
  #
  #Class1 = PBNData[fmask,:]
  #Class2 = PBNData[mmask,:]
  
  
  ## Detect The Outliers and Remove Them # 
  
  #alpha = 0.05
  
  #M1, c1 , cov1 = MahalonobisDetection(Class1, alpha)
  #M2, c2 , cov2 = MahalonobisDetection(Class2, alpha)
  
  #Class1 = Class1[(M1<c1),:]
  #Class2 = Class2[(M2<c2),:] 
 
  #Classification1, Classification2 =  LinearRegression(Class1, Class2)
  


20
range(0, 20)
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19)
19
171
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19)
17
136
680
2380
6188
12376
19448
24310
24310
19448
12376
6188
2380
The selected features 
(0, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19)
J value for selection 0.999684243764
