# Penguin Classifier

In [1]:
import numpy as np
import pandas as pd 

from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

import pennylane as qml
from pennylane.templates import AngleEmbedding, IQPEmbedding, AmplitudeEmbedding

from imblearn.over_sampling import SMOTE
oversampler = SMOTE(random_state=42)

np.random.seed(42)



## Load and Process the Dataset

In [9]:
data = pd.read_csv("penguins_size.csv")
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


### Handle Missing Values

In [10]:
data.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [11]:
# Replace every missing value with the most frequent value of its column.
# The imputer is fitted on the whole frame, so the numeric measurement columns
# are filled with their mode as well, not only the categorical `sex` column.
imputer = SimpleImputer(strategy='most_frequent')
# Assign through iloc so the existing DataFrame object and its column labels
# are kept and only the cell values are overwritten.
data.iloc[:,:] = imputer.fit_transform(data)

In [12]:
data.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

### Make 'sex', 'island' and 'species' attributes integers

In [13]:
# Encode the categorical `sex` column as integer class labels.
# NOTE: this same LabelEncoder instance is re-fitted on `species` and `island`
# in the following cells, so it cannot be used afterwards to map the `sex`
# codes back to their original strings.
lb = LabelEncoder()

data["sex"] = lb.fit_transform(data["sex"])
# preview the first five encoded values
data['sex'][:5]

0    2
1    1
2    1
3    2
4    1
Name: sex, dtype: int64

In [14]:
data["species"] = lb.fit_transform(data["species"])
data["species"][:200]

0      0
1      0
2      0
3      0
4      0
      ..
195    1
196    1
197    1
198    1
199    1
Name: species, Length: 200, dtype: int64

In [15]:
data["island"] = lb.fit_transform(data["island"])
data["island"][:200]

0      2
1      2
2      2
3      2
4      2
      ..
195    1
196    1
197    1
198    1
199    1
Name: island, Length: 200, dtype: int64

### Balance the dataset

In [16]:
data['species'].value_counts()

0    152
2    124
1     68
Name: species, dtype: int64

In [17]:
X = data.drop(['species'],axis=1)
y = data['species']

In [18]:
# Oversample with SMOTE so that the species classes are balanced.
# NOTE(review): the resampling is done once on the full dataset, before the
# cross-validation splits used later are made, so a synthetic point and the
# rows it was generated from can land in different folds -- the reported
# scores may be optimistic; confirm whether this is acceptable.
X,y = oversampler.fit_resample(X,y)

# convert to plain numpy arrays, which the kernel helpers below index by row
X = np.array(X)
y = np.array(y)

In [19]:
print(X.shape,y.shape)

(456, 6) (456,)


## Classification with Analytic Expressions

In [20]:
def kernel_matrix(X,kernel):
    """Return the symmetric kernel (Gram) matrix of a dataset.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
        Dataset whose rows are the data points.
    kernel : callable
        Function ``kernel(x, z)`` taking two rows of ``X`` and returning a
        scalar kernel value.

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_samples)
        Matrix whose entry ``[i, j]`` is ``kernel(X[i], X[j])``.

    Notes
    -----
    The kernel is evaluated only for ``j >= i`` (upper triangle, diagonal
    included) and each value is mirrored to ``[j, i]``, so a symmetric kernel
    is assumed and ``kernel`` is called ``n_samples * (n_samples + 1) / 2``
    times.
    """
    n_samples = X.shape[0]
    gram_matrix = np.zeros([n_samples, n_samples])

    # Walk the upper triangle directly instead of generating and filtering
    # all n_samples**2 index pairs.
    for i in range(n_samples):
        for j in range(i, n_samples):
            value = kernel(X[i], X[j])
            gram_matrix[i, j] = value
            # mirror into the lower triangle (the matrix is square, so the
            # transposed index is always in range)
            gram_matrix[j, i] = value

    return gram_matrix

In [21]:
# uses the analytic expressions that we have derived for the kernels to calculate the kernel function on two data points 

# kernel that arises from amplitude encoding
def analytic_linear_kernel(x,z,d=1):
    """Closed-form kernel for amplitude encoding.

    Both inputs are rescaled to unit Euclidean length; the absolute value of
    their inner product is then raised to the power ``2*d``.

    Parameters
    ----------
    x, z : numpy.ndarray
        The two data points.
    d : int, optional
        Exponent multiplier (default 1).

    Returns
    -------
    The kernel value ``|<x/|x|, z/|z|>|**(2*d)``.
    """
    unit_x = x / np.linalg.norm(x)
    unit_z = z / np.linalg.norm(z)
    overlap = np.inner(unit_x, unit_z)

    return np.abs(overlap) ** (2 * d)

<span style="color:red">Write a function called analytic_cosine_kernel that computes the analytic expression for the cosine kernel. This function must accept two parameters, x and z, just like the analytic_linear_kernel function, and should output the kernel value.</span>

In [None]:
def analytic_cosine_kernel(x,z):
    """Closed-form kernel for rotation (angle) encoding.

    Computes ``prod_i cos(x_i - z_i)**2``, the squared overlap of two product
    states in which feature ``i`` is encoded on its own qubit as
    ``cos(x_i)|0> + sin(x_i)|1>``. This is the analytic counterpart of the
    simulated ``cosine_kernel`` circuit in this notebook, which embeds ``2*x``
    with Y rotations.

    Parameters
    ----------
    x, z : array-like, shape (n_features,)
        The two data points (angles in radians).

    Returns
    -------
    float
        Kernel value in the range [0, 1].
    """
    # cast to float so object-dtype rows (e.g. coming from a mixed-type
    # DataFrame) can be passed to np.cos
    diff = np.asarray(x, dtype=float) - np.asarray(z, dtype=float)
    return np.prod(np.cos(diff) ** 2)

### Classification with Kernel that arises from Amplitude Encoding

In [22]:
# scale the dataset to have zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [24]:
# NOTE(review): the Gram matrix is built from the raw `X`; the standardized
# `X_scaled` computed in the previous cell is not used here -- confirm that
# this is intended.
linear_kernel_matrix = kernel_matrix(X,analytic_linear_kernel)

In [25]:
clf = svm.SVC(kernel='precomputed')

# we use 5-fold cross validation 
test_scores = cross_validate(clf,linear_kernel_matrix,y,cv=5)['test_score']
test_scores

array([0.32608696, 0.52747253, 0.49450549, 0.51648352, 0.37362637])

### Classification with Kernel that arises from Rotation Encoding

In [27]:
# Four candidate preprocessings of X for the rotation-encoding kernel.
# variant 1: standard scaling only
scaler_1 = StandardScaler()
X_scaled_1 = scaler_1.fit_transform(X)

# variant 2: standard scaling, then min-max scaling with feature_range (0, 2*pi)
scaler_2 = MinMaxScaler(feature_range=(0,2*np.pi))
X_scaled_2 = scaler_2.fit_transform(X_scaled_1)

# variant 3: min-max scaling only
# (fit_transform re-fits scaler_2, replacing its earlier fit on X_scaled_1)
X_scaled_3 = scaler_2.fit_transform(X)

# variant 4: min-max scaling, then standard scaling -- this is the variant
# passed to kernel_matrix in the cells below
# (fit_transform re-fits scaler_1, replacing its earlier fit on X)
X_scaled_4 = scaler_1.fit_transform(X_scaled_3)

In [30]:
cosine_kernel_matrix = kernel_matrix(X_scaled_4,analytic_cosine_kernel)

In [31]:
clf = svm.SVC(kernel='precomputed')

# we use 5-fold cross validation 
test_scores = cross_validate(clf,cosine_kernel_matrix,y,cv=5)['test_score']
test_scores

array([0.94565217, 0.97802198, 0.96703297, 0.97802198, 0.98901099])

## Classification by Simulating Circuits

### Classification with Kernel that arises from Rotation Encoding

In [32]:
# number of wires used by the embedding; equals the 6 feature columns of X
n_qubits = 6

dev_kernel = qml.device("default.qubit", wires=n_qubits)

# matrix with a single 1 in entry [0, 0], i.e. the projector onto the first
# computational basis state (all qubits in |0>)
projector = np.zeros((2**n_qubits, 2**n_qubits))
projector[0, 0] = 1

@qml.qnode(dev_kernel)
def cosine_kernel(x1, x2):
    """Quantum kernel for rotation encoding.

    Embeds ``x1`` with Y rotations, applies the adjoint of the same embedding
    for ``x2``, and returns the expectation value of the projector onto the
    all-zeros state.
    """
    # the features are multiplied by 2 to compensate for the factor 1/2 in the
    # definition of the RY gate used by pennylane
    AngleEmbedding(x1*2, wires=range(n_qubits),rotation="Y")
    # adjoint returns the complex conjugate transpose of the angle embedding operator
    # (the same factor of 2 is applied to x2)
    # can also use "X" or "Z" rotation
    qml.adjoint(AngleEmbedding)(x2*2, wires=range(n_qubits),rotation="Y")
    # returns the expectation value of the projector
    return qml.expval(qml.Hermitian(projector, wires=range(n_qubits)))

In [34]:
analytic_cosine_kernel(X[1],X[0])

0.0013277647373476492

In [33]:
# see how long it takes to execute the kernel for two data points
cosine_kernel(X[1],X[0])

tensor(0.00132776, requires_grad=True)

In [23]:
# cosine_kernel_matrix = kernel_matrix(X_scaled_4,cosine_kernel)
# np.savetxt("cosine_kernel_matrix.csv", cosine_kernel_matrix, delimiter=",")

In [35]:
cosine_kernel_matrix = np.genfromtxt("cosine_kernel_matrix.csv", delimiter=",")

In [36]:
clf = svm.SVC(kernel='precomputed')

# we use 5-fold cross validation 
test_scores = cross_validate(clf,cosine_kernel_matrix,y,cv=5)['test_score']
test_scores

array([0.94565217, 0.97802198, 0.96703297, 0.97802198, 0.98901099])

### Classification with Kernel that arises from Amplitude Encoding

In [37]:
# Amplitude encoding needs a feature vector whose length is a power of two,
# so each row of X is padded with trailing zeros up to the next power of two
# (6 -> 8 features here). The row count and pad width are derived from X
# rather than hard-coded, so the cell keeps working if the dataset changes.
n_pad = (1 << (X.shape[1] - 1).bit_length()) - X.shape[1]
X_padded = np.hstack([X, np.zeros((X.shape[0], n_pad))])

In [None]:
X_padded.shape

<span style="color:red">Write some code to implement the kernel that arises from amplitude embedding. The function that computes the kernel must be called linear_kernel. </span>

In [None]:
# write code here

In [29]:
# see how long it takes to execute the kernel for two data points
linear_kernel(X_padded[0],X_padded[0])

tensor(1., requires_grad=True)

In [30]:
# linear_kernel_matrix = kernel_matrix(X_padded,linear_kernel)
# np.savetxt("linear_kernel_matrix.csv", linear_kernel_matrix, delimiter=",")

In [31]:
linear_kernel_matrix = np.genfromtxt("linear_kernel_matrix.csv", delimiter=",")

In [32]:
clf = svm.SVC(kernel='precomputed')

# we use 5-fold cross validation 
test_scores = cross_validate(clf,linear_kernel_matrix,y,cv=5)['test_score']
test_scores

array([0.32608696, 0.52747253, 0.49450549, 0.51648352, 0.37362637])

### Classification with Kernel that arises from IQP Encoding

<span style="color:red">Write some code to implement the kernel that arises from IQP encoding. The function that computes the kernel must be called IQP_kernel.</span>

In [34]:
# Min-max scaling with feature_range (-1, 1) for the IQP-encoding kernel.
# NOTE: this rebinds `scaler`, which held a StandardScaler in an earlier cell.
scaler = MinMaxScaler(feature_range=(-1,1))
# variant 5: min-max scaling of the raw features
X_scaled_5 = scaler.fit_transform(X)

# variant 6: min-max scaling of the standardized features (the scaler is
# re-fitted here); this is the variant passed to IQP_kernel below
X_scaled_6 = scaler.fit_transform(X_scaled_1)

In [None]:
# write code here 

In [35]:
IQP_kernel(X_scaled_6[0],X_scaled_6[1])

tensor(0.01894141, requires_grad=True)

In [36]:
# iqp_kernel_matrix = kernel_matrix(X_scaled_6,IQP_kernel)
# np.savetxt("iqp_kernel_matrix.csv", iqp_kernel_matrix, delimiter=",")

In [37]:
iqp_kernel_matrix = np.genfromtxt("iqp_kernel_matrix.csv", delimiter=",")

In [38]:
clf = svm.SVC(kernel='precomputed')

# we use 5-fold cross validation 
test_scores = cross_validate(clf,iqp_kernel_matrix,y,cv=5)['test_score']
test_scores

array([0.9673913 , 0.98901099, 0.98901099, 0.95604396, 0.98901099])