In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pennylane as qml
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
# Location of the IMDB reviews CSV. The original absolute path is kept as the
# default so existing runs behave exactly as before; set the IMDB_DATASET_CSV
# environment variable to point at the file on any other machine.
DATA_PATH = os.environ.get(
    "IMDB_DATASET_CSV",
    "C:/Users/JOSHWIN ISAC/OneDrive/Documents/GitHub/Quantum-Computing-project/Quantum_NLP/IMDB_dataset.csv",
)
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(50000, 2)

In [5]:
# Count missing values per column.
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
# Encode the string sentiment labels as integers; in the preview that follows,
# 'positive' rows show 1 and 'negative' rows show 0.
# NOTE(review): data['sentiment'] is overwritten in place, so re-running this cell
# refits the encoder on the already-encoded integers instead of the original strings.
label_encoder=LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])

In [7]:
# Preview again to check that the sentiment column now holds integer codes.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [7]:
import re
import nltk
# Fetch the NLTK 'stopwords' corpus read by remove_stopwords (network access on first run).
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK 'wordnet' data; presumably required by the WordNetLemmatizer used in lemmatize_text.
nltk.download('wordnet')
import string

[nltk_data] Downloading package stopwords to C:\Users\JOSHWIN
[nltk_data]     ISAC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\JOSHWIN
[nltk_data]     ISAC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Lower-case the text
def to_lower(text):
  """Return ``text`` with its cased characters converted to lower case."""
  lowered = text.lower()
  return lowered

# Strip punctuation
def remove_punc(text):
  """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
  # Mapping a character to None in a translation table deletes it.
  deletions = str.maketrans(dict.fromkeys(string.punctuation))
  return text.translate(deletions)

# Remove stopwords
# The English stop-word set is loaded on first use and then reused, instead of
# re-reading the corpus and rebuilding the set for every review.
_STOPWORD_CACHE = {}

def remove_stopwords(text):
  """Drop English stop words from ``text``.

  ``text`` is split on whitespace, tokens found in NLTK's English stop-word
  list are discarded, and the remaining tokens are joined with single spaces.
  Matching is exact and case-sensitive, so callers should lower-case first.
  """
  stop_words = _STOPWORD_CACHE.get('english')
  if stop_words is None:
    stop_words = frozenset(stopwords.words('english'))
    _STOPWORD_CACHE['english'] = stop_words
  return ' '.join(word for word in text.split() if word not in stop_words)

# Strip digits
def remove_numbers(text):
  """Return ``text`` with every run of digits (regex ``\\d+``) removed.

  Only the digits are deleted; surrounding whitespace is left as it was.
  """
  return re.sub(pattern=r'\d+', repl='', string=text)

# Lemmatize text
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is passed to the module-level ``lemmatizer.lemmatize`` with its
    default arguments, and the results are joined with single spaces (so runs
    of whitespace in the input collapse to one space).
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# Strip leftover symbols
def remove_symbols(text):
  """Return ``text`` keeping only word characters (``\\w``) and whitespace (``\\s``)."""
  non_word_or_space = re.compile(r'[^\w\s]')
  return non_word_or_space.sub('', text)

In [9]:
def preprocess_text(text):
  """Run one review through the full cleaning pipeline and return the result.

  The steps are applied in this fixed order: lower-casing, punctuation removal,
  stop-word removal, digit removal, lemmatization, then removal of any
  remaining non-word, non-space symbols.
  """
  pipeline = (
    to_lower,
    remove_punc,
    remove_stopwords,
    remove_numbers,
    lemmatize_text,
    remove_symbols,
  )
  for step in pipeline:
    text = step(text)
  return text

# Call preprocess_text on every review.
# NOTE(review): the cleaned text is written back over data['review'], so the raw
# reviews are no longer available after this cell, and re-running the cell cleans
# the already-cleaned text again.
data['review']=data['review'].apply(preprocess_text)
data.head(5)

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode you...,1
1,wonderful little production br br filming tech...,1
2,thought wonderful way spend time hot summer we...,1
3,basically there family little boy jake think t...,0
4,petter matteis love time money visually stunni...,1


In [10]:
# Add the cleaned reviews to the DataFrame as a separate column.
# NOTE(review): data['review'] has already been through preprocess_text, so this
# is a second pass of every step except remove_symbols. It is not a no-op: in the
# preview, row 3 loses the token "there" that the first pass left behind.
data['Cleaned_reviews'] = (
    data['review']
    .apply(to_lower)
    .apply(remove_punc)
    .apply(remove_stopwords)
    .apply(remove_numbers)
    .apply(lemmatize_text)
)
data.head(5)

Unnamed: 0,review,sentiment,Cleaned_reviews
0,one reviewer mentioned watching oz episode you...,1,one reviewer mentioned watching oz episode you...
1,wonderful little production br br filming tech...,1,wonderful little production br br filming tech...
2,thought wonderful way spend time hot summer we...,1,thought wonderful way spend time hot summer we...
3,basically there family little boy jake think t...,0,basically family little boy jake think zombie ...
4,petter matteis love time money visually stunni...,1,petter matteis love time money visually stunni...


In [11]:
# Keep only 4 TF-IDF terms: each feature column becomes one input dimension of
# the quantum feature map built later, so the count is deliberately tiny.
vectorizer = TfidfVectorizer(max_features=4)
features = vectorizer.fit_transform(data['Cleaned_reviews']).toarray()
y = data['sentiment']

# Normalize features for quantum compatibility: scale every row to unit L2 norm
# in a single vectorised step. Rows whose norm is 0 (none of the kept terms
# occur in the review) stay all-zero rather than being divided by zero.
# NOTE(review): TfidfVectorizer's default norm setting presumably already yields
# unit-length rows, which would make this a safeguard only - confirm.
row_norms = np.linalg.norm(features, axis=1, keepdims=True)
features = np.divide(
    features,
    row_norms,
    out=np.zeros_like(features),
    where=row_norms > 0,
)
print(features)

# Get the labels as a plain NumPy array
labels = data['sentiment'].values

[[0.94671192 0.         0.         0.32208158]
 [0.98583868 0.         0.         0.16769646]
 [0.89072146 0.         0.         0.45454953]
 ...
 [0.90471659 0.         0.42601396 0.        ]
 [0.         0.24029925 0.         0.97069886]
 [0.10498244 0.         0.98868493 0.10714846]]


In [12]:
labels

array([1, 1, 1, ..., 0, 0, 0])

In [13]:
import qiskit_code_assistant_jupyterlab
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit_machine_learning.state_fidelities import ComputeUncompute
from qiskit_machine_learning.algorithms import QSVC
from qiskit import QuantumCircuit
from qiskit.primitives import Sampler
from qiskit.circuit.library import ZZFeatureMap

In [14]:
# Split dataset: hold out 20% of the rows for testing; the fixed random_state
# keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)


In [15]:
# Define the quantum feature map using a predefined library circuit
num_features = features.shape[1]  # Number of feature columns in the TF-IDF matrix
# feature_dimension is tied to the number of feature columns; reps=2 requests two
# repetitions of the encoding block (see the ZZFeatureMap documentation).
feature_map = ZZFeatureMap(feature_dimension=num_features, reps=2)


In [None]:
from sklearn.metrics import accuracy_score

# A fidelity kernel is evaluated pairwise, so the work grows quadratically with
# the number of samples. The training split has 40,000 rows (80% of 50,000),
# which means a 40,000 x 40,000 kernel matrix; the saved notebook shows no
# result for this cell when it was run on the full split.
# Cap the rows used here. The split is already shuffled, so taking the first
# rows is a random subset. Set either limit to None to use every row.
QSVC_MAX_TRAIN = 200
QSVC_MAX_TEST = 50

X_train_qsvc, y_train_qsvc = X_train[:QSVC_MAX_TRAIN], y_train[:QSVC_MAX_TRAIN]
X_test_qsvc, y_test_qsvc = X_test[:QSVC_MAX_TEST], y_test[:QSVC_MAX_TEST]

# Initialize the quantum circuit simulator
# NOTE(review): the saved output flags this line and the ComputeUncompute line
# with a warning - presumably a deprecation of this Sampler class; confirm
# against the installed qiskit / qiskit-machine-learning versions.
sampler = Sampler()

# Create Fidelity Quantum Kernel
quantum_kernel = FidelityQuantumKernel(
    fidelity=ComputeUncompute(sampler=sampler),
    feature_map=feature_map
)

# Define QSVC model
qsvc = QSVC(quantum_kernel=quantum_kernel)

# Train QSVC on the capped training subset
qsvc.fit(X_train_qsvc, y_train_qsvc)

# Test QSVC on the capped test subset
y_pred = qsvc.predict(X_test_qsvc)
accuracy = accuracy_score(y_test_qsvc, y_pred)
print(f"QSVC Accuracy: {accuracy * 100:.2f}%")


  sampler = Sampler()
  fidelity=ComputeUncompute(sampler=sampler),


In [None]:
# # Initialize a quantum simulator device
# n_qubits=4
# dev = qml.device('default.qubit', wires=n_qubits)  # Number of qubits depends on your feature size

# # Quantum circuit using QNode decorator
# @qml.qnode(dev)
# def quantum_circuit(feature_vector):
#     # Encode features into quantum states (using RY rotations)
#     for i, feature in enumerate(feature_vector):
#         qml.RY(feature, wires=i)  # Apply RY rotation for each feature
#     return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]  # Measure Pauli-Z expectation values

# # Quantum feature extraction function
# def quantum_features(data):
#     return np.array([quantum_circuit(f) for f in data])  # Apply quantum circuit for each data point

In [None]:
# X_train,X_test,y_train,y_test=train_test_split(features,labels,test_size=0.2, random_state=42)

# # Extract quantum features
# X_train_quantum = quantum_features(X_train)
# X_test_quantum = quantum_features(X_test)

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

# classifier =LogisticRegression()
# classifier.fit(X_train,y_train)

# # Evaluate the model
# predictions = classifier.predict(X_test)
# accuracy = accuracy_score(y_test, predictions)
# print(f"Accuracy (Classical Model): {accuracy}")

Accuracy (Classical Model): 0.7384


In [None]:
# from qiskit.circuit.library import ZZFeatureMap
# from qiskit.primitives import Sampler
# from qiskit_machine_learning.state_fidelities import ComputeUncompute
# from qiskit_machine_learning.kernels import FidelityQuantumKernel
# from qiskit_aer import AerSimulator

In [None]:
# # Define the feature map
# n_qubits = 4
# feature_map = ZZFeatureMap(feature_dimension=n_qubits, reps=2, entanglement='linear')

# # Initialize AerSimulator
# simulator = AerSimulator()

In [None]:
# def compute_kernel(x1, x2):
#     # Circuit for the first data point
#     qc1 = QuantumCircuit(n_qubits)
#     qc1.compose(feature_map.bind_parameters(x1), inplace=True)
    
#     # Circuit for the second data point
#     qc2 = QuantumCircuit(n_qubits)
#     qc2.compose(feature_map.bind_parameters(x2), inplace=True)
    
#     # Run the simulation
#     result1 = simulator.run(qc1).result()
#     result2 = simulator.run(qc2).result()
    
#     # Get statevectors
#     state1 = result1.get_statevector(qc1)
#     state2 = result2.get_statevector(qc2)
    
#     # Compute overlap
#     return np.abs(np.dot(np.conj(state1), state2)) ** 2


In [None]:
# from sklearn.svm import SVC

# from sklearn.metrics import accuracy_score
# SVM_classifier=SVC()
# SVM_classifier.fit(X_train_quantum,y_train)


# prediction_SVM=SVM_classifier.predict(X_test_quantum)
# accuracy_SVM=accuracy_score(y_test,prediction_SVM)

# print(f"Accuracy (Classical Model): {accuracy_SVM}")

Accuracy (Classical Model): 0.5525
