In [2]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
dataset = load_dataset("PiC/phrase_similarity")
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/divyansh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
training_data = dataset['train']
validation_data = dataset['validation']
test_data = dataset['test']

with open('GloVe/vectors.txt', 'r') as f:
    vectors=f.readlines()
import numpy as np
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    

training_vectors=[]
def remove_stopwords(sentence):
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)


def get_avg(phrase):

    words = phrase.split()
    avg = [0]*50
    for word in words:
        for vector in vectors:
            if word in vector:
                vector = vector.split()
                for i in range(50):
                    avg[i] += float(vector[i+1])
                break
    for i in range(50):
        avg[i] /= len(words)
    return avg

In [4]:

for i in range(len(training_data)):
    training_data[i]['phrase1'] = training_data[i]['phrase1'].lower()
    training_data[i]['phrase2'] = training_data[i]['phrase2'].lower()
    
    training_data[i]['phrase1'] = remove_stopwords(training_data[i]['phrase1'])
    training_data[i]['phrase2'] = remove_stopwords(training_data[i]['phrase2'])
    
    avg1=get_avg(training_data[i]['phrase1'])
    avg2=get_avg(training_data[i]['phrase2'])
    
    f1 = all(x == 0 for x in avg1)
    f2 = all(x == 0 for x in avg2)
    if f1 or f2:
        continue
    else:
        training_vectors.append([avg1,avg2,training_data[i]['label']])

   

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
C_values = [0.01, 0.1, 1, 10, 100]

model = LogisticRegression(C=1)

l1 = np.array([vec[0] for vec in training_vectors])
l2 = np.array([vec[1] for vec in training_vectors])

X = np.concatenate((l1,l2),axis=1)
y = np.array([vec[2] for vec in training_vectors])   

    ##preprocess X
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
    
scaler = StandardScaler()
X = scaler.fit_transform(X)
    
cov_matrix = np.cov(X, rowvar=False)
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
idx = eigenvalues.argsort()[::-1]
eigenvectors = eigenvectors[:,idx]
eigenvalues = eigenvalues[idx]

    
n_components = 3
pca = PCA(n_components=n_components)
X = pca.fit_transform(X)


In [6]:
model.fit(X, y)
validation_vectors=[]   
for i in range(len(validation_data)):
    validation_data[i]['phrase1'] = validation_data[i]['phrase1'].lower()
    validation_data[i]['phrase2'] = validation_data[i]['phrase2'].lower()
        
    validation_data[i]['phrase1'] = remove_stopwords(validation_data[i]['phrase1'])
    validation_data[i]['phrase2'] = remove_stopwords(validation_data[i]['phrase2'])
        
    avg1=get_avg(validation_data[i]['phrase1'])
    avg2=get_avg(validation_data[i]['phrase2'])
        
    f1 = all(x == 0 for x in avg1)
    f2 = all(x == 0 for x in avg2)
    if f1 or f2:
        continue
    else:
        validation_vectors.append([avg1,avg2,validation_data[i]['label']])

l1 = np.array([vec[0] for vec in validation_vectors])
l2 = np.array([vec[1] for vec in validation_vectors])

X_v = np.concatenate((l1,l2),axis=1)

y_v = np.array([vec[2] for vec in validation_vectors])  
X_v = (X_v - mean) / std

X_v = scaler.transform(X_v)
X_v = pca.transform(X_v)

y_pred = model.predict(X_v)

accuracy = accuracy_score(y_v, y_pred)

print(f"C=1, accuracy={accuracy}")

C=1, accuracy=0.5020242914979757


In [7]:
from sklearn.svm import SVC

model_svm = SVC(kernel='linear', C=1)

model_svm.fit(X, y)

y_pred = model_svm.predict(X_v)

accuracy = accuracy_score(y_v, y_pred)

print(f"SVM, accuracy={accuracy}")


SVM, accuracy=0.5030364372469636


In [8]:
test_vectors=[]

for i in range(len(test_data)):
    test_data[i]['phrase1'] = test_data[i]['phrase1'].lower()
    test_data[i]['phrase2'] = test_data[i]['phrase2'].lower()
        
    test_data[i]['phrase1'] = remove_stopwords(test_data[i]['phrase1'])
    test_data[i]['phrase2'] = remove_stopwords(test_data[i]['phrase2'])
        
    avg1=get_avg(test_data[i]['phrase1'])
    avg2=get_avg(test_data[i]['phrase2'])
        
    f1 = all(x == 0 for x in avg1)
    f2 = all(x == 0 for x in avg2)
    if f1 or f2:
        continue
    else:
        test_vectors.append([avg1,avg2,test_data[i]['label']])
        
l1 = np.array([vec[0] for vec in test_vectors])
l2 = np.array([vec[1] for vec in test_vectors])

X_t = np.concatenate((l1,l2),axis=1)

y_t = np.array([vec[2] for vec in test_vectors])

X_t = (X_t - mean) / std

X_t = scaler.transform(X_t)
X_t = pca.transform(X_t)

y_pred = model.predict(X_t)
y_pred_two = model_svm.predict(X_t)
accuracy = accuracy_score(y_t, y_pred)
accuracy_two = accuracy_score(y_t, y_pred_two)

print(f"Logistic Regression, accuracy={accuracy}")
print(f"SVM, accuracy={accuracy_two}")

Logistic Regression, accuracy=0.5017685699848409
SVM, accuracy=0.5032844871147044


In [9]:
dicts={}

for i in range(len(test_data)):
    feature = cosine_similarity(get_avg(test_data[i]['phrase1']),get_avg(test_data[i]['phrase2']))
    dicts[(test_data[i]['phrase1'],test_data[i]['phrase2'])]=feature
    


  return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


In [10]:
sorted_dict = sorted(dicts.items(), key=lambda x: x[1])

# Print the first element (highest value)
print("Highest Value:", sorted_dict[-1])

# Print the last element (lowest value)
print("Lowest Value:", sorted_dict[0])

Highest Value: (('general service', 'General Service'), nan)
Lowest Value: (('open front', 'unobstructed forward-facing'), -0.15442479159477063)
