In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import utils as utils
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brendanbrady/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the full labelled training CSV and keep a random 10% of its rows.
# sample(frac=0.1) is a subsample (returned in random order), not a plain
# shuffle of the whole file; the fixed seed keeps the draw reproducible.
sampled_df = pd.read_csv('text_entailment_dataset/train.csv').sample(frac=0.1, random_state=42)

# Hold out 10% of that subsample for validation.
train_df, validation_df = train_test_split(sampled_df, test_size=0.1, random_state=42)

# Persist both splits; index=False keeps the sampled row labels out of the files.
for split_df, split_path in (
    (train_df, 'text_entailment_dataset/train_data.csv'),
    (validation_df, 'text_entailment_dataset/validation_data.csv'),
):
    split_df.to_csv(split_path, index=False)

# Reload the splits (and the test file) from disk so every frame gets a fresh
# default index. `df` ends up bound to the test frame, exactly as before.
train_dataset = pd.read_csv('text_entailment_dataset/train_data.csv')
validation_dataset = pd.read_csv('text_entailment_dataset/validation_data.csv')
test_dataset = df = pd.read_csv('text_entailment_dataset/test_data.csv')

In [3]:
# Preview the first rows of the training split (columns: premise, hypothesis, label).
train_dataset.head()

Unnamed: 0,premise,hypothesis,label
0,Boys and adults at a soccer game.,The kids and adults are playing each other.,1
1,A man in a bright orange shirt scales a slate ...,A man wears a blue shirt.,2
2,Two boys hold badminton rackets to the sky in ...,The boys are outside.,0
3,A person on a cellphone stands at a window in ...,A person swimming in the pool and talking to s...,2
4,Two men in a jogging race on a black top stree...,One man is wearing a black top.,0


In [4]:
# (rows, columns) of the training split.
train_dataset.shape

(49442, 3)

In [5]:
# Preview the first rows of the validation split.
validation_dataset.head()

Unnamed: 0,premise,hypothesis,label
0,A black dog running on the beach near the shore.,a dog is sleeping,2
1,A blue go-cart in the dirt pointing downhill.,A child racing his go-cart against his friends.,1
2,A chef tastes his food over a boiling pot.,A chef is working.,0
3,A woman posing for a picture while cooking food.,There is a women watching others cook,2
4,A boy hiding behind a tree,The boy is asian.,1


In [6]:
# (rows, columns) of the validation split.
validation_dataset.shape

(5494, 3)

In [7]:
# Preview the first rows of the test set.
test_dataset.head()

Unnamed: 0,premise,hypothesis
0,This church choir sings to the masses as they ...,The church has cracks in the ceiling.
1,This church choir sings to the masses as they ...,The church is filled with song.
2,This church choir sings to the masses as they ...,A choir singing at a baseball game.
3,"A woman with a green headscarf, blue shirt and...",The woman is young.
4,"A woman with a green headscarf, blue shirt and...",The woman is very happy.


In [8]:
# (rows, columns) of the test set.
test_dataset.shape

(9824, 2)

In [9]:
# Normalise both text columns of every split with the same pipeline:
# cast to str, then utils.change_lower -> utils.clean_data -> utils.remover.
# (Presumably lower-casing, cleaning and stop-word removal - confirm in utils.)
#
# Fix: the previous copy-pasted version assigned the *training* hypotheses
# into test_dataset["hypothesis"], which clobbered the test hypotheses and
# left the training hypotheses without the str cast. Looping over the frames
# applies the identical steps to each column of each split.
TEXT_COLUMNS = ("premise", "hypothesis")

for dataset in (train_dataset, validation_dataset, test_dataset):
    for column in TEXT_COLUMNS:
        dataset[column] = (
            dataset[column]
            .astype(str)
            .apply(utils.change_lower)
            .apply(utils.clean_data)
            .apply(utils.remover)
        )

In [10]:
def make_pairs(dataset: pd.DataFrame) -> list:
    """
    Join each row's premise and hypothesis into one space-separated string.
    Args:
        dataset: frame with string "premise" and "hypothesis" columns
    Returns:
        list of "<premise> <hypothesis>" strings, one per row, in row order
    """
    # zip pairs the columns by position, so this does not rely on the frame
    # having a default 0..n-1 index the way label lookups (series[i]) do.
    return [
        premise + " " + hypothesis
        for premise, hypothesis in zip(dataset["premise"], dataset["hypothesis"])
    ]


train_pairs = make_pairs(train_dataset)
validation_pairs = make_pairs(validation_dataset)
test_pairs = make_pairs(test_dataset)

In [11]:
# Passed to CountVectorizer(binary=...): False keeps raw term counts.
BINARIZED = False

# Learn the vocabulary from the training pairs only, then reuse that fitted
# vocabulary unchanged for the validation and test pairs.
vectorizer = CountVectorizer(binary=BINARIZED)
train_vectorized = vectorizer.fit_transform(train_pairs).toarray()

# NOTE(review): .toarray() turns each sparse count matrix into a dense array;
# with ~49k training rows and ~16k vocabulary terms that is a very large
# allocation. Keeping the sparse output may be enough for the classifier -
# confirm nothing downstream needs ndarrays before changing it.
validation_vectorized, test_vectorized = (
    vectorizer.transform(pairs).toarray()
    for pairs in (validation_pairs, test_pairs)
)

In [12]:
# Number of distinct terms the fitted vectorizer learned from the training pairs.
vocabulary_size = len(vectorizer.get_feature_names_out())
print("Size of vocabulary:", vocabulary_size)

Size of vocabulary: 15825


In [13]:
# Upper bound on solver iterations for the logistic-regression fit.
# NOTE(review): the recorded run stopped at this limit (see the warning in the
# cell output); consider raising it or scaling the features.
MAX_ITER = 200

train_labels = train_dataset["label"].to_numpy()

classifier = LogisticRegression(max_iter=MAX_ITER)
classifier.fit(train_vectorized, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def get_prfa(dev_y: list, preds: list, verbose=False, average: str = "micro") -> tuple:
    """
    Calculate precision, recall, f1, and accuracy for a given set of predictions and labels.

    Note: with the default average="micro" on single-label multiclass data,
    precision, recall, f1 and accuracy all come out as the same number. Pass
    average="macro" (or "weighted") to get distinct per-class-averaged scores.

    Args:
        dev_y: list of labels
        preds: list of predictions
        verbose: whether to print the metrics
        average: averaging mode forwarded to precision_score, recall_score
            and f1_score (defaults to "micro", the previous hard-coded value)
    Returns:
        tuple of precision, recall, f1, and accuracy
    """
    precision = precision_score(dev_y, preds, average=average)
    recall = recall_score(dev_y, preds, average=average)
    f1 = f1_score(dev_y, preds, average=average)
    accuracy = accuracy_score(dev_y, preds)

    # `verbose` was documented but previously ignored; honour it here.
    if verbose:
        print("precision:", precision)
        print("recall:", recall)
        print("f1:", f1)
        print("accuracy:", accuracy)

    return precision, recall, f1, accuracy

In [28]:
# Score the classifier on the data it was trained on.
train_predictions = classifier.predict(train_vectorized)

# get_prfa takes (true labels, predictions) and returns
# (precision, recall, f1, accuracy).
prfa = get_prfa(train_dataset["label"].to_list(), train_predictions)

# Print the four metrics one per line, in the order returned.
for metric in prfa:
    print(metric)

0.6846810404109867
0.6846810404109867
0.6846810404109867
0.6846810404109867


In [26]:
# Score the classifier on the held-out validation split.
validation_predictions = classifier.predict(validation_vectorized)

# get_prfa expects (true labels, predictions). The arguments were previously
# passed the other way round; that is invisible under micro averaging but
# would swap precision and recall under any other averaging mode.
p, r, f, a = get_prfa(validation_dataset["label"].to_list(), validation_predictions)

# precision, recall, f1, accuracy - one per line.
print(p)
print(r)
print(f)
print(a)

0.4788860575172916
0.4788860575172916
0.4788860575172916
0.4788860575172916
