In [8]:
import pandas as pd
import re

In [9]:
train = pd.read_json("train.json")
test = pd.read_json("test.json")
validation = pd.read_json("validation.json")
train.head(5)

Unnamed: 0,premise,hypothesis,label
0,Pluto rotates once on its axis every 6.39 Eart...,Earth rotates on its axis once times in one day.,neutral
1,---Glenn =====================================...,Earth rotates on its axis once times in one day.,entails
2,geysers - periodic gush of hot water at the su...,The surface of the sun is much hotter than alm...,neutral
3,Facts: Liquid water droplets can be changed in...,Evaporation is responsible for changing liquid...,entails
4,"By comparison, the earth rotates on its axis o...",Earth rotates on its axis once times in one day.,entails


In [None]:
import re

def clean_text(text: str) -> str:
    text = text.lower()  # lowercase
    
    # replace non-digit dots with space
    text = re.sub(r'(?<!\d)\.(?!\d)', ' ', text)
    
    # remove everything except letters, numbers, dot-in-numbers, spaces
    text = re.sub(r'[^a-z0-9.\s]', ' ', text)
    
    # normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [None]:
# apply clean text on train
train["premise_clean"] = train["premise"].apply(clean_text)
train["hypothesis_clean"]  = train["hypothesis"].apply(clean_text)


# apply clean text on test
test["premise_clean"] = test["premise"].apply(clean_text)
test["hypothesis_clean"]  = test["hypothesis"].apply(clean_text)


# apply clean text on validation
validation["premise_clean"] = validation["premise"].apply(clean_text)
validation["hypothesis_clean"]  = validation["hypothesis"].apply(clean_text)

In [None]:
# import required libraries
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt_tab to C:\Users\lucac.DESKTOP-
[nltk_data]     V7T4SBD\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# define contractions dictionary
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have",
                    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did",
                    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have",
                    "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us",
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have",
                    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will",
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
                    "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}


In [None]:
# define preprocessing function
def pre_process(sent_list):
    """
    Preprocess a list of sentences:
    - Handle contractions
    - Remove punctuation
    - Tokenize
    """
    output = []
    for sent in sent_list:
        sent = sent.lower() # case-folding (already done, but keeping for consistency)
        for word, new_word in contraction_dict.items():
            sent = sent.replace(word, new_word) # dealing with contractions
        sent = re.sub(r'[^\w\s]','',sent) # removing punctuation
        output.append(word_tokenize(sent)) # tokenization
    return output

# Apply preprocessing to get tokenized lists
print("Tokenizing premises...")
train_premise_tokens = pre_process(train["premise_clean"].tolist())
test_premise_tokens = pre_process(test["premise_clean"].tolist())
val_premise_tokens = pre_process(validation["premise_clean"].tolist())

print("Tokenizing hypotheses...")
train_hypothesis_tokens = pre_process(train["hypothesis_clean"].tolist())
test_hypothesis_tokens = pre_process(test["hypothesis_clean"].tolist())
val_hypothesis_tokens = pre_process(validation["hypothesis_clean"].tolist())

print(f"Example tokenized premise: {train_premise_tokens[0]}")
print(f"Example tokenized hypothesis: {train_hypothesis_tokens[0]}")


Tokenizing premises...
Tokenizing hypotheses...
Example tokenized premise: ['pluto', 'rotates', 'once', 'on', 'its', 'axis', 'every', '639', 'earth', 'days']
Example tokenized hypothesis: ['earth', 'rotates', 'on', 'its', 'axis', 'once', 'times', 'in', 'one', 'day']


In [None]:
word_to_ix = {"<PAD>": 0, "<UNK>": 1}  # PAD for padding, UNK for unknown words

# build vocabulary from training data only
for sentence in train_premise_tokens + train_hypothesis_tokens:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

word_list = list(word_to_ix.keys())
vocab_size = len(word_to_ix)

print(f"Vocabulary size: {vocab_size}")
print(f"First 20 words in vocab: {word_list[:20]}")


Vocabulary size: 20499
First 20 words in vocab: ['<PAD>', '<UNK>', 'pluto', 'rotates', 'once', 'on', 'its', 'axis', 'every', '639', 'earth', 'days', 'glenn', 'per', 'day', 'the', 'about', 'geysers', 'periodic', 'gush']


In [None]:
# convert tokens to indices
def to_index(data, to_ix):
    """
    Convert token lists to index lists
    Use <UNK> token for words not in vocabulary
    """
    result = []
    for sentence in data:
        indexed_sentence = []
        for word in sentence:
            if word in to_ix:
                indexed_sentence.append(to_ix[word])
            else:
                indexed_sentence.append(to_ix["<UNK>"])  # Unknown word
        result.append(indexed_sentence)
    return result

# convert all datasets to indices
train_premise_idx = to_index(train_premise_tokens, word_to_ix)
train_hypothesis_idx = to_index(train_hypothesis_tokens, word_to_ix)

test_premise_idx = to_index(test_premise_tokens, word_to_ix)
test_hypothesis_idx = to_index(test_hypothesis_tokens, word_to_ix)

val_premise_idx = to_index(val_premise_tokens, word_to_ix)
val_hypothesis_idx = to_index(val_hypothesis_tokens, word_to_ix)

print(f"Example indexed premise: {train_premise_idx[0]}")
print(f"Example indexed hypothesis: {train_hypothesis_idx[0]}")


Example indexed premise: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Example indexed hypothesis: [10, 3, 5, 6, 7, 4, 48, 78, 60, 14]


In [22]:
# maximum length from the training set
MAX_LENGTH_PREMISE = max([len(s) for s in train_premise_idx])
MAX_LENGTH_HYPOTHESIS = max([len(s) for s in train_hypothesis_idx])

print(f"Max premise length: {MAX_LENGTH_PREMISE}")
print(f"Max hypothesis length: {MAX_LENGTH_HYPOTHESIS}")

Max premise length: 11640
Max hypothesis length: 36


In [None]:
# encode labels
label_to_ix = {"entails": 0, "neutral": 1}
ix_to_label = {0: "entails", 1: "neutral"}

# convert labels to indices
train_labels = [label_to_ix[label] for label in train["label"]]
test_labels = [label_to_ix[label] for label in test["label"]]
val_labels = [label_to_ix[label] for label in validation["label"]]

print(f"Label mapping: {label_to_ix}")
print(f"Example labels: {train_labels[:5]}")
print(f"Original labels: {train['label'].iloc[:5].tolist()}")

Label mapping: {'entails': 0, 'neutral': 1}
Example labels: [1, 0, 1, 0, 0]
Original labels: ['neutral', 'entails', 'neutral', 'entails', 'entails']


In [24]:
# final statistics
print(f"Training examples: {len(train_premise_idx)}")
print(f"Validation examples: {len(val_premise_idx)}")
print(f"Test examples: {len(test_premise_idx)}")
print(f"Vocabulary size: {vocab_size}")
print(f"Number of labels: {len(label_to_ix)}")
print(f"Max premise length: {MAX_LENGTH_PREMISE}")
print(f"Max hypothesis length: {MAX_LENGTH_HYPOTHESIS}")

Training examples: 23088
Validation examples: 1304
Test examples: 2126
Vocabulary size: 20499
Number of labels: 2
Max premise length: 11640
Max hypothesis length: 36


In [None]:
# save preprocessed data
import pickle

preprocessed_data = {
    'train_premise_idx': train_premise_idx,
    'train_hypothesis_idx': train_hypothesis_idx,
    'train_labels': train_labels,
    'test_premise_idx': test_premise_idx,
    'test_hypothesis_idx': test_hypothesis_idx,
    'test_labels': test_labels,
    'val_premise_idx': val_premise_idx,
    'val_hypothesis_idx': val_hypothesis_idx,
    'val_labels': val_labels,
    'word_to_ix': word_to_ix,
    'label_to_ix': label_to_ix,
    'ix_to_label': ix_to_label,
    'vocab_size': vocab_size,
    'MAX_LENGTH_PREMISE': MAX_LENGTH_PREMISE,
    'MAX_LENGTH_HYPOTHESIS': MAX_LENGTH_HYPOTHESIS
}

with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump(preprocessed_data, f)
    
print("Preprocessed data saved to 'preprocessed_data.pkl'")


Preprocessed data saved to 'preprocessed_data.pkl'


## How to Load Preprocessed Data

In the model training notebook, you can load this data with:

```python
import pickle

with open('preprocessed_data.pkl', 'rb') as f:
    data = pickle.load(f)

# Access the data
train_premise_idx = data['train_premise_idx']
train_hypothesis_idx = data['train_hypothesis_idx']
train_labels = data['train_labels']
word_to_ix = data['word_to_ix']
vocab_size = data['vocab_size']
# ... etc
```
