# Train Models
Notebook with the code needed to train models and store them
to disk. This notebook has to be clean (do not define functions here; define them in
an external utils.py and import them). The notebook has to be reproducible (if
you run it twice, the same output has to be displayed and stored to disk).

## Import Utils and libraries

In [None]:
# Mount Google Drive so the project folder under /content/drive/ is reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive/')
# List the current working directory to confirm the mount point is visible.
%ls

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [None]:
#Libraries to access the data and save the models
import os
import pickle


In [None]:
!pip install pyspellchecker
!pip install unidecode
!pip install sentence_transformers
!pip install textstat
!pip install fasttext

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1
Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8
Collecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvi

In [None]:
# Point HOME at the project folder on Drive; every path below is derived from it.
# (The recorded output shows nltk_data being stored under this folder as well.)
os.environ['HOME'] = '/content/drive/MyDrive/NLP Project'

# Read the value back so the rest of the notebook uses a single source of truth
home_dir = os.environ['HOME']

# All directories used
models_dir = os.path.join(home_dir, 'models')
datasets_dir=os.path.join(home_dir, 'Datasets')
path_folder_quora=os.path.join(datasets_dir, 'QuoraQuestionPairs')
SimpleSolution_dir= os.path.join(models_dir, 'SimpleSolution')
CleanQuestions_dir=os.path.join(datasets_dir, 'questions')
TFIDF_dir= os.path.join(models_dir, 'TFIDF')
DataframesFeatureExtraction_dir = os.path.join(datasets_dir, 'DataframesFeatureExtraction')
DataframesFeatureDistEmbeddings_dir=os.path.join(datasets_dir, 'DataframesFeatureDistEmbeddings')
Fasttext_unsup_dir= os.path.join(datasets_dir, 'Fasttext_Unsup')
ModelNewFeatures_dir=os.path.join(models_dir, 'ModelNewFeatures')

# Import all libraries, functions and classes needed from utils.
# utils.py is executed directly in the notebook namespace, so every name it defines
# or imports becomes a notebook global. The file is read inside a `with` block so
# the handle is closed, and compiled with its path so tracebacks point at utils.py.
utils_path = os.path.join(home_dir, 'utils.py')
with open(utils_path) as utils_file:
    exec(compile(utils_file.read(), utils_path, 'exec'))

[nltk_data] Downloading package punkt to /content/drive/MyDrive/NLP
[nltk_data]     Project/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /content/drive/MyDrive/NLP Project/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read and split the data

In [None]:
# Read the labelled Quora question pairs
train_df = pd.read_csv(os.path.join(path_folder_quora, 'quora_train_data.csv'))

# Use this to provide the expected generalization results
test_df = pd.read_csv(os.path.join(path_folder_quora, 'quora_test_data.csv'))

# Split data: first hold out 5% as the internal test split, then 5% of the
# remainder as the validation split. Both splits use a fixed seed.
A_df, te_df = sklearn.model_selection.train_test_split(train_df,
                                                       test_size=0.05,
                                                       random_state=123)
tr_df, va_df = sklearn.model_selection.train_test_split(A_df,
                                                        test_size=0.05,
                                                        random_state=123)

# Separate the target column from the features for each split
y_tr = tr_df['is_duplicate'].values
X_tr = tr_df.drop(['is_duplicate'], axis =1)

y_va = va_df['is_duplicate'].values
X_va = va_df.drop(['is_duplicate'], axis =1)

y_te = te_df['is_duplicate'].values
X_te = te_df.drop(['is_duplicate'], axis =1)

# Report the shape of every split; each label matches the array it prints
print('X_tr.shape=',X_tr.shape)
print('y_tr.shape=',y_tr.shape)
print('X_va.shape=',X_va.shape)
print('y_va.shape=',y_va.shape)
print('X_te.shape=',X_te.shape)
print('y_te.shape=',y_te.shape)

X_tr.shape= (291897, 5)
y_tr.shape= (291897,)
X_va.shape= (15363, 5)
y_va.shape= (291897,)
X_te.shape= (16172, 5)
y_tr.shape= (291897,)


# Simple solution
Here we implement the simple solution from the given code

In [None]:
# Gather every training question (both sides of each pair) into one list
all_q1 = list(X_tr["question1"])
all_q2 = list(X_tr["question2"])

# Cast all questions to strings so the vectorizer only ever sees text
all_questions = cast_list_as_strings(all_q1 + all_q2)

# Unigram bag-of-words vocabulary learned from the training questions only
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1)).fit(all_questions)

# One sparse feature matrix per split, built from question 1 and question 2
X_tr_q1q2, X_va_q1q2, X_te_q1q2 = (
    get_features_from_df(split_df, count_vectorizer)
    for split_df in (X_tr, X_va, X_te)
)

# Train Logistic Regression Model (fixed seed)
lr_model = sklearn.linear_model.LogisticRegression(
    solver="liblinear",
    random_state=123,
)
lr_model.fit(X_tr_q1q2, y_tr)

## Save model and all datasets

In [None]:
# Make sure the output directory exists
os.makedirs(SimpleSolution_dir, exist_ok=True)

# File name -> object to pickle: the model first, then the train / validation / test features
simple_solution_artifacts = {
    'lr_model.pkl': lr_model,
    'X_tr_q1q2.pkl': X_tr_q1q2,
    'X_va_q1q2.pkl': X_va_q1q2,
    'X_te_q1q2.pkl': X_te_q1q2,
}

for artifact_name, artifact in simple_solution_artifacts.items():
    with open(os.path.join(SimpleSolution_dir, artifact_name), 'wb') as file:
        pickle.dump(artifact, file)



# Improvement

## TFIDF

**TF-IDF Calculation / Training**

In [None]:
# Cache locations for the training-split TF-IDF features and the baseline model fitted on them
filenameTFIDF_train = os.path.join(TFIDF_dir, 'tfidf_train_set.sav')
filenamelog_train = os.path.join(TFIDF_dir,'log_train_set.sav')

# The training labels are consumed by the grid-search cell, so define them on both paths
y_train = tr_df["is_duplicate"].values

if os.path.exists(filenameTFIDF_train) and os.path.exists(filenamelog_train):
    # Both artifacts are cached: reload them so X_tr_q1q2 holds the TF-IDF features
    # rather than the CountVectorizer features computed for the simple solution.
    # These pickles are written by this notebook; never point these paths at untrusted files.
    # Note that the fitted `tfidf` object itself is not cached, so it is only
    # defined on the rebuild path below.
    with open(filenameTFIDF_train, 'rb') as file:
        X_tr_q1q2 = pickle.load(file)
    with open(filenamelog_train, 'rb') as file:
        logistic_train_set = pickle.load(file)
else:
    # At least one artifact is missing: rebuild both so they stay consistent
    os.makedirs(TFIDF_dir, exist_ok=True)

    # Cast each question column once and reuse it for fitting and transforming
    q1_tr_text = cast_list_as_strings(list(tr_df["question1"]))
    q2_tr_text = cast_list_as_strings(list(tr_df["question2"]))

    # Create TFIDF instance and fit it once, on the training questions only
    tfidf = TFIDF()
    corpus = q1_tr_text + q2_tr_text
    tfidf.fit(corpus)

    # Call transform for X_tr_q1 and X_tr_q2
    X_tr_q1 = tfidf.transform(q1_tr_text)
    X_tr_q2 = tfidf.transform(q2_tr_text)

    # Stack features horizontally
    X_tr_q1q2 = scipy.sparse.hstack((X_tr_q1,X_tr_q2))

    # Save TFIDF vectors to disk (the `with` block closes the file handle)
    with open(filenameTFIDF_train, 'wb') as file:
        pickle.dump(X_tr_q1q2, file)
    print("TFIDF vectors for training set saved to file:", filenameTFIDF_train)

    # Fit logistic reg
    logistic_train_set = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
    logistic_train_set.fit(X_tr_q1q2, y_train)

    # Save trained model to disk
    with open(filenamelog_train, 'wb') as file:
        pickle.dump(logistic_train_set, file)
    print("Logistic Reg for training set saved to file:", filenamelog_train)

**Validation**

In [None]:
# Cache location for the validation-split TF-IDF features
filenameTFIDF_val = os.path.join(TFIDF_dir, 'tfidf_val_set.sav')

# The validation labels are consumed by the grid-search cell, so define them on both paths
y_val = va_df["is_duplicate"].values

if os.path.exists(filenameTFIDF_val):
    # Cached: reload so X_va_q1q2 holds the TF-IDF features rather than the
    # CountVectorizer features computed for the simple solution.
    # This pickle is written by this notebook; never point the path at untrusted files.
    with open(filenameTFIDF_val, 'rb') as file:
        X_va_q1q2 = pickle.load(file)
else:
    os.makedirs(TFIDF_dir, exist_ok=True)

    # Call transform for X_va_q1 and X_va_q2.
    # `tfidf` must be the instance fitted on the training split in the training cell.
    X_va_q1 = tfidf.transform(cast_list_as_strings(list(va_df["question1"])))
    X_va_q2 = tfidf.transform(cast_list_as_strings(list(va_df["question2"])))

    # Stack features horizontally
    X_va_q1q2 = scipy.sparse.hstack((X_va_q1,X_va_q2))

    # Save TFIDF vectors to disk (the `with` block closes the file handle)
    with open(filenameTFIDF_val, 'wb') as file:
        pickle.dump(X_va_q1q2, file)
    print("TFIDF vectors for validation set saved to file:", filenameTFIDF_val)

**Choosing best hyperparameters for Logistic Regression**

In [None]:
# Define parameter grid
param_grid = {
    'penalty': ['l2', 'l1'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear']
}

# The train+val model is fitted with the winning hyperparameters, so its presence
# on disk means the search has already been run
filenamelog_train_val = os.path.join(TFIDF_dir, 'log_train_val_set.sav')

if os.path.exists(filenamelog_train_val):
    # Skip the search, but still recover `best_params` from the cached model:
    # the cell that retrains on train+val+test needs it even when this model is cached.
    # This pickle is written by this notebook; never point the path at untrusted files.
    with open(filenamelog_train_val, 'rb') as file:
        cached_model_params = pickle.load(file).get_params()
    best_params = {name: cached_model_params[name] for name in param_grid}
    print("Best parameters (recovered from cached model):", best_params)
else:
    # Call the function to perform grid search: fit on the training features,
    # score on the validation features
    best_score, best_params = perform_grid_search(X_tr_q1q2, y_train, X_va_q1q2, y_val, param_grid)

    print("Best score:", best_score)
    print("Best parameters:", best_params)

**Retraining classifier on the combined training and validation sets using the selected hyperparameters**

In [None]:
# Cache locations for the model and TF-IDF features built on train + validation
filenamelog_train_val = os.path.join(TFIDF_dir, 'log_train_val_set.sav')
filenameTFIDF_train_val = os.path.join(TFIDF_dir, 'tfidf_train_val_set.sav')

model_cached = os.path.exists(filenamelog_train_val)
features_cached = os.path.exists(filenameTFIDF_train_val)

# Rebuild whenever either artifact is missing. Previously a missing feature file
# was silently ignored as long as the model file existed.
if not (model_cached and features_cached):
    os.makedirs(TFIDF_dir, exist_ok=True)

    # Cast each question column once and reuse it for fitting and transforming
    q1_tr_text = cast_list_as_strings(list(tr_df["question1"]))
    q2_tr_text = cast_list_as_strings(list(tr_df["question2"]))
    q1_va_text = cast_list_as_strings(list(va_df["question1"]))
    q2_va_text = cast_list_as_strings(list(va_df["question2"]))

    # Create TFIDF instance and fit it once, on training + validation questions
    tfidf = TFIDF()
    corpus = q1_tr_text + q2_tr_text + q1_va_text + q2_va_text
    tfidf.fit(corpus)

    # Call transform for X_tr_q1 and X_tr_q2
    X_tr_q1 = tfidf.transform(q1_tr_text)
    X_tr_q2 = tfidf.transform(q2_tr_text)

    # Stack features horizontally
    X_tr_q1q2 = scipy.sparse.hstack((X_tr_q1,X_tr_q2))

    # Call transform for X_va_q1 and X_va_q2
    X_va_q1 = tfidf.transform(q1_va_text)
    X_va_q2 = tfidf.transform(q2_va_text)

    # Stack features horizontally
    X_va_q1q2 = scipy.sparse.hstack((X_va_q1,X_va_q2))

    # Stack both training and validation features
    X_q1q2 = scipy.sparse.vstack((X_tr_q1q2,X_va_q1q2))

    # Save TFIDF vectors to disk (the `with` block closes the file handle)
    with open(filenameTFIDF_train_val, 'wb') as file:
        pickle.dump(X_q1q2, file)
    print("TFIDF vectors for training and val set saved to file::", filenameTFIDF_train_val)

    # Only refit the classifier when its file is missing; an existing model is kept,
    # which also means `best_params` is not required on that path.
    if not model_cached:
        # Fit log reg with best parameters found in GridSearch
        logistic_train_val_set = sklearn.linear_model.LogisticRegression(penalty = best_params['penalty'],
                                                       C = best_params['C'],
                                                       solver = best_params['solver'],
                                                       random_state=123)

        # Labels in the same row order as the vstack above: training first, then validation
        y_true_tr_va = np.concatenate((tr_df["is_duplicate"].values, va_df["is_duplicate"].values),axis=0)
        logistic_train_val_set.fit(X_q1q2, y_true_tr_va)

        # Save trained model to disk
        with open(filenamelog_train_val, 'wb') as file:
            pickle.dump(logistic_train_val_set, file)
        print("Logistic Reg for training and val set saved to file:", filenamelog_train_val)

### Transforming TFIDF for test set

In [None]:
# Cache location for the internal test-split TF-IDF features
filenameTFIDF_test = os.path.join(TFIDF_dir, 'tfidf_test_set.sav')

if not os.path.exists(filenameTFIDF_test):
    os.makedirs(TFIDF_dir, exist_ok=True)

    # Call transform for X_te_q1 and X_te_q2.
    # Uses whichever `tfidf` instance is currently in the notebook namespace.
    # NOTE(review): this is presumably meant to be the one fitted on train + validation — confirm.
    X_te_q1 = tfidf.transform(cast_list_as_strings(list(te_df["question1"])))
    X_te_q2 = tfidf.transform(cast_list_as_strings(list(te_df["question2"])))

    # Stack features horizontally
    X_te_q1q2 = scipy.sparse.hstack((X_te_q1,X_te_q2))

    # Save TFIDF vectors to disk (the `with` block closes the file handle)
    with open(filenameTFIDF_test, 'wb') as file:
        pickle.dump(X_te_q1q2, file)
    print("TFIDF vectors for test set saved to file::", filenameTFIDF_test)

### Retraining the model on the entire dataset (train + validation + test subsets), as if we were in the Kaggle Challenge

In [None]:
# Cache locations for the model and TF-IDF features built on the whole labelled dataset
filenamelog_train_val_test = os.path.join(TFIDF_dir, 'log_train_val_test_set.sav')
filenameTFIDF_train_val_test = os.path.join(TFIDF_dir, 'tfidf_train_val_test_set.sav')

model_cached = os.path.exists(filenamelog_train_val_test)
features_cached = os.path.exists(filenameTFIDF_train_val_test)

# Rebuild whenever either artifact is missing. Previously a missing feature file
# was silently ignored as long as the model file existed.
if not (model_cached and features_cached):
    os.makedirs(TFIDF_dir, exist_ok=True)

    # Cast each question column once and reuse it for fitting and transforming
    q1_all_text = cast_list_as_strings(list(train_df["question1"]))
    q2_all_text = cast_list_as_strings(list(train_df["question2"]))

    # Create TFIDF instance and fit it once
    tfidf = TFIDF()
    corpus = q1_all_text + q2_all_text
    tfidf.fit(corpus)

    # Call transform for X_tvt_q1 and X_tvt_q2 (train_df contains all subsets used in the code above)
    X_tvt_q1 = tfidf.transform(q1_all_text)
    X_tvt_q2 = tfidf.transform(q2_all_text)

    # Stack features horizontally
    X_tvt_q1q2 = scipy.sparse.hstack((X_tvt_q1,X_tvt_q2))

    # Save TFIDF vectors to disk (the `with` block closes the file handle)
    with open(filenameTFIDF_train_val_test, 'wb') as file:
        pickle.dump(X_tvt_q1q2, file)
    print("TFIDF vectors for training, val and test set saved to file:", filenameTFIDF_train_val_test)

    # Only refit the classifier when its file is missing; an existing model is kept,
    # which also means `best_params` is not required on that path.
    if not model_cached:
        # Fit log reg with the hyperparameters selected by the grid search
        logistic_train_val_test_set = sklearn.linear_model.LogisticRegression(penalty = best_params['penalty'],
                                                       C = best_params['C'],
                                                       solver = best_params['solver'],
                                                       random_state=123)
        y_tvt = train_df["is_duplicate"].values
        logistic_train_val_test_set.fit(X_tvt_q1q2, y_tvt)

        # Save trained model to disk
        with open(filenamelog_train_val_test, 'wb') as file:
            pickle.dump(logistic_train_val_test_set, file)
        print("Logistic Reg for training, val and test set saved to file:", filenamelog_train_val_test)

### Kaggle Predictions: tfidf.transform to obtain feature vectors

In [None]:
# Cache location for the TF-IDF features of the Kaggle (unlabelled) set
filenameTFIDF_kaggle = os.path.join(TFIDF_dir, 'tfidf_kaggle.sav')

if not os.path.exists(filenameTFIDF_kaggle):
    os.makedirs(TFIDF_dir, exist_ok=True)

    # Call transform for X_test_q1 and X_test_q2 (test_df not used until now).
    # Uses whichever `tfidf` instance is currently in the notebook namespace.
    # NOTE(review): this is presumably meant to be the one fitted on the whole train_df — confirm.
    X_kaggle_q1 = tfidf.transform(cast_list_as_strings(list(test_df["question1"])))
    X_kaggle_q2 = tfidf.transform(cast_list_as_strings(list(test_df["question2"])))

    # Stack features horizontally
    X_kaggle_q1q2 = scipy.sparse.hstack((X_kaggle_q1,X_kaggle_q2))

    # Save TFIDF vectors to disk (the `with` block closes the file handle)
    with open(filenameTFIDF_kaggle, 'wb') as file:
        pickle.dump(X_kaggle_q1q2, file)
    print("TFIDF vectors for Kaggle set saved to file:", filenameTFIDF_kaggle)

## Fast Text

Let us implement the Fast Text approach.

We will now train fastText on the Quora Challenge questions (using only the training dataset). This both creates the embeddings and trains a supervised classification model on top of them, and is done with fastText's `train_supervised` function.

In [None]:
# Text files holding the training / validation pairs in the format written by training_format_sup
file1 = os.path.join(home_dir, './Datasets/FastTextSupervised/preguntas_train.txt')
file2 = os.path.join(home_dir, './Datasets/FastTextSupervised/preguntas_val.txt')

# Both files live in the same folder; create it up front so writing cannot fail on a missing directory
os.makedirs(os.path.dirname(file1), exist_ok=True)

# Write the training file only if it is not already on disk
flag = os.path.exists(file1)
if not flag:
  preguntas_train = tr_df[['question1', 'question2']].values.tolist()
  label_train = tr_df['is_duplicate'].tolist()
  training_format_sup(preguntas_train,label_train, file1)

# Same for the validation file
flag = os.path.exists(file2)
if not flag:
  preguntas_val = va_df[['question1', 'question2']].values.tolist()
  label_val = va_df['is_duplicate'].tolist()
  training_format_sup(preguntas_val,label_val,file2)


In [None]:
# Train the supervised fastText classifier on the training file, unless a saved model already exists
outfile1 = os.path.join(home_dir, './models/FastText_Sup/Fast_Text_Supervised.bin')
flag = os.path.exists(outfile1)
if not flag:
  # Create the model folder before training so a missing directory cannot
  # make save_model fail after the whole training run
  os.makedirs(os.path.dirname(outfile1), exist_ok=True)
  model = fasttext.train_supervised(input= file1, lr=0.1, epoch=25, wordNgrams=2)
  model.save_model(outfile1)


To try to improve the previous model, we run a hyperparameter optimization using the validation dataset. Note that the dataset is unbalanced, which is why we focus on the F1 metric.

In [None]:
# Hyperparameter search with fastText autotune, using the validation file as the tuning set.
# Skipped when the tuned model is already saved on disk.

outfile2 = os.path.join(home_dir, './models/FastText_Sup/Fast_Text_Supervised_val.bin')
flag = os.path.exists(outfile2)
if not flag:
  # NOTE(review): the metric targets the label "_label_1"; this has to match the exact
  # label string that training_format_sup writes into file1/file2 — confirm against utils.py.
  model_val = fasttext.train_supervised(input=file1, autotuneValidationFile= file2, autotuneMetric="f1:_label_1")
  model_val.save_model(outfile2)

We will see in the reproduce results file that unfortunately this last model did not improve the accuracy of our first model.

Since we still want to improve the results, we will now preprocess the data (the questions) and extract some features from them. We will then train an unsupervised fastText model and, using the distances between the resulting embeddings together with the newly extracted features, train a new model.

## Preprocessing

In this section, the goal is to preprocess all questions by cleaning them. To start, we create the questions_df dataframe, which contains all questions along with their respective question IDs.

In [None]:
# Stack (qid1, question1) and (qid2, question2) into one long (qid, question) table
qid1 = train_df[['qid1', 'question1']].rename(columns={'qid1': 'qid', 'question1': 'question'})
qid2 = train_df[['qid2', 'question2']].rename(columns={'qid2': 'qid', 'question2': 'question'})
questions_df = pd.concat([qid1, qid2])
print('Shape before removing repetitions:',questions_df.shape)

# Keep a single row per question id
questions_df = questions_df.drop_duplicates(subset=['qid'])
print('Shape after removing repetitions:',questions_df.shape)

# Order by question id and renumber the rows from 0
questions_df = (
    questions_df
    .sort_values(by=['qid'])
    .reset_index(drop=True)
)

# Make sure every question is a string before any text processing
questions_df = questions_df.assign(
    question=cast_list_as_strings(list(questions_df["question"]))
)
questions_df.head()

Shape before removing repetitions: (646864, 2)
Shape after removing repetitions: (450125, 2)


Unnamed: 0,qid,question
0,1,What is the step by step guide to invest in sh...
1,2,What is the step by step guide to invest in sh...
2,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,4,What would happen if the Indian government sto...
4,5,How can I increase the speed of my internet co...


### Text Cleaning
During the text cleaning process, we implement the following steps:
- Expand contractions and abbreviations
- Remove punctuation
- Spellchecking (attempted, but omitted from the final pipeline: too computationally expensive)
- Remove stopwords
- Remove accents
- Normalize spaces

Spellchecking was omitted from the process due to computational constraints. We attempted two different methods, including BKTrees and the library pyspellchecker, both of which encountered the same issue.

In [None]:
# Clean every question with text_cleaning (defined in utils.py).
# A single column-wise map replaces the former row-by-row iterrows()/.at loop:
# it applies the same function to the same values, without building a Series
# per row and without the unused progress counter that was left behind from
# the spellchecking experiments.
# Note: this overwrites the 'question' column, so running the cell a second time
# cleans text that has already been cleaned.
questions_df['question'] = questions_df['question'].map(text_cleaning)

## Feature Extraction


Update datasets with the questions preprocessed

In [None]:
# Rebuild each split from the cleaned questions in questions_df.
# updatequestions (from utils.py) returns a new (features, labels) pair per split.
# NOTE(review): it presumably swaps question1/question2 for the cleaned text looked up
# by qid1/qid2 and may drop rows — confirm against utils.py.
# X_* and y_* are overwritten here, so this cell is not idempotent.
X_tr,y_tr=updatequestions(questions_df,X_tr,y_tr)
X_va,y_va=updatequestions(questions_df,X_va,y_va)
X_te,y_te=updatequestions(questions_df,X_te,y_te)

In [None]:
# Preview the first rows of the training split after the questions were replaced
X_tr.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2
0,125898,203030,203031,Java C C popular language amongst startups bac...,develop sofor winare Java GUI C C backend
1,36249,66113,66114,convert direct speech reported speech vice ver...,feel weak spoken ish sentences ready mind cann...
2,199864,301469,301470,buy used wine barrels,buy used wine barrels
3,277339,17728,138400,best day life Excluding family things like births,Best Day life till date
4,392907,525647,525648,webworkin works,get web designing work


Compute and add the new features to the dataset

In [None]:
# For each split: compute the pairwise features row by row, expand them into a frame
# and append them as new columns. The feature frames are built on the source frame's
# own index so the column-wise concat lines up row for row even if X_* does not
# carry a default 0..n-1 index.
# Note: X_* are overwritten, so re-running this cell would append the columns again.

# Apply feature extraction to training set
train_features = X_tr.apply(lambda row: extract_features(row['question1'], row['question2']), axis=1)
train_features_df = pd.DataFrame(train_features.tolist(), index=X_tr.index)
X_tr = pd.concat([X_tr, train_features_df], axis=1)

# Apply feature extraction to validation set
validation_features = X_va.apply(lambda row: extract_features(row['question1'], row['question2']), axis=1)
validation_features_df = pd.DataFrame(validation_features.tolist(), index=X_va.index)
X_va = pd.concat([X_va, validation_features_df], axis=1)

# Apply feature extraction to test set
test_features = X_te.apply(lambda row: extract_features(row['question1'], row['question2']), axis=1)
test_features_df = pd.DataFrame(test_features.tolist(), index=X_te.index)
X_te = pd.concat([X_te, test_features_df], axis=1)


In [None]:
# Preview the training split with the newly appended feature columns
X_tr.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,first_word_equal,common_words_ratio,flesch_reading_ease_q1,flesch_reading_ease_q2,flesch_kincaid_grade_q1,flesch_kincaid_grade_q2
0,125898,203030,203031,Java C C popular language amongst startups bac...,develop sofor winare Java GUI C C backend,0,0.375,28.5,63.36,11.5,6.4
1,36249,66113,66114,convert direct speech reported speech vice ver...,feel weak spoken ish sentences ready mind cann...,0,0.0,36.96,87.72,10.3,3.3
2,199864,301469,301470,buy used wine barrels,buy used wine barrels,1,1.0,92.8,92.8,1.3,1.3
3,277339,17728,138400,best day life Excluding family things like births,Best Day life till date,1,0.375,71.82,117.16,5.2,-1.9
4,392907,525647,525648,webworkin works,get web designing work,0,0.0,77.91,75.88,2.9,3.7


### Compute embeddings and distance between questions

In [None]:
# Make sure the folder for the unsupervised fastText files exists
os.makedirs(Fasttext_unsup_dir, exist_ok=True)

# Text file the questions are written to before unsupervised training
output_file = os.path.join(Fasttext_unsup_dir, 'training_data.txt')

First we train the unsupervised fastText model in order to compute the embeddings of each question.

In [None]:
# Plain Python list of the cleaned questions (one entry per unique qid)
questions = questions_df['question'].tolist()
# Train the FastText model
# NOTE(review): training_data_file is not referenced again in the visible notebook;
# the questions are written to output_file instead.
training_data_file = 'formatted_questions.txt'
# format_data comes from utils.py.
# NOTE(review): it presumably writes the questions to output_file in the layout fastText reads — confirm.
format_data(questions, output_file)
# NOTE(review): training runs with thread=4; multi-threaded fastText training may not
# produce identical vectors on every run — confirm against the reproducibility requirement.
model = fasttext.train_unsupervised(output_file, thread=4)
# Persist the trained model next to its training data
model.save_model(os.path.join(Fasttext_unsup_dir, 'model_unsup.bin'))

In [None]:
# Reload the unsupervised model from disk
loaded_model = fasttext.load_model(os.path.join(Fasttext_unsup_dir, 'model_unsup.bin'))

# One sentence vector per question; surrounding whitespace and embedded newlines
# are stripped from the text before it is handed to the model
embeddings = [
    loaded_model.get_sentence_vector(raw_question.strip().replace('\n', ''))
    for raw_question in questions_df['question']
]

# Attach the vectors to the dataframe
questions_df['embedding'] = embeddings

# From here on only (qid, embedding) is needed, so the text column is removed
questions_df.drop(columns=['question'], inplace=True)

In [None]:
# Show the first rows: one embedding vector per question id
print(questions_df.head(5))

   qid                                          embedding
0    1  [0.045364164, -0.0104960315, -0.084570326, 0.0...
1    2  [0.036480516, -0.005577362, -0.09713346, 0.018...
2    3  [0.09571289, 0.039823744, -0.09578352, 0.03402...
3    4  [0.073731326, 0.02422495, -0.06295221, -0.0667...
4    5  [0.1512996, 0.015331171, -0.034311138, 0.12251...


Update datasets with the embeddings and compute the distance between embeddings.

In [None]:
# Rebuild each split using the per-question embeddings stored in questions_df.
# updateembeddings (from utils.py) returns the updated frame for the split.
# NOTE(review): it presumably looks embeddings up by qid1/qid2 and adds distance columns
# (later cells drop 'manhattan_distance' and 'euclidean_distance') — confirm against utils.py.
# NOTE(review): the recorded output shows a warning raised at `1.0 - uv / np.sqrt(uu * vv)`,
# which suggests a cosine-style distance hit a zero-norm vector; check the result for NaN values.
X_tr=updateembeddings(questions_df,X_tr)
X_va=updateembeddings(questions_df,X_va)
X_te=updateembeddings(questions_df,X_te)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [None]:
# Make sure the output directory exists
os.makedirs(DataframesFeatureDistEmbeddings_dir, exist_ok=True)

# (file name, object) pairs: the three DataFrames first, then the three label arrays
feature_dist_artifacts = [
    ('X_tr.pkl', X_tr),
    ('X_va.pkl', X_va),
    ('X_te.pkl', X_te),
    ('y_tr.pkl', y_tr),
    ('y_va.pkl', y_va),
    ('y_te.pkl', y_te),
]

for artifact_name, artifact in feature_dist_artifacts:
    with open(os.path.join(DataframesFeatureDistEmbeddings_dir, artifact_name), 'wb') as file:
        pickle.dump(artifact, file)

### Train Random Forest and XGBoost with the new features computed


Here we employed GridSearchCV to fine-tune our model by experimenting with various hyperparameters. Additionally, we systematically dropped different columns to determine their impact on model performance. Ultimately, we selected the optimal model identified through this iterative process for further analysis.







### XGBoost

In [None]:
# Fit an XGBoost classifier (fixed seed, otherwise default settings) on every column
# of X_tr except the three identifier columns.
# NOTE(review): assumes all remaining columns are numeric at this point — confirm what
# updateembeddings leaves in the frame.
xgboost_model = xgb.XGBClassifier(random_state=123)
xgboost_model.fit(X_tr.drop(['id','qid1','qid2'],axis = 1), y_tr)

### Random Forest

In [None]:
# Columns left out of the random forest: the identifiers plus the features
# excluded for this model
rf_excluded_columns = [
    'id', 'qid1', 'qid2',
    'first_word_equal',
    'flesch_kincaid_grade_q1', 'flesch_kincaid_grade_q2',
    'manhattan_distance', 'euclidean_distance',
]

# Shallow forest (depth 5) with a fixed seed
rf_model = RandomForestClassifier(max_depth=5, random_state=123)
rf_model.fit(X_tr.drop(columns=rf_excluded_columns), y_tr)

In [None]:
# Save Models
# Make sure the output directory exists
os.makedirs(ModelNewFeatures_dir, exist_ok=True)

# (file name, fitted model) pairs: Random Forest first, then XGBoost
new_feature_models = [
    ('rf_model.pkl', rf_model),
    ('xgboost_model.pkl', xgboost_model),
]

for model_filename, fitted_model in new_feature_models:
    with open(os.path.join(ModelNewFeatures_dir, model_filename), 'wb') as file:
        pickle.dump(fitted_model, file)


# Sentence Transformers model

Sentence transformers provide an efficient way to convert text into numerical data, enabling machine learning models to process and analyze text with greater accuracy. This approach uses transformer models, such as BERT, to generate sentence embeddings that capture the contextual meanings of sentences. By doing so, it facilitates a wide range of NLP tasks including semantic similarity measurement, clustering, and information retrieval. In this section, we will explore how to utilize the sentence-transformers library to quickly generate embeddings for a list of sentences and then use these embeddings for semantic similarity computation. Let us dive into the code to see how this powerful tool can be implemented in practice.

Note that here we do not do any preprocessing, since we want to have the raw questions. Removing stop-words, for example, may make the text a bit more difficult for the pretrained models to understand.

## MiniLM with contrastive loss

One of the lighter but better sentence-transformers models is *all-MiniLM-L6-v2*. We will load this model just to let the reader know that this model will also be used in the reproduce_results notebook.

More information about the model: [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

In [None]:
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

We can also fine-tune this model.

In [None]:
# Fine-tune all-MiniLM-L6-v2 with ContrastiveLoss (margin 0.5) unless its output folder already exists
folder = os.path.join(home_dir, './models/MiniLM-05')
flag = os.path.isdir(folder)  # isdir() is already False for a path that does not exist
if not flag:
  model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
  fit(model, tr_df, va_df,
      loss='ContrastiveLoss', epochs=7, margin=0.5,
      out_model=folder)

Now let us fine-tune the same model while changing one of the training parameters: the margin $m$ of the ContrastiveLoss. Recall that dissimilar pairs only contribute to the loss while their estimated distance satisfies $D(X1,X2)<m$; once a negative pair is farther apart than this limit, the loss no longer cares how far apart $X1$ and $X2$ are. Training can therefore focus more on the points that are difficult to embed.

In [None]:
# Fine-tune all-MiniLM-L6-v2 with ContrastiveLoss (margin 0.3) unless its output folder already exists
folder = os.path.join(home_dir, './models/MiniLM-03')
flag = os.path.isdir(folder)  # isdir() is already False for a path that does not exist
if not flag:
  # This is one of the lighter but better sentence-transformers models.
  model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
  fit(model, tr_df, va_df,
      loss='ContrastiveLoss', epochs=7, margin=0.3,
      out_model=folder)

## MiniLM with Online contrastive loss

Let us fine tune the same models but applying in this case the Online Contrastive Loss.

In [None]:
# Fine-tune all-MiniLM-L6-v2 with OnlineContrastiveLoss (margin 0.5) unless its output folder already exists
folder = os.path.join(home_dir, './models/MiniLM-05-Online')
flag = os.path.isdir(folder)  # isdir() is already False for a path that does not exist
if not flag:
  # This is one of the lighter but better sentence-transformers models.
  model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
  fit(model, tr_df, va_df,
      loss='OnlineContrastiveLoss', epochs=7, margin=0.5,
      out_model=folder)

As we did before, we will also modify the value of the margin.

In [None]:
# Fine-tune all-MiniLM-L6-v2 with OnlineContrastiveLoss (margin 0.3) unless its output folder already exists
folder = os.path.join(home_dir, './models/MiniLM-03-Online')
flag = os.path.isdir(folder)  # isdir() is already False for a path that does not exist
if not flag:
  # This is one of the lighter but better sentence-transformers models.
  model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
  fit(model, tr_df, va_df,
      loss='OnlineContrastiveLoss', epochs=7, margin=0.3,
      out_model=folder)

## Distilbert Quora

We found a Distilbert model that was pretrained on a custom Quora dataset. We will fine-tune this model to see whether choosing a more task-specific model gives better results. This model maps sentences to a 768-dimensional vector space.

More information: [Quora Distilbert Base](https://huggingface.co/sentence-transformers/quora-distilbert-base)



## Baseline model

Note that since this is a publicly available model that we did not train, and we only use it as a baseline, it is not necessary to save it: whenever we want to use it, we simply load it. However, I reference it in this code so that it is clear that it will be used later.

In [None]:
# distilbert_quora = SentenceTransformer("sentence-transformers/quora-distilbert-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Fine-tuning the baseline model

To see how much the model performance can improve if you fine-tune it, let us fine tune this model to the datasets we are using.

In [None]:
# folder = os.path.join(home_dir, './models/distilbert-quora-finetuned-new')
# flag = os.path.exists(folder) and os.path.isdir(folder)
# if not flag:
#   distilbert_quora = SentenceTransformer("sentence-transformers/quora-distilbert-base")
#   fit (distilbert_quora, tr_df, va_df, epochs=7, out_model = folder)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/540 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/490 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  self.pid = os.fork()


Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2281 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2281 [00:00<?, ?it/s]

  self.pid = os.fork()


Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2281 [00:00<?, ?it/s]

  self.pid = os.fork()


Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2281 [00:00<?, ?it/s]

  self.pid = os.fork()


Batches:   0%|          | 0/229 [00:00<?, ?it/s]

KeyboardInterrupt: 

I wish I could finish its training, but since the GPU was almost full, the model could not save the results, so I decided to interrupt it and just show the pretrained model on reproduce_results, and write there the final conclusions.