# Reading the data

In [1]:
import pandas as pd

### Read train data

In [2]:
# Load the training CSV into a DataFrame and preview its first five rows.
train_filename = "dataset/sts-b_train.csv"
data_train = pd.read_csv(filepath_or_buffer=train_filename)
data_train.head(n=5)

Unnamed: 0,sentence1,sentence2,score,id
0,A plane is taking off.,An air plane is taking off.,5.0,0
1,A man is playing a large flute.,A man is playing a flute.,3.8,1
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8,2
3,Three men are playing chess.,Two men are playing chess.,2.6,3
4,A man is playing the cello.,A man seated is playing the cello.,4.25,4


### Read test data

In [3]:
# Load the test CSV into a DataFrame and preview its first five rows.
test_filename = "dataset/sts-b_test.csv"
data_test = pd.read_csv(filepath_or_buffer=test_filename)
data_test.head(n=5)

Unnamed: 0,id,sentence1,sentence2
0,100000,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.
1,100001,A young child is riding a horse.,A child is riding a horse.
2,100002,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.
3,100003,A woman is playing the guitar.,A man is playing guitar.
4,100004,A woman is playing the flute.,A man is playing a flute.


# Preprocessing

### Lowercase all words

In [4]:
# Store a lowercased copy of each raw sentence column under a `pre_` name;
# the original `sentence1` / `sentence2` columns are left untouched.
data_train['pre_sentence1'], data_train['pre_sentence2'] = (
    data_train[column].str.lower() for column in ('sentence1', 'sentence2')
)

In [5]:
# Store a lowercased copy of each raw sentence column under a `pre_` name;
# the original `sentence1` / `sentence2` columns are left untouched.
data_test['pre_sentence1'], data_test['pre_sentence2'] = (
    data_test[column].str.lower() for column in ('sentence1', 'sentence2')
)

### Remove punctuation

In [6]:
import string

In [7]:
# Translation table that deletes every ASCII punctuation character:
# each code point maps to None, which str.translate treats as "drop".
table = {ord(symbol): None for symbol in string.punctuation}

In [8]:
# Strip punctuation from both preprocessed sentence columns using `table`.
data_train['pre_sentence1'], data_train['pre_sentence2'] = (
    data_train[column].str.translate(table)
    for column in ('pre_sentence1', 'pre_sentence2')
)

In [9]:
# Strip punctuation from both preprocessed sentence columns using `table`.
data_test['pre_sentence1'], data_test['pre_sentence2'] = (
    data_test[column].str.translate(table)
    for column in ('pre_sentence1', 'pre_sentence2')
)

### Tokenize

In [10]:
from nltk import word_tokenize

In [11]:
# Replace each preprocessed sentence string with its list of word tokens.
# The tokenizer is applied to the column itself (Series.apply) rather than
# row-wise over the whole frame, so pandas does not have to materialise a
# row Series for every sentence just to read one field.
data_train['pre_sentence1'] = data_train['pre_sentence1'].apply(word_tokenize)
data_train['pre_sentence2'] = data_train['pre_sentence2'].apply(word_tokenize)

In [12]:
# Replace each preprocessed sentence string with its list of word tokens.
# The tokenizer is applied to the column itself (Series.apply) rather than
# row-wise over the whole frame, so pandas does not have to materialise a
# row Series for every sentence just to read one field.
data_test['pre_sentence1'] = data_test['pre_sentence1'].apply(word_tokenize)
data_test['pre_sentence2'] = data_test['pre_sentence2'].apply(word_tokenize)

### Lemmatize

In [13]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [14]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``pos_tag``; the first letter of the
    resulting tag selects adjective (J), verb (V) or adverb (R). Nouns and
    every other tag fall back to ``wordnet.NOUN``.
    """
    first_letter = pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    return wordnet.NOUN

In [15]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(sentence):
    """Lemmatize every token of ``sentence``.

    Each token is passed to the shared ``lemmatizer`` together with the
    WordNet POS that ``get_wordnet_pos`` returns for that token.

    Args:
        sentence: iterable of word tokens.

    Returns:
        list with one lemma per input token, in the original order.
    """
    lemmas = []
    for token in sentence:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return lemmas

In [16]:
# Replace each token list with its lemmatized counterpart.
# lemmatize_text is applied to the column itself (Series.apply) instead of
# row-wise over the whole frame, which avoids building a row Series per
# sentence only to read a single field from it.
data_train['pre_sentence1'] = data_train['pre_sentence1'].apply(lemmatize_text)
data_train['pre_sentence2'] = data_train['pre_sentence2'].apply(lemmatize_text)

In [17]:
# Replace each token list with its lemmatized counterpart.
# lemmatize_text is applied to the column itself (Series.apply) instead of
# row-wise over the whole frame, which avoids building a row Series per
# sentence only to read a single field from it.
data_test['pre_sentence1'] = data_test['pre_sentence1'].apply(lemmatize_text)
data_test['pre_sentence2'] = data_test['pre_sentence2'].apply(lemmatize_text)

### Preprocessed corpus

In [18]:
import numpy as np

In [19]:
# Stack all sentence1 entries followed by all sentence2 entries, so pair i
# sits at positions i and len(data_train) + i of each corpus.
train_corpus = np.concatenate(
    (data_train["sentence1"].values, data_train["sentence2"].values)
)
# Same layout for the preprocessed token lists, re-joined into
# space-separated strings.
pre_train_corpus = [
    ' '.join(tokens)
    for tokens in np.concatenate(
        (data_train["pre_sentence1"].values, data_train["pre_sentence2"].values)
    )
]

In [20]:
# Stack all sentence1 entries followed by all sentence2 entries, so pair i
# sits at positions i and len(data_test) + i of each corpus.
test_corpus = np.concatenate(
    (data_test["sentence1"].values, data_test["sentence2"].values)
)
# Same layout for the preprocessed token lists, re-joined into
# space-separated strings.
pre_test_corpus = [
    ' '.join(tokens)
    for tokens in np.concatenate(
        (data_test["pre_sentence1"].values, data_test["pre_sentence2"].values)
    )
]

# Cosine similarity

In [21]:
from scipy.spatial.distance import cosine

In [22]:
def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two 1-D vectors.

    Computed as ``1 - scipy.spatial.distance.cosine(embedding1, embedding2)``.
    Cosine similarity is undefined when either vector is all zeros (its norm
    is 0), so that case returns ``0.0`` instead of the NaN that a division by
    a zero norm would produce.

    Args:
        embedding1: first vector (array-like of numbers).
        embedding2: second vector, same length as ``embedding1``.

    Returns:
        The cosine similarity, or ``0.0`` if either vector is all zeros.
    """
    if not np.any(embedding1) or not np.any(embedding2):
        return 0.0
    return 1 - cosine(embedding1, embedding2)

# Create TF-IDF matrix

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# Fit TF-IDF (English stop words removed) on the preprocessed training corpus.
# toarray() yields the dense 2-D ndarray directly, avoiding the intermediate
# np.matrix that todense() creates and the extra copy made by np.array().
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(pre_train_corpus).toarray()

# Calculate similarity (train)

In [25]:
# Number of training pairs; also the offset of the sentence2 rows in
# tfidf_matrix, since the corpus lists every sentence1 before any sentence2.
NUMBER_SENTENCES = data_train.shape[0]

In [26]:
# Empty results frame for the training set; its 'score' and 'scoreTFIDF'
# columns are assigned further down.
resultats_train = pd.DataFrame(columns=['score', 'scoreTFIDF'])

In [27]:
# Copy the `score` column of the training data so it sits next to the
# TF-IDF score in the results frame.
resultats_train['score'] = data_train['score']

In [28]:
# TF-IDF
# Cosine similarity between the two sentences of every training pair,
# multiplied by 5.0 (presumably to match the range of the `score` column;
# confirm). Row `position` of tfidf_matrix is sentence1 of a pair and row
# `NUMBER_SENTENCES + position` is its sentence2.
#
# Pairs are addressed by position rather than by index label, and the
# all-zero check uses ndarray.any() instead of the builtin sum(), which
# would walk every vocabulary entry in Python twice per pair. A sentence
# whose vector is all zeros (e.g. only stop words) has no defined cosine
# similarity, so such pairs are scored 0.
resultats_train["scoreTFIDF"] = pd.Series(
    [
        5.0 * calculate_cosine_similarity(
            tfidf_matrix[position], tfidf_matrix[NUMBER_SENTENCES + position]
        )
        if tfidf_matrix[position].any()
        and tfidf_matrix[NUMBER_SENTENCES + position].any()
        else 0.0
        for position in range(NUMBER_SENTENCES)
    ],
    # Keep the training frame's index so the column aligns with `score`.
    index=data_train.index,
)

In [29]:
# Preview the first rows of the training results.
resultats_train.head()

Unnamed: 0,score,scoreTFIDF
0,5.0,3.579088
1,3.8,4.165823
2,3.8,4.310364
3,2.6,5.0
4,4.25,4.100541


# Spearman correlation (train)

In [30]:
from scipy.stats import spearmanr

In [31]:
# Spearman rank correlation between the TF-IDF scores and the `score` column
# of the training results.
score = resultats_train['score'].tolist()
scoreTFIDF = resultats_train['scoreTFIDF'].tolist()

spearman_tfidf = spearmanr(scoreTFIDF, score)
print("Score pour TF-IDF : " + str(spearman_tfidf[0]))

Score pour TF-IDF : 0.7100237483656321


# Calculate similarity (test)

In [32]:
# NOTE(review): fit_transform refits `vectorizer` on the test corpus, so the
# vocabulary and IDF weights learned from the training corpus are replaced.
# Use vectorizer.transform(...) instead if the test vectors are meant to live
# in the training feature space; confirm the intended behavior.
#
# toarray() yields the dense 2-D ndarray directly, avoiding the intermediate
# np.matrix that todense() creates and the extra copy made by np.array().
tfidf_matrix_test = vectorizer.fit_transform(pre_test_corpus).toarray()

In [33]:
# Number of test pairs; also the offset of the sentence2 rows in
# tfidf_matrix_test, since the corpus lists every sentence1 before any
# sentence2.
NUMBER_SENTENCES_TEST = data_test.shape[0]

In [34]:
# Empty results frame for the test set; its 'score' and 'id' columns are
# assigned further down.
resultat = pd.DataFrame(columns=['id', 'score'])

In [35]:
# Cosine similarity between the two sentences of every test pair, multiplied
# by 5.0 (presumably to match the range of the training `score` column;
# confirm). Row `position` of tfidf_matrix_test is sentence1 of a pair and
# row `NUMBER_SENTENCES_TEST + position` is its sentence2.
#
# Pairs are addressed by position rather than by index label, and the
# all-zero check uses ndarray.any() instead of the builtin sum(), which
# would walk every vocabulary entry in Python twice per pair. A sentence
# whose vector is all zeros (e.g. only stop words) has no defined cosine
# similarity, so such pairs are scored 0.
resultat['score'] = pd.Series(
    [
        5.0 * calculate_cosine_similarity(
            tfidf_matrix_test[position],
            tfidf_matrix_test[NUMBER_SENTENCES_TEST + position],
        )
        if tfidf_matrix_test[position].any()
        and tfidf_matrix_test[NUMBER_SENTENCES_TEST + position].any()
        else 0.0
        for position in range(NUMBER_SENTENCES_TEST)
    ],
    # Keep the test frame's index so the later `id` assignment aligns.
    index=data_test.index,
)

In [36]:
# Attach the test pair ids to the computed scores, then preview the result.
resultat['id'] = data_test['id']

resultat.head()

Unnamed: 0,id,score
0,100000,4.427079
1,100001,4.33258
2,100002,5.0
3,100003,3.906691
4,100004,4.145747


In [37]:
import os

# Save the test scores to CSV without the index column. The target directory
# is created first because DataFrame.to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
output_path = './results/Baseline_TFIDF_2.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
resultat.to_csv(output_path, index=False)