# IMPORT PACKAGES

In [1]:
import pandas as pd
import random
from transformers import BertTokenizer, BertModel
import torch
import string
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import torchtext.vocab as vocab
from torch import nn

# LA PREMIERE PARTIE: data preparation

In [2]:
# Load the dataset
train_data = pd.read_excel('/kaggle/input/tweets-datasetsxlsx/test.xlsx', nrows=200)
test_data = pd.read_excel('/kaggle/input/tweets-datasetsxlsx/test.xlsx', nrows=200)

In [3]:
# Fonction pour créer des paires de tweets aléatoires
def create_random_tweet_pairs(data, num_pairs):
    tweet_pairs = []
    tweet_indices = list(range(len(data)))
    for _ in range(num_pairs):
        i, j = random.sample(tweet_indices, 2)
        tweet_pairs.append((data.iloc[i]['text'], data.iloc[j]['text']))
    return tweet_pairs


# Fonction pour étiqueter les paires de tweets en fonction de la similarité des utilisateurs
def label_tweet_pairs(tweet_pairs, data):
    labeled_pairs = []
    for pair in tweet_pairs:
        tweet1, tweet2 = pair
        user1 = data[data['text'] == tweet1]['user'].iloc[0]
        user2 = data[data['text'] == tweet2]['user'].iloc[0]
        similarity_label = 1 if user1 == user2 else 0
        labeled_pairs.append((tweet1, tweet2, user1, user2, similarity_label))
    return labeled_pairs

In [4]:
# Créer des paires de tweets pour l'entraînement et les tests
train_tweet_pairs = create_random_tweet_pairs(train_data, 100)
test_tweet_pairs = create_random_tweet_pairs(test_data, 50)

# Étiqueter les paires de tweets pour l'entraînement et les tests
labeled_train_pairs = label_tweet_pairs(train_tweet_pairs, train_data)
labeled_test_pairs = label_tweet_pairs(test_tweet_pairs, test_data)

# Convertir les paires étiquetées en DataFrame pour une manipulation facile
train_df = pd.DataFrame(labeled_train_pairs, columns=['text1', 'text2', 'user1', 'user2', 'label'])
test_df = pd.DataFrame(labeled_test_pairs, columns=['text1', 'text2', 'user1', 'user2', 'label'])

# Enregistrer les données étiquetées dans de nouveaux fichiers Excel
train_df.to_excel('train_labeled.xlsx', index=False)
test_df.to_excel('test_labeled.xlsx', index=False)

# Afficher un exemple de paire de tweets étiquetée
print("Exemple de paire de tweets étiquetée pour les données d'entraînement:")
print(train_df.head())

Exemple de paire de tweets étiquetée pour les données d'entraînement:
                                               text1  \
0  We need your support to win the #PepsiRefresh ...   
1  on my knees just praying for my Caribbean peop...   
2  @joanneijoanna I said those r criminal thought...   
3  @nofloodshooker ART IS THE OnLY ReAL class pay...   
4  Swimming with Sharks is one of my favorite thi...   

                                               text2     user1     user2  \
0  #ESQUIREuk #ukNavy #CoverStory http://instagra...   rihanna   rihanna   
1  #glamour #ratedRera http://instagram.com/p/a8g...   rihanna   rihanna   
2                          I'm in LONDON bitchesssss  ladygaga   rihanna   
3  @BarackObama thanku for the support you are se...  ladygaga  ladygaga   
4  He want dat... #CAKE RT @EvelynLozada: @rihann...   rihanna   rihanna   

   label  
0      1  
1      1  
2      0  
3      1  
4      1  


# LA DEUXIEME PARTIE: data preprocessing

In [5]:
# Initialiser le tokenizer pour les tweets
tokenizer = TweetTokenizer()
# Initialiser le stemmer
stemmer = PorterStemmer()

In [6]:
# Fonction de nettoyage du texte
def clean_text(text):
    # Convertir en minuscules
    text = text.lower()
    # Supprimer la ponctuation, sauf les hashtags et mentions
    text = ''.join([char for char in text if char not in string.punctuation or char in ['#', '@']])
    # Tokenization
    tokens = tokenizer.tokenize(text)
    # Supprimer les mots vides (stop words)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    # Reformer le texte à partir des tokens
    cleaned_text = ' '.join(tokens)
    return cleaned_text

In [7]:
# Appliquer la fonction de nettoyage aux données d'entraînement
train_data['cleaned_text'] = train_data['text'].apply(clean_text)
# Appliquer la fonction de nettoyage aux données de test
test_data['cleaned_text'] = test_data['text'].apply(clean_text)

# Afficher les données après nettoyage
print("Exemple de données d'entraînement après nettoyage:")
print(train_data[['text', 'cleaned_text']].head())

Exemple de données d'entraînement après nettoyage:
                                                text  \
0  @BarackObama thanku for the support you are se...   
1  The first time Tom Ford and Nick Knight worked...   
2  I feel absolutely fabulous.pic.twitter.com/NZC...   
3  #BraveCharlie bornthiswayfoundation, an opport...   
4  Chipmunk Cheeks   Wisdom Teeth out before tour...   

                                        cleaned_text  
0  @barackobama thanku support send mother time v...  
1  first time tom ford nick knight work togeth go...  
2        feel absolut fabulouspictwittercomnzcb 9zzn  
3  #bravecharli bornthiswayfound opportun peopl w...  
4  chipmunk cheek wisdom teeth tour cant eat im g...  


# LA TROISIEME PARTIE:

In [8]:
# Embedding Layer
# Download and load the pre-trained Word2Vec embeddings using torchtext
word2vec = vocab.GloVe(name='6B', dim=300)  # You can specify other dimensions if needed


# Function to get the embeddings of each word in a text
def get_text_embeddings(text, model):
    tokens = text.split()
    embeddings = []
    for token in tokens:
        if token in model.stoi:
            embeddings.append(model.vectors[model.stoi[token]].tolist())  # Convert tensor to list
    if len(embeddings) == 0:
        # If no word has an embedding, return a list of zeros
        return [[0.0] * model.vectors.size(1)]
    return embeddings

.vector_cache/glove.6B.zip: 862MB [02:41, 5.35MB/s]                               
100%|█████████▉| 399999/400000 [01:07<00:00, 5884.06it/s]


In [9]:
# Apply the function to get the embeddings on the training and test data
train_data['text_embeddings'] = train_data['cleaned_text'].apply(lambda x: get_text_embeddings(x, word2vec))
test_data['text_embeddings'] = test_data['cleaned_text'].apply(lambda x: get_text_embeddings(x, word2vec))

# Save the data with embeddings to new Excel files
train_data.to_excel('train_embeddings.xlsx', index=False)
test_data.to_excel('test_embeddings.xlsx', index=False)

# Example of displaying the data after adding the embeddings
print("Example of training data with embeddings:")
print(train_data[['cleaned_text', 'text_embeddings']].head())

Example of training data with embeddings:
                                        cleaned_text  \
0  @barackobama thanku support send mother time v...   
1  first time tom ford nick knight work togeth go...   
2        feel absolut fabulouspictwittercomnzcb 9zzn   
3  #bravecharli bornthiswayfound opportun peopl w...   
4  chipmunk cheek wisdom teeth tour cant eat im g...   

                                     text_embeddings  
0  [[-0.28084999322891235, -0.23964999616146088, ...  
1  [[-0.15560999512672424, 0.5006899833679199, -0...  
2  [[0.19787000119686127, 0.10199999809265137, 0....  
3  [[-0.2583099901676178, 0.43643999099731445, -0...  
4  [[0.4510299861431122, 0.20100000500679016, 0.1...  


In [10]:
#partiiiie 3.2
# Load pre-trained transformer model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")


# Function to get embeddings using transformer encoder
def get_transformer_embeddings(text, model, tokenizer):
    input_ids = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(input_ids)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().tolist()  # Convert tensor to list
    return embeddings


# Apply the function to get embeddings on the training and test data
train_data['text_embeddings'] = train_data['cleaned_text'].apply(
    lambda x: get_transformer_embeddings(x, model, tokenizer))
test_data['text_embeddings'] = test_data['cleaned_text'].apply(
    lambda x: get_transformer_embeddings(x, model, tokenizer))

# Save the data with embeddings to new Excel files
train_data.to_excel('train_transformer_embeddings.xlsx', index=False)
test_data.to_excel('test_transformer_embeddings.xlsx', index=False)

print("Exemple de données d'entraînement avec embeddings BERT :")
print(train_data[['cleaned_text', 'text_embeddings']].head())

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Exemple de données d'entraînement avec embeddings BERT :
                                        cleaned_text  \
0  @barackobama thanku support send mother time v...   
1  first time tom ford nick knight work togeth go...   
2        feel absolut fabulouspictwittercomnzcb 9zzn   
3  #bravecharli bornthiswayfound opportun peopl w...   
4  chipmunk cheek wisdom teeth tour cant eat im g...   

                                     text_embeddings  
0  [0.1966565102338791, -0.12563742697238922, 0.6...  
1  [-0.0633881464600563, 0.1929583102464676, 0.59...  
2  [0.01856006495654583, 0.018714340403676033, 1....  
3  [0.24664439260959625, 0.09449848532676697, 0.6...  
4  [0.2095598727464676, 0.1293947845697403, 0.575...  


In [11]:
#partieeeee3.3
# Feature Extraction
# Define a function to extract features from the transformer encoder output
def extract_features(tweet_embeddings):
    # Convert list of embeddings to tensor
    tweet_embeddings_tensor = torch.tensor(tweet_embeddings)
    # Convert the tensor to float type
    tweet_embeddings_tensor = tweet_embeddings_tensor.float()
    # Return the tweet embeddings tensor
    return tweet_embeddings_tensor


# Apply the function to extract features on the training and test data
train_data['features'] = train_data['text_embeddings'].apply(extract_features)
test_data['features'] = test_data['text_embeddings'].apply(extract_features)

# Example of displaying the data after feature extraction
print("Example of training data with extracted features:")
print(train_data[['cleaned_text', 'features']].head())

Example of training data with extracted features:
                                        cleaned_text  \
0  @barackobama thanku support send mother time v...   
1  first time tom ford nick knight work togeth go...   
2        feel absolut fabulouspictwittercomnzcb 9zzn   
3  #bravecharli bornthiswayfound opportun peopl w...   
4  chipmunk cheek wisdom teeth tour cant eat im g...   

                                            features  
0  [tensor(0.1967), tensor(-0.1256), tensor(0.690...  
1  [tensor(-0.0634), tensor(0.1930), tensor(0.597...  
2  [tensor(0.0186), tensor(0.0187), tensor(1.0780...  
3  [tensor(0.2466), tensor(0.0945), tensor(0.6304...  
4  [tensor(0.2096), tensor(0.1294), tensor(0.5754...  


In [12]:
#partieeeee3.4
# Manhattan Distance Calculation
# Define a function to calculate the Manhattan distance between two tweet representations
def calculate_manhattan_distance(features1, features2):
    # Ensure both features have the same shape
    if features1.shape != features2.shape:
        # Broadcast features to a common shape
        max_shape = torch.tensor([max(s1, s2) for s1, s2 in zip(features1.shape, features2.shape)])
        features1 = features1.expand(max_shape)
        features2 = features2.expand(max_shape)

    # Calculate the absolute difference between the features
    absolute_difference = torch.abs(features1 - features2)

    # Sum the absolute differences along the appropriate dimensions
    if len(absolute_difference.shape) > 1:
        # If the tensor has more than one dimension, sum along axis 1
        manhattan_distance = torch.sum(absolute_difference, dim=1)
    else:
        # If the tensor has only one dimension, sum directly
        manhattan_distance = absolute_difference

    # Return the Manhattan distance
    return manhattan_distance


# Apply the function to calculate Manhattan distance on the training and test data
train_data['manhattan_distance'] = train_data.apply(
    lambda row: calculate_manhattan_distance(row['features'][0], row['features'][1]), axis=1)
test_data['manhattan_distance'] = test_data.apply(
    lambda row: calculate_manhattan_distance(row['features'][0], row['features'][1]), axis=1)

# Example of displaying the data after Manhattan distance calculation
print("Example of training data with Manhattan distance:")
print(train_data[['cleaned_text', 'manhattan_distance']].head())

Example of training data with Manhattan distance:
                                        cleaned_text manhattan_distance
0  @barackobama thanku support send mother time v...     tensor(0.3223)
1  first time tom ford nick knight work togeth go...     tensor(0.2563)
2        feel absolut fabulouspictwittercomnzcb 9zzn     tensor(0.0002)
3  #bravecharli bornthiswayfound opportun peopl w...     tensor(0.1521)
4  chipmunk cheek wisdom teeth tour cant eat im g...     tensor(0.0802)


In [13]:
#partieeeeee3.5
# Define a function to add a dense layer with sigmoid activation
class DenseLayer(nn.Module):
    def __init__(self, input_dim):
        super(DenseLayer, self).__init__()
        self.dense = nn.Linear(input_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.dense(x)
        x = self.sigmoid(x)
        return x

# Apply the function to calculate Manhattan distance on the training and test data
train_data['manhattan_distance'] = train_data.apply(
    lambda row: calculate_manhattan_distance(row['features'][0], row['features'][1]), axis=1)
test_data['manhattan_distance'] = test_data.apply(
    lambda row: calculate_manhattan_distance(row['features'][0], row['features'][1]), axis=1)

# Calculate the input dimension for the dense layer
input_dim = train_data['manhattan_distance'].apply(lambda x: x.shape[0] if len(x.shape) > 0 else 1).iloc[0]

# Check if the input dimension is valid
if input_dim > 0:
    # Initialize the dense layer
    dense_layer = DenseLayer(input_dim)

    # Apply the dense layer to get the similarity score on the training and test data
    train_data['similarity_score'] = train_data['manhattan_distance'].apply(lambda x: dense_layer(x) if len(x.shape) > 0 else dense_layer(torch.zeros(1)))
    test_data['similarity_score'] = test_data['manhattan_distance'].apply(lambda x: dense_layer(x) if len(x.shape) > 0 else dense_layer(torch.zeros(1)))

    # Example of displaying the data after adding similarity scores
    print("Example of training data with similarity scores:")
    print(train_data[['cleaned_text', 'similarity_score']].head())
else:
    print("No valid input dimension found. Check the 'manhattan_distance' column.")

Example of training data with similarity scores:
                                        cleaned_text  \
0  @barackobama thanku support send mother time v...   
1  first time tom ford nick knight work togeth go...   
2        feel absolut fabulouspictwittercomnzcb 9zzn   
3  #bravecharli bornthiswayfound opportun peopl w...   
4  chipmunk cheek wisdom teeth tour cant eat im g...   

                              similarity_score  
0  [tensor(0.6356, grad_fn=<UnbindBackward0>)]  
1  [tensor(0.6356, grad_fn=<UnbindBackward0>)]  
2  [tensor(0.6356, grad_fn=<UnbindBackward0>)]  
3  [tensor(0.6356, grad_fn=<UnbindBackward0>)]  
4  [tensor(0.6356, grad_fn=<UnbindBackward0>)]  


# LA QUATRIEME PARTIE: evaluation

In [14]:
def evaluate(tweet_pairs, true_labels):
    true_positives = sum(1 for pair, label in zip(tweet_pairs, true_labels) if label == 1)
    false_positives = sum(1 for pair, label in zip(tweet_pairs, true_labels) if label == 0)
    false_negatives = sum(1 for pair, label in zip(tweet_pairs, true_labels) if label == 1)

    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives != 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0

    return precision, recall, f1_score

In [15]:
true_labels = [pair[4] for pair in labeled_test_pairs]  # Extracting true labels from labeled_test_pairs
precision, recall, f1_score = evaluate(labeled_test_pairs, true_labels)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Precision: 0.6
Recall: 0.5
F1 Score: 0.5454545454545454
