In [None]:
!pip install gensim==4.2.0
!pip install texthero==1.0.5
!pip install unidecode

**Importation des librairies**

In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re
from bs4 import BeautifulSoup
import texthero as hero
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from transformers import BertTokenizer, BertModel
import torch


**Importation du Dataset Scraper**

In [None]:
df=pd.read_csv("/kaggle/input/scrapped-data/st_questions_data.csv")
df.head()

**Exploration des deux datasets**

In [None]:
df["OpenStatus"].unique()

In [None]:
print(df.shape)
print(df.isnull().sum())
df.describe(include="all")

In [None]:

df.OpenStatus.value_counts()

**Let's import an existing dataset from Kaggle, released in 2011**

In [None]:
data=pd.read_csv("/kaggle/input/kaggle-st-questions/train-sample.csv")

In [None]:
print(data.shape)
data.head()

In [None]:
print(data.isnull().sum())
data.describe(include="all")

In [None]:
data.OpenStatus.value_counts()

**Visualisation de la distribution des classes des deux datasets**

In [None]:
sns.countplot(data=df, x='OpenStatus',palette='viridis')
plt.title("Distribution des catégories Open status")
plt.xlabel("Statuts")
plt.ylabel("Nombre de questions")
plt.xticks(rotation=45)  # Rotation des labels pour plus de lisibilité
plt.show()

In [None]:
sns.countplot(data=data, x='OpenStatus',palette='viridis')
plt.title("Distribution des catégories Open status")
plt.xlabel("Statuts")
plt.ylabel("Nombre de questions")
plt.xticks(rotation=45)  # Rotation des labels pour plus de lisibilité
plt.show()

**Nettoyage du dataset scraper**

In [None]:
#Fonction de nettoyage de texte brute scrapper
def clean_text(text):
    """Strip markup and noise from one raw scraped question field.

    Steps, in order: drop ``<code>`` elements from the parsed HTML, flatten
    the remaining markup to plain text, remove fenced (triple-backtick) code,
    literal ``<code>...</code>`` spans, http(s) URLs, ``#hashtag`` and
    ``@mention`` tokens, then collapse runs of whitespace.

    Args:
        text: Raw HTML/Markdown string, or a missing value.

    Returns:
        The cleaned string. Missing values (per ``pd.isna``) are returned
        unchanged.
    """
    if pd.isna(text):  # leave missing values for the caller to handle
        return text

    soup = BeautifulSoup(text, "html.parser")
    # Code has to be removed while the tags still exist: get_text() throws the
    # markup away, after which a "<code>...</code>" regex has nothing to match.
    for code_tag in soup.find_all("code"):
        code_tag.decompose()
    plain = soup.get_text()

    # Fenced code blocks (triple backticks), possibly spanning several lines
    plain = re.sub(r"```.*?```", "", plain, flags=re.DOTALL)
    # Literal "<code>...</code>" text still present after parsing
    plain = re.sub(r"<code>.*?</code>", "", plain, flags=re.DOTALL)

    # URLs starting with http:// or https://
    plain = re.sub(r"http[s]?://\S+", "", plain)

    # Hashtag (#...) and mention (@...) tokens
    plain = re.sub(r"#\S+", "", plain)
    plain = re.sub(r"@\S+", "", plain)

    # Collapse whitespace runs and trim both ends
    return re.sub(r"\s+", " ", plain).strip()

In [None]:
#Application de la fonction pour nettoyer le datasets scrappe
df['Title']=df['Title'].apply(clean_text)
df['Body']=df['Body'].apply(clean_text)

In [None]:
df.head()

**Text mining avec la librairie texthero** : [https://texthero.org/](https://texthero.org/)

In [None]:
# # texthero Define a custom pipeline for preprocessing
# custom_pipeline = [
#     preprocessing.fillna,                   # Fill NaN values
#     preprocessing.lowercase,                # Convert to lowercase
#     preprocessing.remove_whitespace,        # Remove leading/trailing whitespace
#     preprocessing.remove_punctuation,       # Remove punctuation
#     preprocessing.remove_diacritics,        # Remove diacritics
# ]

**Redéfinition de `remove_punctuation` pour forcer `regex=True`, rendue nécessaire par la nouvelle version de pandas**

In [None]:
def remove_punctuation(input):
    """Replace each run of ASCII punctuation in a string Series with one space.

    Args:
        input: pandas Series of strings.

    Returns:
        A new Series where every maximal run of ``string.punctuation``
        characters has been replaced by a single space.
    """
    punctuation_class = "[" + re.escape(string.punctuation) + "]"
    punctuation_run = re.compile("(" + punctuation_class + ")+", re.UNICODE)
    # regex=True is passed explicitly instead of relying on the pandas default
    return input.str.replace(punctuation_run, " ", regex=True)

# Remplacer la fonction remove_punctuation de Texthero
hero.preprocessing.remove_punctuation = remove_punctuation

In [None]:
# kaggle dataset is tooo big, let's take just 8 k rows. 
# Sample 8k randomly! 
data1=data.sample(8000, random_state = 42)

In [None]:
data1.head()

In [None]:
# Merge the five per-question tag columns into a single comma-separated 'Tags' string.
tag_columns = ['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']

# Missing tags become '' so that they are skipped by the join below
data1[tag_columns] = data1[tag_columns].fillna('')

# Keep only the non-empty tags of each row, in column order
data1['Tags'] = data1[tag_columns].apply(
    lambda row: ', '.join(str(value) for value in row if value),
    axis=1,
)

In [None]:
data1.head()

In [None]:
#Nettoyer les tags combines pour plus de clarete
data1.drop(['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5'],axis=1,inplace=True)

**Text mining sur la colonne Title Body et Tags des questions des 2 datasets avec texthero**

In [None]:
data1['Title'] = hero.clean(data1['Title'])
data1['BodyMarkdown'] = hero.clean(data1['BodyMarkdown'])
data1['Tags'] = hero.clean(data1['Tags'])
df['Title'] = hero.clean(df['Title'])
df['Body'] = hero.clean(df['Body'])
df['Tags'] = hero.clean(df['Tags'])

In [None]:
df.head()

In [None]:
data1.head()

**Mapping des deux datasets : celui de Kaggle et celui du scraper**

In [None]:
#Mapper la colonne OpenStatus(target) des 2 datasets
mapping_col = {
    'open': 'open',
    'not a real question': 'Not suitable for this site',
    'off topic': 'Not suitable for this site',
    'not constructive': 'Needs details or clarity',
    'too localized': 'Needs more focus'
}

In [None]:
data1['OpenStatus'] = data1['OpenStatus'].map(mapping_col)
data1.head()

In [None]:
data1.OpenStatus.value_counts()

In [None]:
# Faire le mapping des colonnes existant dans les 2 datasets mais portant des noms differents 
data1 = data1.rename(columns={"ReputationAtPostCreation": "Reputation","BodyMarkdown":"Body","OwnerUndeletedAnswerCountAtPostTime":"AnswerCount"})
data1.head()

**Feature Engineering**

In [None]:
#Creation de la colonne Text_Length une idee sur la longueur du contenu de la question
df['BodyLength'] = df['Body'].apply(lambda x: len(x.split()))
data1['BodyLength'] = data1['Body'].apply(lambda x: len(x.split()))

**Concaténation des 2 datasets**

In [None]:
df_conc=pd.concat([df, data1], ignore_index=True)
df_conc.head()

In [None]:
#Suppression des variables non pertinentes 
df_conc.drop(['QuestionId','CreationDate','AcceptRate','IsAnswered','ViewCount','Score','LastActivityDate','PostId','PostCreationDate','OwnerCreationDate','PostClosedDate'],axis=1,inplace=True)
df_conc.head()

In [None]:
# Drop, in place, every row that has a missing value in ANY remaining column
# (dropna is called without `subset`), then report the new shape and confirm
# that no nulls are left.
# NOTE(review): the original note here said only the rows with a missing
# Reputation / OwnerUserId were targeted. Because the frame is a concat of two
# sources, a column present in only one of them would cause all rows of the
# other source to be dropped -- confirm this is intended, or pass `subset=`.
df_conc.dropna(inplace=True)
print(df_conc.shape)
df_conc.isnull().sum()

**Choix des Variables Pertinentes**

In [None]:
sns.countplot(data=df_conc, x='OpenStatus',palette='viridis')
plt.title("Distribution des catégories Open status")
plt.xlabel("Statuts")
plt.ylabel("Nombre de questions")
plt.xticks(rotation=45)  # Rotation des labels pour plus de lisibilité
plt.show()

**Visualisation des relations**

In [None]:
# Appliquer un style
sns.set_theme(style="whitegrid", palette="pastel")

# Définir une palette de couleurs
palette = sns.color_palette("Set2")

# Graphique 1 : Relation entre ReputationAtPostCreation et OpenStatus
plt.figure(figsize=(15, 8))
sns.boxplot(x="OpenStatus", y="Reputation", data=df_conc, palette=palette)
plt.title("Relation entre Reputation et OpenStatus", fontsize=16, fontweight="bold")
plt.ylabel("ReputationAtPostCreation", fontsize=14)
plt.xlabel("OpenStatus", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()


In [None]:
# Graphique 2 : Relation entre Text_Length et OpenStatus
plt.figure(figsize=(15, 8))
sns.boxplot(x="OpenStatus", y="BodyLength", data=df_conc, palette=palette)
plt.title("Relation entre BodyLength et OpenStatus", fontsize=16, fontweight="bold")
plt.ylabel("BodyLength", fontsize=14)
plt.xlabel("OpenStatus", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Graphique 2 : Relation entre AnswerCount et OpenStatus
plt.figure(figsize=(15, 8))
sns.boxplot(x="OpenStatus", y="AnswerCount", data=df_conc, palette=palette)
plt.title("Relation entre AnswerCount et OpenStatus", fontsize=16, fontweight="bold")
plt.ylabel("AnswerCount", fontsize=14)
plt.xlabel("OpenStatus", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

**Modeling**

**Codage de la variable cible**

In [None]:
label_id = {'open': 0, 'Not suitable for this site': 1, 'Needs details or clarity': 2, 'Needs more focus': 3, 'Duplicate': 4,'Opinion-based': 5}

In [None]:
df_conc["OpenStatus"]=df_conc["OpenStatus"].map(label_id)
df_conc.head()

In [None]:
df_conc.shape
df_conc.isnull().sum()

**Division de l'ensemble de données**

In [None]:
df_train,df_remain= train_test_split(df_conc,test_size=0.20, shuffle=True, random_state=42, stratify=df_conc['OpenStatus'])
df_val,df_test= train_test_split(df_remain,test_size=0.3333, shuffle=True, random_state=42, stratify=df_remain['OpenStatus'])

In [None]:
print(df_train.shape)
df_train.head()

In [None]:

def generate_bert_embeddings(df, text_column, bert_model_name="bert-base-uncased", batch_size=32):
    """Tokenize a text column and compute one BERT sentence embedding per row.

    Each embedding is the mean of the last hidden states over the real tokens
    of the text. Padding positions are excluded through the attention mask, so
    the embedding of a text does not depend on how long the other texts in the
    same call are. The model is run in mini-batches to bound peak memory.

    Args:
        df (pd.DataFrame): Input frame.
        text_column (str): Name of the column holding the texts.
        bert_model_name (str): Name of the pretrained BERT checkpoint.
        batch_size (int): Number of rows sent through the model at once.

    Returns:
        tuple: ``(tokens, embeddings)`` where ``tokens`` is the tokenizer
        output for the whole column (padded, truncated to 64 tokens) and
        ``embeddings`` is a float tensor with one row per input row.
    """
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    model = BertModel.from_pretrained(bert_model_name)
    model.eval()  # inference only: make sure dropout is disabled

    # Tokenize everything once (cheap); only the forward pass is batched
    tokens = tokenizer(list(df[text_column]),
                       padding=True, truncation=True, max_length=64, return_tensors="pt")

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, tokens["input_ids"].size(0), batch_size):
            batch = {name: tensor[start:start + batch_size] for name, tensor in tokens.items()}
            hidden = model(**batch).last_hidden_state

            # Masked mean: zero out padding positions, divide by the real token count
            mask = batch["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)

    embeddings = torch.cat(pooled_batches, dim=0)
    return tokens, embeddings
    

In [None]:
X_train=df_train[['Reputation','AnswerCount','BodyLength','Title','Body','Tags']]
y_train=df_train['OpenStatus']
X_val=df_val[['Reputation','AnswerCount','BodyLength','Title','Body','Tags']]
y_val=df_val['OpenStatus']
X_test=df_test[['Reputation','AnswerCount','BodyLength','Title','Body','Tags']]
y_test=df_test['OpenStatus']

In [None]:
_, title_embed_train = generate_bert_embeddings(X_train, "Title")
_, body_embed_train = generate_bert_embeddings(X_train, "Body")
_, tag_embed_train = generate_bert_embeddings(X_train, "Tags")

In [None]:
_, title_embed_val = generate_bert_embeddings(X_val, "Title")
_, body_embed_val = generate_bert_embeddings(X_val, "Body")
_, tag_embed_val = generate_bert_embeddings(X_val, "Tags")

In [None]:
_, title_embed_test = generate_bert_embeddings(X_test, "Title")
_, body_embed_test = generate_bert_embeddings(X_test, "Body")
_, tag_embed_test = generate_bert_embeddings(X_test, "Tags")

In [None]:
# Standardise the numeric features of the training split and append them to
# the text embeddings.
numeric_cols = ["Reputation", "BodyLength", "AnswerCount"]

# A single scaler fitted on all numeric columns at once, so the training
# statistics of every column stay available in `scaler` after this cell.
# The raw values are read from df_train, which keeps the cell idempotent.
scaler = StandardScaler().fit(df_train[numeric_cols])

# X_train is a column selection of df_train: copy it before assigning into it
X_train = X_train.copy()
X_train[numeric_cols] = scaler.transform(df_train[numeric_cols])

# One float32 column per numeric feature, in the order of numeric_cols
num_ten_comb_train = torch.tensor(X_train[numeric_cols].to_numpy(), dtype=torch.float32)

# Final feature matrix: title, body and tag embeddings followed by the numeric columns
comb_feat_train = torch.cat([title_embed_train, body_embed_train, tag_embed_train, num_ten_comb_train], dim=1)
comb_feat_train

In [None]:
# Standardise the numeric features of the validation split with statistics
# learned from the TRAINING split only. Fitting a scaler on the validation
# data leaks information and puts the two splits on different scales.
numeric_cols = ["Reputation", "BodyLength", "AnswerCount"]
scaler = StandardScaler().fit(df_train[numeric_cols])

# X_val is a column selection of df_val: copy it before assigning into it
X_val = X_val.copy()
X_val[numeric_cols] = scaler.transform(df_val[numeric_cols])

# One float32 column per numeric feature, in the order of numeric_cols
num_ten_val = torch.tensor(X_val[numeric_cols].to_numpy(), dtype=torch.float32)

# Final feature matrix: title, body and tag embeddings followed by the numeric columns
comb_feat_val = torch.cat([title_embed_val, body_embed_val, tag_embed_val, num_ten_val], dim=1)
comb_feat_val

In [None]:
# Standardise the numeric features of the test split with statistics learned
# from the TRAINING split only. Fitting a scaler on the test data leaks
# information and puts the splits on different scales.
numeric_cols = ["Reputation", "BodyLength", "AnswerCount"]
scaler = StandardScaler().fit(df_train[numeric_cols])

# X_test is a column selection of df_test: copy it before assigning into it
X_test = X_test.copy()
X_test[numeric_cols] = scaler.transform(df_test[numeric_cols])

# One float32 column per numeric feature, in the order of numeric_cols
num_ten_test = torch.tensor(X_test[numeric_cols].to_numpy(), dtype=torch.float32)

# Final feature matrix: title, body and tag embeddings followed by the numeric columns
comb_feat_test = torch.cat([title_embed_test, body_embed_test, tag_embed_test, num_ten_test], dim=1)
comb_feat_test

In [None]:
 # Fine tuning de l'algorithme Random Forest
param_grid = {
     'n_estimators': [50, 100, 200],
     'max_depth': [None, 10, 20],
     'min_samples_split': [2, 5, 10]
}

grid_rdf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy',n_jobs=-1)
grid_rdf.fit(comb_feat_train, y_train)

print("Meilleurs paramètres :", grid_rdf.best_params_)
print("Meilleure précision :", grid_rdf.best_score_)

In [None]:

# Evaluate the tuned Random Forest on the validation split.
# GridSearchCV (refit=True by default) has already refit best_estimator_ on
# the full training data passed to grid_rdf.fit, so no second fit is needed.
rf_model = grid_rdf.best_estimator_
print(rf_model)  # show the selected hyper-parameters

y_pred = rf_model.predict(comb_feat_val.numpy())
print("Random Forest Results:")
print(classification_report(y_val, y_pred))


In [None]:
 # Fine tuning de l'algorithme xgboost
 param_xgb = {
     'n_estimators': [50, 100, 200],
     'max_depth': [3, 5, 7],
     'learning_rate': [0.01, 0.1, 0.2],
     'subsample': [0.8, 1.0],
     'colsample_bytree': [0.8, 1.0]
}

 # Initialiser GridSearchCV
 grid_xgb = GridSearchCV(estimator=XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
                            param_grid=param_xgb,
                            scoring='accuracy',
                            cv=5,
                            n_jobs=-1)

 # Exécuter la recherche
 grid_xgb.fit(comb_feat_train, y_train)

 # Afficher les meilleurs hyperparamètres
 print("Meilleurs paramètres :", grid_xgb.best_params_)
 print("Meilleure précision :", grid_xgb.best_score_)

In [None]:
# Evaluate the tuned XGBoost model on the validation split.
# The fitted attribute is `best_estimator_` (trailing underscore); without it
# the lookup raises AttributeError. GridSearchCV has already refit that
# estimator on the full training data, so it is used as is.
xgb_model = grid_xgb.best_estimator_
print(xgb_model)  # show the selected hyper-parameters

y_pred_xgb = xgb_model.predict(comb_feat_val.numpy())
print("XgBoost Results:")
print(classification_report(y_val, y_pred_xgb))


In [None]:

# Plot the ten most important features of the XGBoost model.
# plot_importance creates its own figure unless it is given an axes, which
# would leave the sized figure empty; so the axes is created here and passed in.
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(xgb_model, max_num_features=10, ax=ax)
ax.set_title("Importance des caractéristiques")
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RobustCombinedNN(nn.Module):
    """Feed-forward classifier over the concatenated feature vector.

    Architecture: Linear -> BatchNorm -> ReLU -> Dropout(0.4) ->
    Linear -> BatchNorm -> ReLU -> Dropout(0.3) -> Linear.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it softmax outputs would normalise
    twice and flatten the gradients. The arg-max of the logits is identical to
    the arg-max of the probabilities; use ``predict_proba`` when actual
    probabilities are needed.

    Args:
        input_dim (int): Size of each input feature vector.
        hidden_dim (int): Width of the first hidden layer; the second hidden
            layer has ``hidden_dim // 2`` units.
        num_classes (int): Number of output classes.
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()

        # Fully connected stack with batch normalisation and dropout
        self.fc_layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim // 2, num_classes)  # one score per class
        )

    def forward(self, combined_input):
        """Return the raw class scores (logits), one row per input sample."""
        return self.fc_layers(combined_input)

    def predict_proba(self, combined_input):
        """Return class probabilities: softmax of the logits over the class axis."""
        return F.softmax(self.forward(combined_input), dim=1)


In [None]:
# Build the label tensors from the split frames rather than from y_* itself:
# overwriting y_* with a value derived from y_* makes the cell fail when it is
# run a second time (a tensor has no pandas `.values`).
y_train = torch.tensor(df_train['OpenStatus'].to_numpy(), dtype=torch.long)
y_val = torch.tensor(df_val['OpenStatus'].to_numpy(), dtype=torch.long)
y_test = torch.tensor(df_test['OpenStatus'].to_numpy(), dtype=torch.long)

In [None]:
input_dim = comb_feat_train.shape[1]
hidden_dim =2*input_dim
num_classes = 6
model = RobustCombinedNN(input_dim, hidden_dim, num_classes)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt


def _run_epoch(model, features, labels, criterion, batch_size, optimizer=None):
    """Run one pass over ``features``/``labels`` in consecutive mini-batches.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch. Without one it is put in evaluation mode and gradients are
    disabled.

    Args:
        model: Network mapping a batch of feature rows to per-class scores.
        features (torch.Tensor): Feature matrix, one row per sample.
        labels (torch.Tensor): Integer class label of each row.
        criterion: Loss taking ``(model output, labels)`` and returning the
            mean loss of the batch.
        batch_size (int): Number of rows per mini-batch.
        optimizer: Optimizer to step after each batch, or ``None`` to evaluate.

    Returns:
        tuple: ``(mean loss per sample, accuracy)`` over the processed samples.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    seen = 0
    with torch.set_grad_enabled(training):
        # Iterate over THIS split's own size so that no empty batch is produced
        for start in range(0, features.size(0), batch_size):
            batch_inputs = features[start:start + batch_size]
            batch_labels = labels[start:start + batch_size]

            # BatchNorm1d cannot compute batch statistics from a single sample
            # in training mode, so a trailing batch of size 1 is skipped.
            if training and batch_inputs.size(0) < 2:
                continue

            if training:
                optimizer.zero_grad()

            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_labels)

            if training:
                loss.backward()
                optimizer.step()

            # Weight the batch mean by its size so a short last batch counts correctly
            batch_count = batch_labels.size(0)
            loss_sum += loss.item() * batch_count
            correct += (outputs.argmax(dim=1) == batch_labels).sum().item()
            seen += batch_count

    if seen == 0:
        return float("nan"), float("nan")
    return loss_sum / seen, correct / seen


# Per-epoch history of losses and accuracies
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

# Hyper-parameters
epochs = 10
batch_size = 32
learning_rate = 0.005
weight_decay = 1e-4

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()

# Training loop: one optimisation pass over the training split, then one
# evaluation pass over the validation split, per epoch.
for epoch in range(epochs):
    average_train_loss, train_accuracy = _run_epoch(
        model, comb_feat_train, y_train, criterion, batch_size, optimizer
    )
    train_losses.append(average_train_loss)
    train_accuracies.append(train_accuracy)

    average_val_loss, val_accuracy = _run_epoch(
        model, comb_feat_val, y_val, criterion, batch_size
    )
    val_losses.append(average_val_loss)
    val_accuracies.append(val_accuracy)

    # Report the metrics of this epoch
    print(f"Époque {epoch + 1}, Perte Moyenne (Train): {average_train_loss:.4f}, Accuracy (Train): {train_accuracy:.4f}, "
          f"Perte Moyenne (Val): {average_val_loss:.4f}, Accuracy (Val): {val_accuracy:.4f}")


In [None]:
# Tracer les pertes et accuracies
plt.figure(figsize=(12, 5))

# Pertes
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Pertes Entraînement')
plt.plot(val_losses, label='Pertes Validation')
plt.xlabel("Époques")
plt.ylabel("Perte")
plt.title("Évolution des pertes")
plt.legend()

# Accuracy
plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label='Accuracy Entraînement')
plt.plot(val_accuracies, label='Accuracy Validation')
plt.xlabel("Époques")
plt.ylabel("Accuracy")
plt.title("Évolution des accuracy")
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Calcul de la matrice de confusion sur les données de test
model.eval()  # Mode évaluation
with torch.no_grad():
    test_probabilities = model(comb_feat_test)
    _, test_predictions = torch.max(test_probabilities, dim=1)
#Matrice de confusion sur les donnees de test
conf_matrix = confusion_matrix(y_test, test_predictions)
print("\nMatrice de confusion sur les données de test :\n", conf_matrix)
print("\nRapport de classification :\n", classification_report(y_test, test_predictions))

In [None]:
# Sauvegarder le meilleur modele pour les futurs predictions
xgb_model.save_model('model.json')