In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


In [None]:
# Notebook-wide pandas display settings: cap printed rows at 50 and lift the
# limits on column count, total width and per-cell width.
for option_name, option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the questions dataset and preview its first rows.
df = pd.read_csv('db/questions_db.csv')
# head must be *called*: printing the bare attribute only shows the bound-method repr.
print(df.head())

In [None]:
def clean_text(text):
    """Normalize a raw text field: drop HTML tags and punctuation, lowercase.

    Tags and every character that is neither alphanumeric (ASCII) nor
    whitespace are replaced by a space rather than deleted, so that words
    separated only by markup or punctuation (``<p>a</p><p>b</p>``, ``a.b``)
    are not glued together. Runs of whitespace are then collapsed.

    Args:
        text: The value to clean. Anything that is not a ``str`` (e.g. NaN for
            a missing cell) is returned unchanged.

    Returns:
        The cleaned, lower-cased string, or ``text`` itself when it is not a
        string.
    """
    if not isinstance(text, str):
        return text
    # `[^>]*` (unlike `.*?`) also matches newlines, so tags spanning lines are removed.
    text = re.sub(r'<[^>]*>', ' ', text)
    # Replace remaining special characters with a separator instead of nothing.
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # Lowercase and collapse the whitespace introduced by the substitutions.
    return ' '.join(text.lower().split())


def process_tags(tags):
    """Parse a ``<tag1><tag2>`` string into a list of tag names.

    Args:
        tags: Raw tag string, an already-parsed list/tuple of tags, or any
            other value (e.g. NaN for a missing cell).

    Returns:
        A list of non-empty tag names. Lists/tuples are returned as a list
        copy so that re-running the parsing step on an already-parsed column
        does not wipe the tags; any other non-string value gives ``[]``.
    """
    if isinstance(tags, (list, tuple)):
        # Already parsed: keep the tags instead of discarding them.
        return list(tags)
    if not isinstance(tags, str):
        return []
    inner = tags.strip().strip('<>')
    # Filter empty pieces so '' or '<>' yields [] rather than [''].
    return [tag for tag in inner.split('><') if tag]

# Parse the raw tag string of every row into a list of tag names.
df['Tags'] = df['Tags'].apply(process_tags)

# Clean the free-text columns.
for text_column in ('Title', 'Body'):
    df[text_column] = df[text_column].apply(clean_text)

print(df.head())


In [None]:
def check_missing_values(df):
    """Print the number of null values per column of ``df``.

    The full per-column count is always printed; it is followed either by the
    subset of columns that have at least one null value or by a message
    stating that none were found. Nothing is returned.
    """
    null_counts = df.isnull().sum()
    print("Nombre de valeurs manquantes par colonne:")
    print(null_counts)

    if not null_counts.any():
        print("Aucune valeur manquante trouvée.")
        return

    print("\nColonnes avec valeurs manquantes:")
    print(null_counts[null_counts > 0])

# Report the null counts of the frame.
check_missing_values(df)

In [None]:

# Number of most frequent tags shown in the chart; the title is derived from
# it so the two can no longer disagree.
TOP_N_TAGS = 100

# Flatten the per-question tag lists and count each tag.
all_tags = [tag for tags in df['Tags'] for tag in tags]
tag_counts = pd.Series(all_tags).value_counts().head(TOP_N_TAGS)

plt.figure(figsize=(12, 18))
sns.barplot(x=tag_counts.values, y=tag_counts.index)
plt.title(f'Top {TOP_N_TAGS} des Tags les plus fréquents')
plt.xlabel('Fréquence')
plt.ylabel('Tags')
plt.show()

In [None]:
def check_html_special_chars(text):
    """Count HTML-like tags plus special characters found in ``text``.

    The result is the number of ``<...>`` matches added to the number of
    characters that are neither ASCII alphanumeric nor whitespace. The angle
    brackets of a tag are themselves special characters, so a tag contributes
    to both counts. Non-string input gives ``0``.
    """
    if not isinstance(text, str):
        return 0
    tag_matches = re.findall(r'<.*?>', text)
    symbol_matches = re.findall(r'[^a-zA-Z0-9\s]', text)
    return sum(map(len, (tag_matches, symbol_matches)))

# Count leftover HTML tags / special characters in each cleaned text column.
for text_column in ('Title', 'Body'):
    df[f'{text_column}_html_special'] = df[text_column].apply(check_html_special_chars)

# Show how the leftover counts are distributed for each column.
for heading, count_column in (
    ("Occurrences de caractères HTML ou spéciaux restants dans les titres", 'Title_html_special'),
    ("Occurrences de caractères HTML ou spéciaux restants dans les corps", 'Body_html_special'),
):
    print(heading)
    print(df[count_column].value_counts())

# Inspect the parsed tag lists.
print("Vérification des tags")
print(df['Tags'].head(10))

# Print the frame itself; how many rows are shown depends on the pandas
# display options in effect.
print(df)

In [None]:
def add_features(df):
    """Add size features to ``df`` in place and return it.

    Columns written:
        Title_length: number of whitespace-separated words in ``Title``.
        Body_word_count: number of whitespace-separated words in ``Body``.
        Tag_count: number of entries in the ``Tags`` list.

    Values of an unexpected type (non-string text, non-list tags) count as 0.
    """
    def _word_count(value):
        return len(value.split()) if isinstance(value, str) else 0

    def _list_size(value):
        return len(value) if isinstance(value, list) else 0

    df['Title_length'] = df['Title'].apply(_word_count)
    df['Body_word_count'] = df['Body'].apply(_word_count)
    df['Tag_count'] = df['Tags'].apply(_list_size)
    return df

# Add the extra size features and preview the first ten rows.
df = add_features(df)
print(df.head(10))


In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()


In [None]:
# Drop the FavoriteCount column. errors='ignore' keeps this cell re-runnable:
# without it a second execution (or a dataset lacking the column) raises KeyError.
df = df.drop(columns=['FavoriteCount'], errors='ignore')


## Analyse univariée et multivariée


In [None]:
# Columns fed to the univariate/multivariate analysis and the correlation matrix.
# Title_length, Body_word_count and Tag_count are the columns written by add_features.
cols_to_analyze = ['Score', 'ViewCount', 'AnswerCount', 'Title_length', 'Body_word_count', 'Tag_count']
# Column names for the score / view / favorite distribution plots.
score_col = 'Score'
view_col = 'ViewCount'
# NOTE(review): 'FavoriteCount' is dropped from df in an earlier cell, so this
# name no longer matches a column of df at this point.
favorite_col = 'FavoriteCount'

In [None]:
def analyse_univariee_multivariee(df, cols):
    """Plot per-column distributions, then a pairplot of the numeric columns.

    Args:
        df: Frame holding the data to plot.
        cols: Column names to analyse. Names missing from ``df`` are skipped
            in both stages (previously only the histogram loop skipped them
            and the pairplot selection raised ``KeyError``).

    Returns:
        None. Figures are shown with ``plt.show()``.
    """
    # Resolve the usable columns once so both stages agree on them.
    available = [column for column in cols if column in df.columns]

    print("Analyse Univariée")
    for column in available:
        plt.figure(figsize=(10, 6))
        sns.histplot(df[column], kde=True)
        plt.title(f'Distribution de {column}')
        plt.xlabel(column)
        plt.ylabel('Fréquence')
        plt.show()

    print("Analyse Multivariée")
    numeric = df[available].select_dtypes(include=['int64', 'float64'])
    if numeric.shape[1] == 0:
        # Nothing to pair-plot; avoid handing an empty frame to seaborn.
        print("Aucune colonne numérique à analyser.")
        return
    sns.pairplot(numeric)
    plt.show()


# Run the distribution plots and the pairplot on the configured columns.
analyse_univariee_multivariee(df, cols_to_analyze)


In [None]:
# Print the descriptive statistics of the frame.
print(df.describe())

In [None]:
def distribution_scores_vues_favoris(df, score_col, view_col, favorite_col):
    """Plot 30-bin histograms (with KDE) of the score, view and favorite columns.

    The score and view columns are each plotted only when present in ``df``.
    The favorite column is plotted only when it is present and holds at least
    one non-null value; otherwise a message is printed instead.
    """
    def _plot_histogram(column):
        plt.figure(figsize=(10, 6))
        sns.histplot(df[column], bins=30, kde=True)
        plt.title(f'Distribution de {column}')
        plt.xlabel(column)
        plt.ylabel('Fréquence')
        plt.show()

    for column in (score_col, view_col):
        if column in df.columns:
            _plot_histogram(column)

    has_favorites = favorite_col in df.columns and df[favorite_col].notnull().sum() > 0
    if not has_favorites:
        print(f"La colonne {favorite_col} est vide ou n'existe pas.")
        return
    _plot_histogram(favorite_col)


In [None]:
def correlation_variables(df, cols):
    """Show an annotated heatmap of the pairwise correlations between ``cols``."""
    correlations = df[cols].corr()
    _, heatmap_ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=heatmap_ax)
    heatmap_ax.set_title('Matrice de Corrélation')
    plt.show()
    
    
# Plot the correlation matrix of the configured analysis columns.
correlation_variables(df, cols_to_analyze)


### Tokenization + stopwords + lemmatization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Lazily-built NLTK resources shared by every treatments_text call, so the
# stop-word list and the lemmatizer are created once instead of once per row.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Store both only after both were built, so a failure leaves the cache empty.
        _TEXT_RESOURCES['stop_words'] = stop_words
        _TEXT_RESOURCES['lemmatizer'] = lemmatizer
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def treatments_text(text):
    """Tokenize ``text``, drop English stop words and lemmatize what remains.

    Args:
        text: The text to process. Non-string values (e.g. NaN left by a
            missing cell) are treated as empty text instead of being passed
            to the tokenizer.

    Returns:
        The remaining lemmatized tokens joined by single spaces; ``''`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _get_text_resources()
    tokens = word_tokenize(text)
    # Stop-word membership is tested on the lower-cased token; the token that
    # is lemmatized keeps its original case.
    kept = [lemmatizer.lemmatize(token) for token in tokens if token.lower() not in stop_words]
    return ' '.join(kept)

def clean_dataframe(df):
    """Add ``Cleaned_Title`` and ``Cleaned_Body`` to ``df`` in place and return it.

    Each new column holds ``treatments_text`` applied to the matching source
    column (``Title`` and ``Body``).
    """
    for source_col, target_col in (('Title', 'Cleaned_Title'), ('Body', 'Cleaned_Body')):
        df[target_col] = df[source_col].apply(treatments_text)
    return df

# Build the Cleaned_Title / Cleaned_Body columns.
df = clean_dataframe(df)

# Sanity check: compare the source and processed text side by side.
print(df[['Title', 'Cleaned_Title', 'Body', 'Cleaned_Body']].head())


In [None]:

def transform_text_to_bow(df, n_components=100, random_state=42):
    """Turn ``Cleaned_Body`` into reduced bag-of-words features and binarize ``Tags``.

    Args:
        df: Frame with a ``Cleaned_Body`` text column and a ``Tags`` column of
            tag lists.
        n_components: Target dimensionality of the SVD projection. It is
            capped at the vocabulary size, since the SVD cannot produce more
            components than there are features.
        random_state: Seed passed to ``TruncatedSVD`` so repeated runs give
            the same features.

    Returns:
        Tuple ``(X_reduced, y, vectorizer, svd, mlb)``: the reduced feature
        matrix, the multi-label indicator matrix, and the three fitted
        transformers.

    NOTE(review): the vectorizer and the SVD are fitted on every row of
    ``df``; if the caller splits train/test afterwards, the test rows have
    already influenced the features — confirm this is acceptable.
    """
    # Bag of words: ignore terms in more than 95% of documents or in fewer than 2.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2)
    X = vectorizer.fit_transform(df['Cleaned_Body'])

    # Dimensionality reduction of the sparse count matrix.
    n_components = min(n_components, X.shape[1])
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    X_reduced = svd.fit_transform(X)

    # One indicator column per distinct tag.
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df['Tags'])

    return X_reduced, y, vectorizer, svd, mlb

In [None]:

def build_and_train_model(X, y, epochs=10, batch_size=32):
    """Train a small dense network for multi-label tag prediction.

    Args:
        X: 2-D feature matrix (one row per sample).
        y: 2-D multi-label indicator matrix (one column per tag).
        epochs: Number of training epochs.
        batch_size: Mini-batch size used for training.

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the ``History``
        object returned by ``model.fit``. The history exposes ``loss``,
        ``val_loss``, ``accuracy`` and ``val_accuracy``.
    """
    # Hold out 20% of the samples; the same hold-out is used for the
    # per-epoch validation and for the final evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Declare the input shape with an explicit Input object rather than the
    # `input_dim` layer argument.
    model = Sequential()
    model.add(tf.keras.Input(shape=(X.shape[1],)))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    # One independent sigmoid unit per tag (multi-label output).
    model.add(Dense(y.shape[1], activation='sigmoid'))

    # Request per-label binary accuracy explicitly: the 'accuracy' shortcut is
    # resolved by Keras from the output shape and does not pick the per-label
    # metric for a multi-unit output. The metric keeps the name 'accuracy' so
    # the history keys are unchanged.
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
    )

    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
    )

    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Loss: {loss}, Accuracy: {accuracy}')

    return model, history

In [None]:
def plot_training_history(history):
    """Plot training vs. validation loss and accuracy side by side.

    ``history.history`` must provide the ``loss``, ``val_loss``, ``accuracy``
    and ``val_accuracy`` series.
    """
    # (train key, validation key, train label, validation label, y label, title)
    panels = (
        ('loss', 'val_loss', 'Train Loss', 'Validation Loss', 'Loss', 'Loss over epochs'),
        ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy', 'Accuracy',
         'Accuracy over epochs'),
    )

    plt.figure(figsize=(12, 4))
    for position, panel in enumerate(panels, start=1):
        train_key, val_key, train_label, val_label, y_label, title = panel
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(title)

    plt.show()


In [None]:
# Bag-of-words transformation and dimensionality reduction
X, y, vectorizer, svd, mlb = transform_text_to_bow(df)

# Train and evaluate the model
model, history = build_and_train_model(X, y)

# Visualize the training history
plot_training_history(history)