In [None]:
# Import
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict


In [None]:
# Forecast horizon: the target label is taken this many versions after the
# feature snapshot. Later cells read this constant, so change it here only.
PREDICTION_OFFSET = 3
print(f"Configuration: Prédiction à {PREDICTION_OFFSET} versions dans le futur")


In [None]:
# Read the raw dataset from its CSV file.
df = pd.read_csv("./../../Dataset/Final-dataset-binary/transformers.csv")

# Quick sanity output: table dimensions and the distinct version labels.
# Note that `sorted` orders these labels as plain strings.
print("Dataset original shape:", df.shape)
version_labels = df['version'].unique()
print("Versions uniques:", sorted(version_labels))


In [None]:
# Build the dataset used for future prediction
def create_future_prediction_dataset(df, prediction_offset):
    """
    Pair each file's features at version N with its smell label at version N+offset.

    Versions are ordered numerically on their ``major.minor.patch`` component
    (so ``4.9.0`` sorts before ``4.10.0``), independently for every ``path``.
    The offset counts positions in the list of versions in which that file is
    present, not positions in a project-wide release list.

    Rows whose ``version`` does not contain a ``major.minor.patch`` pattern are
    skipped. When a file has several rows for the same version, the first one
    (in sorted order) is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``version``, ``path``, ``file_type``,
        ``line_count``, ``method_count``, ``coupling_score`` and ``has_smell``.
        The frame is not modified.
    prediction_offset : int
        Number of versions to look ahead; must be >= 0.

    Returns
    -------
    pandas.DataFrame
        One row per (file, current version) for which a version
        ``prediction_offset`` steps later exists, with the columns
        ``current_version``, ``future_version``, ``path``, ``file_type``,
        ``line_count``, ``method_count``, ``coupling_score``,
        ``current_has_smell`` and ``future_has_smell``. The columns are present
        even when no pair could be built.

    Raises
    ------
    ValueError
        If ``prediction_offset`` is negative.
    """
    if prediction_offset < 0:
        # A negative offset would silently wrap around through negative indexing.
        raise ValueError("prediction_offset must be >= 0")

    output_columns = [
        'current_version', 'future_version', 'path', 'file_type',
        'line_count', 'method_count', 'coupling_score',
        'current_has_smell', 'future_has_smell',
    ]
    key_columns = ['_version_major', '_version_minor', '_version_patch']

    # Split the version label into integer components so that the ordering is
    # numeric; comparing the raw strings would put "4.10.0" before "4.9.0".
    version_parts = df['version'].astype(str).str.extract(r'(\d+)\.(\d+)\.(\d+)')
    parseable = version_parts.notna().all(axis=1).to_numpy()

    ordered = df[parseable].copy()
    for key_column, part in zip(key_columns, version_parts.columns):
        ordered[key_column] = version_parts[part].to_numpy()[parseable].astype(int)

    sort_columns = ['path'] + key_columns
    ordered = (
        ordered
        .sort_values(sort_columns)
        # Keep a single row per (file, version) so positions map to versions.
        .drop_duplicates(subset=sort_columns, keep='first')
    )

    records = []
    # Groups come out in sorted path order because `ordered` is already sorted.
    for _, file_history in ordered.groupby('path', sort=False):
        for position in range(len(file_history) - prediction_offset):
            current_row = file_history.iloc[position]
            future_row = file_history.iloc[position + prediction_offset]
            records.append({
                'current_version': current_row['version'],
                'future_version': future_row['version'],
                'path': current_row['path'],
                'file_type': current_row['file_type'],
                'line_count': current_row['line_count'],
                'method_count': current_row['method_count'],
                'coupling_score': current_row['coupling_score'],
                'current_has_smell': current_row['has_smell'],
                'future_has_smell': future_row['has_smell'],  # prediction target
            })

    return pd.DataFrame(records, columns=output_columns)

# Materialise the shifted dataset for the configured horizon and report its size.
df_future = create_future_prediction_dataset(df, prediction_offset=PREDICTION_OFFSET)
print(f"Nouveau dataset shape: {df_future.shape}")
print(
    f"Nombre de paires (version actuelle -> version +{PREDICTION_OFFSET}): {len(df_future)}"
)


In [None]:
# Inspect the class balance of the shifted target, as a table and as a bar chart.
target_counts = df_future["future_has_smell"].value_counts()
print("Distribution du target future_has_smell:")
print(target_counts)

target_counts.plot(kind="bar")
plt.title(f"Distribution du target future_has_smell (prédiction à +{PREDICTION_OFFSET} versions)")
plt.xlabel("Value")
plt.ylabel("Count")
plt.show()


In [None]:
# Column roles in the shifted dataset.
categorical_features = ["current_version", "path", "file_type"]
numerical_features = ["line_count", "method_count", "coupling_score"]
additional_features = ["current_has_smell"]  # present-day label, fed to the model as an input
target_feature = "future_has_smell"

# One-hot encode the categorical columns into a dense array.
# NOTE(review): the encoder is fitted on every row of df_future, i.e. before the
# train/test split performed later in the notebook.
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_cat = onehot_encoder.fit_transform(df_future[categorical_features])

X_num = df_future[numerical_features].values
X_additional = df_future[additional_features].values

# Design matrix laid out as [one-hot block | numeric block | current label].
X_combined = np.hstack([X_cat, X_num, X_additional])
y = df_future[target_feature].values

print(f"Shape des features combinées: {X_combined.shape}")
print(f"Shape du target: {y.shape}")


In [None]:
# Hold out the last 20 % of rows as the test set. No shuffling is done: rows keep
# the order they have in df_future.
split_idx = int(0.8 * len(df_future))

X_train = X_combined[:split_idx]
X_test = X_combined[split_idx:]
y_train = y[:split_idx]
y_test = y[split_idx:]

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


In [None]:
# Standardise the features: the scaler is fitted on the training rows only, then
# the same fitted transformation is applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Feed-forward binary classifier for the future-smell target:
# three ReLU hidden layers (128 -> 64 -> 32 units), each followed by dropout,
# and a single sigmoid output unit.
ann_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

ann_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

print("Architecture du modèle:")
ann_model.summary()


In [None]:
# Train with 20 % of the training rows held out for validation. Training stops
# once val_loss has not improved for 20 epochs, and the best weights are restored.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
)

history = ann_model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=150,
    validation_split=0.2,
    callbacks=[callback],
    verbose=1,
)


In [None]:
# Score the held-out rows: threshold the sigmoid outputs at 0.5 to get hard 0/1
# predictions, then report accuracy, confusion matrix and per-class metrics.
test_probabilities = ann_model.predict(X_test)
y_pred_ann = (test_probabilities > 0.5).astype(int).flatten()

print("=== RÉSULTATS ANN (Prédiction Future) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_ann))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ann))
print("Classification Report:\n", classification_report(y_test, y_pred_ann))


In [None]:
# Helper for scoring new observations with the trained model
def predict_future_smell(current_data, prediction_offset=PREDICTION_OFFSET):
    """
    Predict whether a file will have a smell a few versions from now.

    The input goes through the same fitted ``onehot_encoder`` and ``scaler`` as
    the training data before being passed to ``ann_model``.

    Parameters
    ----------
    current_data : dict
        Current features of the file. Must contain every key listed in
        ``categorical_features``, ``numerical_features`` and
        ``additional_features``.
    prediction_offset : int
        Horizon, in versions. The model is trained for ``PREDICTION_OFFSET``
        only; any other value triggers a warning and is otherwise ignored.

    Returns
    -------
    dict
        ``prediction_probability`` (float, sigmoid output),
        ``predicted_class`` (int, 1 when the probability is above 0.5) and
        ``confidence`` (float in [0, 1], distance from 0.5 rescaled).
    """
    import warnings  # local import: only this function needs it

    if prediction_offset != PREDICTION_OFFSET:
        # The horizon is baked into the training target, so it cannot be changed here.
        warnings.warn(
            f"ann_model was trained for an offset of {PREDICTION_OFFSET} versions; "
            f"prediction_offset={prediction_offset} is ignored.",
            stacklevel=2,
        )

    # Single-row frame so the fitted transformers can select columns by name.
    temp_df = pd.DataFrame([current_data])

    # Same preprocessing and column layout as the training design matrix.
    X_cat_new = onehot_encoder.transform(temp_df[categorical_features])
    X_num_new = temp_df[numerical_features].values
    X_additional_new = temp_df[additional_features].values

    X_combined_new = np.concatenate([X_cat_new, X_num_new, X_additional_new], axis=1)
    X_scaled_new = scaler.transform(X_combined_new)

    # Convert to a native float once so every returned value is a plain Python type.
    probability = float(ann_model.predict(X_scaled_new, verbose=0)[0][0])

    return {
        'prediction_probability': probability,
        # Strict '>' matches the threshold used when evaluating on the test set.
        'predicted_class': int(probability > 0.5),
        'confidence': abs(probability - 0.5) * 2,  # 0 = undecided, 1 = certain
    }

# Usage example: score the first row of df_future.
example_data = {
    'current_version': df_future['current_version'].iloc[0],
    'path': df_future['path'].iloc[0],
    'file_type': df_future['file_type'].iloc[0],
    'line_count': df_future['line_count'].iloc[0],
    'method_count': df_future['method_count'].iloc[0],
    'coupling_score': df_future['coupling_score'].iloc[0],
    'current_has_smell': df_future['current_has_smell'].iloc[0]
}

prediction_result = predict_future_smell(example_data)
print("=== EXEMPLE DE PRÉDICTION ===")
print("Données d'entrée:", example_data)
print("Résultat de prédiction:", prediction_result)

# The verdict is built outside the f-string: an escaped quote inside a replacement
# field is a syntax error, so the apostrophe lives in a plain string literal instead.
smell_verdict = "AURA" if prediction_result['predicted_class'] == 1 else "N'AURA PAS"
print(f"Le modèle prédit que ce fichier {smell_verdict} de smell dans {PREDICTION_OFFSET} versions")
