# Phase 3: Prediction Experiments
## Code Summarization and Generation Project

This notebook implements and evaluates prediction models:
- Random Forest
- LSTM
- Transformer (CodeBERT) — listed as a project model; no CodeBERT experiment is run in the sections below

In [None]:
import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml

# Project-local modules live in ../src; extend the path before importing them.
sys.path.append('../src')

from prediction_models import RandomForestPipeline, LSTMPipeline
from visualization import ClusteringVisualizer, MetricsVisualizer

%matplotlib inline

## 1. Load Data and Configuration

In [None]:
# Parse the YAML experiment configuration; the resulting object is handed to
# every pipeline and visualizer constructed later in the notebook.
with open('../config/config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Feature table shared by all models below.
df = pd.read_csv('../data/processed/all_features.csv')

# Quick sanity output: overall size and per-language row counts.
language_counts = df['language'].value_counts()
print(f"Dataset shape: {df.shape}")
print(f"Languages: {language_counts}")

## 2. Task Definition

We'll predict the programming language based on code complexity features.

In [None]:
# Column the classifiers are trained to predict.
target_col = 'language'

# Predictor columns, written as groups by name prefix. Concatenation keeps the
# exact column order of a single flat list.
complexity_features = (
    # ast_* columns
    [
        'ast_node_count',
        'ast_depth',
        'ast_leaf_count',
        'ast_branching_factor_avg',
        'ast_distinct_node_types',
    ]
    # cc_* column
    + ['cc_mccabe']
    # halstead_* columns
    + ['halstead_volume', 'halstead_difficulty', 'halstead_effort']
    # num_* columns and loc
    + ['num_if', 'num_for', 'num_while', 'loc']
)

feature_count = len(complexity_features)
print(f"Predicting: {target_col}")
print(f"Using {feature_count} complexity features")

## 3. Random Forest Classification

In [None]:
# Random Forest pipeline in classification mode, driven by the loaded config.
rf_pipeline = RandomForestPipeline(config, task='classification')

# Ask the pipeline for its train/test split restricted to the complexity features.
rf_split = rf_pipeline.prepare_data(df, target_col, feature_cols=complexity_features)
X_train, X_test, y_train, y_test = rf_split

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

In [None]:
# Fit the Random Forest on the training split. The returned mapping is read
# below for its 'best_params' and 'best_score' entries.
train_results = rf_pipeline.train(X_train, y_train)

print("\nTraining Results:")
best_params = train_results['best_params']
print(f"Best parameters: {best_params}")
best_cv_score = train_results['best_score']
print(f"Best CV score: {best_cv_score:.4f}")

In [None]:
# Score the trained Random Forest on the held-out split.
test_results = rf_pipeline.evaluate(X_test, y_test)

print("\nTest Set Evaluation:")
# (display label, key in test_results), printed in this order
rf_metric_rows = (
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1-Score', 'f1_score'),
)
for metric_label, metric_key in rf_metric_rows:
    print(f"{metric_label}: {test_results[metric_key]:.4f}")

In [None]:
# The report is optional in the evaluation output; show it only when present.
has_report = 'classification_report' in test_results
if has_report:
    # Transpose the frame built from the report mapping before printing it.
    report_df = pd.DataFrame(test_results['classification_report']).T
    print("\nClassification Report:")
    print(report_df)

## 4. Feature Importance Analysis

In [None]:
# Number of features to list and plot. Only len(complexity_features) importances
# exist, so cap the requested count; otherwise the heading below would promise
# more rows than can be shown.
top_k = min(15, len(complexity_features))

# Ranked importances from the fitted Random Forest pipeline
importance_df = rf_pipeline.get_feature_importance(complexity_features, top_k=top_k)

print(f"\nTop {top_k} Most Important Features:")
print(importance_df)

# Plot the same ranking. `visualizer` is reused by the confusion-matrix cell.
visualizer = ClusteringVisualizer(config)
visualizer.plot_feature_importance(
    complexity_features,
    np.array(test_results['feature_importance']),
    top_k=top_k
)

## 5. Confusion Matrix

In [None]:
# NOTE: X_test / y_test are rebound by the LSTM data-preparation cell further
# down; run this cell while they still hold the Random Forest split.

# Class names taken from the pipeline's label encoder, used as matrix labels.
labels = rf_pipeline.label_encoder.classes_

# Predictions from the underlying fitted model on the test split.
y_pred = rf_pipeline.model.predict(X_test)

visualizer.plot_confusion_matrix(y_test, y_pred, labels)

## 6. LSTM Classification

In [None]:
lstm_pipeline = LSTMPipeline(config)

# Same table, target and features as the Random Forest run.
# NOTE: unpacking rebinds X_train / X_test / y_train / y_test, replacing the
# Random Forest split held under those names.
lstm_data = lstm_pipeline.prepare_data(df, target_col, feature_cols=complexity_features)
X_train, X_test, y_train, y_test, input_size, num_classes = lstm_data

print(f"Input size: {input_size}")
print(f"Number of classes: {num_classes}")

In [None]:
# Train the LSTM. The returned history is read below for its 'train_loss' and
# 'train_acc' series.
lstm_history = lstm_pipeline.train(X_train, y_train, input_size, num_classes)

# One panel per recorded series, side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# (axes, history key, panel title, y-axis label)
history_panels = (
    (ax1, 'train_loss', 'LSTM Training Loss', 'Loss'),
    (ax2, 'train_acc', 'LSTM Training Accuracy', 'Accuracy'),
)
for axis, history_key, panel_title, y_label in history_panels:
    axis.plot(lstm_history[history_key])
    axis.set_title(panel_title)
    axis.set_xlabel('Epoch')
    axis.set_ylabel(y_label)
    axis.grid(True)

# Create the output directory first: savefig raises if it is missing, which
# would otherwise abort the cell after the training cost has been paid.
history_plot_path = Path('../results/visualizations/lstm_training_history.png')
history_plot_path.parent.mkdir(parents=True, exist_ok=True)

plt.tight_layout()
plt.savefig(history_plot_path, dpi=300)
plt.show()

In [None]:
# Score the trained LSTM on its held-out split.
lstm_results = lstm_pipeline.evaluate(X_test, y_test)

print("\nLSTM Test Set Evaluation:")
# Display label -> key in lstm_results; dict order fixes the print order.
lstm_metric_keys = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
}
for lstm_metric_label in lstm_metric_keys:
    lstm_metric_value = lstm_results[lstm_metric_keys[lstm_metric_label]]
    print(f"{lstm_metric_label}: {lstm_metric_value:.4f}")

## 7. Model Comparison

In [None]:
# Side-by-side table of the headline test metrics for both trained models.
# Rows follow the order of `results_by_model`; columns follow `metric_columns`.
results_by_model = {
    'Random Forest': test_results,
    'LSTM': lstm_results,
}
metric_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
}

comparison_df = pd.DataFrame({
    'Model': list(results_by_model),
    **{
        column: [results[key] for results in results_by_model.values()]
        for column, key in metric_columns.items()
    },
})

print("\nModel Comparison:")
print(comparison_df)

# Persist the table. Create the metrics directory first, because to_csv raises
# when the parent directory does not exist.
comparison_csv_path = Path('../results/metrics/model_comparison.csv')
comparison_csv_path.parent.mkdir(parents=True, exist_ok=True)
comparison_df.to_csv(comparison_csv_path, index=False)

In [None]:
# Reshape to long form (one row per model/metric pair) for a grouped bar chart.
comparison_df_melted = comparison_df.melt(id_vars='Model', var_name='Metric', value_name='Score')

# Explicit figure/axes handles instead of the implicit pyplot current figure.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=comparison_df_melted, x='Metric', y='Score', hue='Model', ax=ax)
ax.set_title('Model Performance Comparison', fontsize=14)
ax.set_ylim([0, 1])
ax.set_ylabel('Score')
ax.legend(title='Model')
fig.tight_layout()

# Create the output directory first: savefig raises if it is missing.
comparison_plot_path = Path('../results/visualizations/model_comparison.png')
comparison_plot_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(comparison_plot_path, dpi=300)
plt.show()

## 8. Save Results

In [None]:
def _to_jsonable(obj):
    """Convert a NumPy value that ``json`` cannot encode into a built-in type.

    Intended as the ``default`` hook of ``json.dumps``: it is only called for
    objects the encoder does not already know how to serialize.

    Args:
        obj: The object the encoder could not handle.

    Returns:
        A list for ``np.ndarray``; ``bool``, ``int`` or ``float`` for the
        corresponding NumPy scalar types.

    Raises:
        TypeError: For any other type. The hook must raise rather than hand
            the same object back, otherwise the encoder reports a misleading
            "Circular reference detected" error.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        # Keep integers as integers (e.g. counts) instead of widening to float.
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Gather everything produced above under one nested mapping.
all_results = {
    'random_forest': {
        'train': train_results,
        'test': test_results
    },
    'lstm': {
        'history': lstm_history,
        'test': lstm_results
    }
}

# Encode fully before opening the file, so a failed encode cannot leave a
# truncated JSON file behind.
results_json = json.dumps(all_results, indent=2, default=_to_jsonable)

# Create the metrics directory first: open(..., 'w') raises if it is missing.
results_path = Path('../results/metrics/prediction_results.json')
results_path.parent.mkdir(parents=True, exist_ok=True)
with open(results_path, 'w') as f:
    f.write(results_json)

print("Results saved successfully!")