# 07 - Results Analysis and Model Comparison

**Objective**: Comprehensively compare the Baseline and Full models and evaluate the value added by the OCEAN features.

## Analysis Contents:
1. Load evaluation metrics from both models
2. Performance comparison analysis
3. Deep dive into OCEAN feature importance
4. Confusion matrix comparison and business impact analysis
5. Comprehensive visualization
6. Business insights and recommendations
7. Final conclusions
8. Generate comprehensive report

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
# Suppress all warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# pandas display: show every column and use four digits of precision.
pd.options.display.max_columns = None
pd.options.display.precision = 4

# Plot defaults: seaborn "whitegrid" theme and a (12, 8) default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

print("Libraries loaded successfully")

## Step 1: Load Model Evaluation Metrics

In [None]:
# Load the evaluation metrics of both models from their JSON files.
# The files are opened with an explicit UTF-8 encoding (the encoding this
# notebook also uses when it writes its final report) so that reading them
# does not depend on the platform's default locale encoding.

# Baseline model: trained without the OCEAN features.
print("Loading Baseline model metrics...")
with open('../../baseline_metrics.json', 'r', encoding='utf-8') as f:
    baseline_metrics = json.load(f)

print("Baseline Model (without OCEAN):")
print(json.dumps(baseline_metrics, indent=2))

# Full model: trained with the OCEAN features included.
print("\nLoading Full Model metrics...")
with open('../../full_model_metrics.json', 'r', encoding='utf-8') as f:
    full_model_metrics = json.load(f)

print("\nFull Model (with OCEAN):")
print(json.dumps(full_model_metrics, indent=2))

## Step 2: Performance Comparison Analysis

In [None]:
# Build a side-by-side table of the evaluation metrics of both models,
# save it as CSV and print summary statistics of the relative improvement.
print("=" * 80)
print("Detailed Model Performance Comparison")
print("=" * 80)

# (display label, key in the metrics JSON) for every row of the table.
metric_keys = [
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1 Score', 'f1_score'),
    ('ROC-AUC', 'roc_auc'),
    ('N Features', 'n_features'),
]

comparison_data = {
    'Metric': [label for label, _ in metric_keys],
    'Baseline': [baseline_metrics[key] for _, key in metric_keys],
    'Full Model': [full_model_metrics[key] for _, key in metric_keys],
}

comparison_df = pd.DataFrame(comparison_data)

# Absolute difference and relative improvement of the full model.
comparison_df['Absolute Diff'] = comparison_df['Full Model'] - comparison_df['Baseline']

# A baseline score of exactly 0 makes the relative improvement undefined.
# Mask those rows to NaN instead of dividing by zero: an inf/NaN produced by
# the division would otherwise distort the average, max and min below (pandas
# skips NaN in those reductions, but not inf).
baseline_nonzero = comparison_df['Baseline'].where(comparison_df['Baseline'] != 0)
comparison_df['Improvement %'] = (comparison_df['Absolute Diff'] / baseline_nonzero) * 100

# For N Features, improvement percentage is not meaningful
comparison_df.loc[comparison_df['Metric'] == 'N Features', 'Improvement %'] = np.nan

print("\nPerformance comparison table:")
print(comparison_df.to_string(index=False))

# Save comparison results
comparison_df.to_csv('../../final_model_comparison.csv', index=False)
print("\nComparison results saved: final_model_comparison.csv")

# Statistical analysis
print("\n" + "=" * 80)
print("Performance Improvement Statistics")
print("=" * 80)

# Exclude N Features: it is a count, not a quality metric.
metric_comparison = comparison_df[comparison_df['Metric'] != 'N Features']

improvement_pct = metric_comparison['Improvement %']
avg_improvement = improvement_pct.mean()
max_improvement = improvement_pct.max()
min_improvement = improvement_pct.min()
best_metric = metric_comparison.loc[improvement_pct.idxmax(), 'Metric']
worst_metric = metric_comparison.loc[improvement_pct.idxmin(), 'Metric']

print(f"Average improvement: {avg_improvement:+.3f}%")
print(f"Maximum improvement: {max_improvement:+.3f}% ({best_metric})")
print(f"Minimum improvement: {min_improvement:+.3f}% ({worst_metric})")
print(f"\nOCEAN feature count: {full_model_metrics['n_ocean_features']}")
print(f"Feature increase: {full_model_metrics['n_features'] - baseline_metrics['n_features']}")

## Step 3: OCEAN Feature Importance Analysis

In [None]:
# Read the per-feature importances of the full model and contrast the OCEAN
# features with all remaining features.
print("Loading feature importance data...\n")
full_importance = pd.read_csv('../../full_model_feature_importance.csv')

# Split the table on the `is_ocean` flag column.
is_ocean_flag = full_importance['is_ocean']
ocean_features = full_importance[is_ocean_flag.eq(True)].copy()
non_ocean_features = full_importance[is_ocean_flag.eq(False)].copy()

heavy_rule = "=" * 80

print(heavy_rule)
print("Detailed OCEAN Feature Importance Analysis")
print(heavy_rule)

print("\nOCEAN feature rankings:")
ocean_sorted = ocean_features.sort_values('importance', ascending=False)
all_importances = full_importance['importance']
n_all_features = len(full_importance)
for feature_name, importance in zip(ocean_sorted['feature'], ocean_sorted['importance']):
    # Overall rank = number of features whose importance is at least as high.
    rank_in_all = int((all_importances >= importance).sum())
    trait_name = feature_name.title()
    print(f"  {trait_name:20s}: {importance:.6f} (Overall rank: {rank_in_all}/{n_all_features})")

# Aggregate statistics per feature group.
print(f"\n{heavy_rule}")
print("OCEAN vs Non-OCEAN Feature Comparison")
print(heavy_rule)

ocean_total = ocean_features['importance'].sum()
non_ocean_total = non_ocean_features['importance'].sum()
total = full_importance['importance'].sum()

ocean_avg = ocean_features['importance'].mean()
non_ocean_avg = non_ocean_features['importance'].mean()

group_summaries = (
    ("OCEAN", ocean_features, ocean_total, ocean_avg),
    ("Non-OCEAN", non_ocean_features, non_ocean_total, non_ocean_avg),
)
for group_label, group_df, group_total, group_avg in group_summaries:
    print(f"\n{group_label} features:")
    print(f"  Count: {len(group_df)}")
    print(f"  Total importance: {group_total:.6f}")
    print(f"  Average importance: {group_avg:.6f}")
    print(f"  Contribution: {group_total/total*100:.2f}%")

# Human-readable description of each personality dimension.
print(f"\n{heavy_rule}")
print("OCEAN Personality Dimension Interpretations")
print(heavy_rule)

ocean_descriptions = {
    'openness': 'Openness - Curiosity, imagination, willingness to try new things',
    'conscientiousness': 'Conscientiousness - Responsibility, self-discipline, organization',
    'extraversion': 'Extraversion - Sociability, energy, positive emotions',
    'agreeableness': 'Agreeableness - Cooperation, trust, altruism',
    'neuroticism': 'Neuroticism - Emotional instability, anxiety, vulnerability'
}

# Features whose name is not a key of the dictionary are reported as unknown.
for trait, importance in zip(ocean_sorted['feature'], ocean_sorted['importance']):
    print(f"\n{trait}:")
    print(f"  {ocean_descriptions.get(trait, 'Unknown dimension')}")
    print(f"  Importance: {importance:.6f}")

## Step 4: Confusion Matrix Comparison and Business Impact Analysis

In [None]:
def _show_confusion_matrix(heading, matrix, number_format=','):
    """Print a heading, the raw matrix, and its four cells labelled TN/FP/FN/TP."""
    print(heading)
    print(matrix)
    print(f"  TN: {matrix[0,0]:{number_format}}  |  FP: {matrix[0,1]:{number_format}}")
    print(f"  FN: {matrix[1,0]:{number_format}}  |  TP: {matrix[1,1]:{number_format}}")


# Confusion matrices as stored in the metrics of each model.
cm_baseline = np.array(baseline_metrics['confusion_matrix'])
cm_full = np.array(full_model_metrics['confusion_matrix'])

heavy_rule = "=" * 80

print(heavy_rule)
print("Confusion Matrix Comparison")
print(heavy_rule)

_show_confusion_matrix("\nBaseline Model:", cm_baseline)
_show_confusion_matrix("\nFull Model (with OCEAN):", cm_full)

# Element-wise change; printed with an explicit sign.
cm_diff = cm_full - cm_baseline
_show_confusion_matrix("\nDifference (Full - Baseline):", cm_diff, number_format='+,')

# Translate the change in misclassifications into a rough monetary estimate.
print(f"\n{heavy_rule}")
print("Business Impact Analysis")
print(heavy_rule)

# Illustrative assumptions, not derived from the data in this notebook.
avg_loan_amount = 15000  # Example value
default_loss_rate = 0.7  # Assume 70% default loss rate
interest_rate = 0.10  # Assume 10% interest rate

# Positive values mean the full model makes fewer errors of that kind.
fp_reduction = cm_baseline[0,1] - cm_full[0,1]
fn_reduction = cm_baseline[1,0] - cm_full[1,0]

# FP: a good customer was rejected, so the interest income is lost.
fp_cost_saved = fp_reduction * avg_loan_amount * interest_rate

# FN: a bad customer was approved, so the default loss is incurred.
fn_cost_saved = fn_reduction * avg_loan_amount * default_loss_rate

total_savings = fp_cost_saved + fn_cost_saved

print(f"\nFalse Positive (FP) improvement: {fp_reduction:+,} cases")
print(f"  Potential revenue increase: ${fp_cost_saved:,.2f}")

print(f"\nFalse Negative (FN) improvement: {fn_reduction:+,} cases")
print(f"  Potential loss reduction: ${fn_cost_saved:,.2f}")

print(f"\nTotal potential savings: ${total_savings:,.2f}")

# One-line verdict chosen by the sign of the estimated savings.
if total_savings > 0:
    verdict = "\nFull Model has superior business value over Baseline"
elif total_savings < 0:
    verdict = "\nFull Model has slightly inferior business value compared to Baseline"
else:
    verdict = "\nBoth models have comparable business value"
print(verdict)

## Step 5: Comprehensive Visualization

In [None]:
# Draw a single figure with six panels (radar chart, improvement bars, two
# confusion matrices, OCEAN importances, top-10 features) and save it as PNG.
fig = plt.figure(figsize=(20, 12))
gs = fig.add_gridspec(3, 4, hspace=0.35, wspace=0.35)

# 1. Performance metrics comparison (Radar chart)
ax1 = fig.add_subplot(gs[0, :2], projection='polar')
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC']
baseline_vals = [baseline_metrics['accuracy'], baseline_metrics['precision'],
                 baseline_metrics['recall'], baseline_metrics['f1_score'],
                 baseline_metrics['roc_auc']]
full_vals = [full_model_metrics['accuracy'], full_model_metrics['precision'],
             full_model_metrics['recall'], full_model_metrics['f1_score'],
             full_model_metrics['roc_auc']]

# Repeat the first point at the end so each polygon closes on itself.
angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
baseline_vals += baseline_vals[:1]
full_vals += full_vals[:1]
angles += angles[:1]

ax1.plot(angles, baseline_vals, 'o-', linewidth=2, label='Baseline', color='#3498db')
ax1.fill(angles, baseline_vals, alpha=0.15, color='#3498db')
ax1.plot(angles, full_vals, 'o-', linewidth=2, label='Full Model', color='#e74c3c')
ax1.fill(angles, full_vals, alpha=0.15, color='#e74c3c')

# Ticks use every angle except the duplicated closing one.
ax1.set_xticks(angles[:-1])
ax1.set_xticklabels(metrics, fontsize=10)
ax1.set_ylim(0, 1)
ax1.set_title('Performance Comparison (Radar Chart)', fontsize=13, fontweight='bold', pad=20)
ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=10)
ax1.grid(True)

# 2. Improvement bar chart (green = improvement, red = no improvement)
ax2 = fig.add_subplot(gs[0, 2:])
improvements = metric_comparison['Improvement %'].values
colors_bar = ['#2ecc71' if x > 0 else '#e74c3c' for x in improvements]
bars = ax2.bar(range(len(metrics)), improvements, color=colors_bar, alpha=0.7, edgecolor='black')
ax2.set_xticks(range(len(metrics)))
ax2.set_xticklabels(metrics, rotation=45, ha='right', fontsize=10)
ax2.set_ylabel('Improvement (%)', fontsize=11, fontweight='bold')
ax2.set_title('Performance Improvement by Metric', fontsize=13, fontweight='bold')
ax2.axhline(y=0, color='black', linestyle='-', linewidth=1)
ax2.grid(axis='y', alpha=0.3)
# Add value labels: above positive bars, below the others.
for bar, val in zip(bars, improvements):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height,
             f'{val:+.2f}%', ha='center', va='bottom' if height > 0 else 'top',
             fontweight='bold', fontsize=9)

# 3 & 4. Confusion matrix comparison
ax3 = fig.add_subplot(gs[1, 0])
sns.heatmap(cm_baseline, annot=True, fmt='d', cmap='Blues', ax=ax3,
            xticklabels=['Fully Paid', 'Charged Off'],
            yticklabels=['Fully Paid', 'Charged Off'],
            cbar_kws={'label': 'Count'})
ax3.set_title('Baseline - Confusion Matrix', fontsize=12, fontweight='bold')
ax3.set_ylabel('True', fontsize=10, fontweight='bold')
ax3.set_xlabel('Predicted', fontsize=10, fontweight='bold')

ax4 = fig.add_subplot(gs[1, 1])
sns.heatmap(cm_full, annot=True, fmt='d', cmap='Reds', ax=ax4,
            xticklabels=['Fully Paid', 'Charged Off'],
            yticklabels=['Fully Paid', 'Charged Off'],
            cbar_kws={'label': 'Count'})
ax4.set_title('Full Model - Confusion Matrix', fontsize=12, fontweight='bold')
ax4.set_ylabel('True', fontsize=10, fontweight='bold')
ax4.set_xlabel('Predicted', fontsize=10, fontweight='bold')

# 5. OCEAN feature importance (already sorted in descending order)
ax5 = fig.add_subplot(gs[1, 2:])
ocean_sorted_plot = ocean_sorted.copy()
ocean_sorted_plot['trait_name'] = ocean_sorted_plot['feature'].str.title()
bars_ocean = ax5.bar(range(len(ocean_sorted_plot)), ocean_sorted_plot['importance'].values,
                     color='#9b59b6', alpha=0.7, edgecolor='black')
ax5.set_xticks(range(len(ocean_sorted_plot)))
ax5.set_xticklabels(ocean_sorted_plot['trait_name'].values, rotation=45, ha='right', fontsize=10)
ax5.set_ylabel('Importance', fontsize=11, fontweight='bold')
ax5.set_title('OCEAN Features Importance Ranking', fontsize=13, fontweight='bold')
ax5.grid(axis='y', alpha=0.3)
# Add value labels
for bar, val in zip(bars_ocean, ocean_sorted_plot['importance'].values):
    height = bar.get_height()
    ax5.text(bar.get_x() + bar.get_width()/2., height,
             f'{val:.4f}', ha='center', va='bottom',
             fontweight='bold', fontsize=8)

# 6. Top 10 feature comparison
ax6 = fig.add_subplot(gs[2, :])
# Select the ten rows with the highest importance explicitly. Taking the
# first ten rows of the CSV would only be correct if the file happened to be
# stored in descending order of importance.
top_10 = full_importance.nlargest(10, 'importance')
colors_top10 = ['#e74c3c' if is_ocean else '#3498db' for is_ocean in top_10['is_ocean']]
y_pos = np.arange(len(top_10))
ax6.barh(y_pos, top_10['importance'].values, color=colors_top10, alpha=0.7, edgecolor='black')
ax6.set_yticks(y_pos)
ax6.set_yticklabels(top_10['feature'].values, fontsize=10)
# Put the most important feature at the top of the horizontal bar chart.
ax6.invert_yaxis()
ax6.set_xlabel('Importance', fontsize=11, fontweight='bold')
ax6.set_title('Top 10 Most Important Features (Red = OCEAN, Blue = Others)', 
              fontsize=13, fontweight='bold')
ax6.grid(axis='x', alpha=0.3)

plt.savefig('../../comprehensive_results_analysis.png', dpi=300, bbox_inches='tight')
print("\nComprehensive visualization saved: comprehensive_results_analysis.png")
plt.show()

## Step 6: Business Insights and Recommendations

In [None]:
def _print_section(title):
    """Print a section title preceded by a blank line and followed by a dashed rule."""
    print(f"\n{title}")
    print("-" * 80)


heavy_rule = "=" * 80
print(heavy_rule)
print("Business Insights and Recommendations")
print(heavy_rule)

_print_section("1. OCEAN Feature Value Assessment")

# Grade the OCEAN features by the average relative improvement:
# above 1% -> significant, above 0% -> slight, otherwise no improvement.
# The contribution share is only reported when there was an improvement.
if avg_improvement > 1:
    conclusion = "Conclusion: OCEAN personality features significantly improved credit risk prediction model performance"
    recommendation = "   - Recommendation: Strongly recommend using full model with OCEAN features in production"
    show_contribution = True
elif avg_improvement > 0:
    conclusion = "Conclusion: OCEAN personality features slightly improved model performance"
    recommendation = "   - Recommendation: Consider using OCEAN features, but weigh implementation costs"
    show_contribution = True
else:
    conclusion = "Conclusion: OCEAN personality features did not improve model performance"
    recommendation = "   - Recommendation: Not recommended to use OCEAN features, or improve feature extraction method"
    show_contribution = False

print(conclusion)
print(f"   - Average performance improvement: {avg_improvement:.2f}%")
if show_contribution:
    print(f"   - OCEAN feature contribution: {ocean_total/total*100:.2f}%")
print(recommendation)

_print_section("2. Most Valuable OCEAN Dimension")

# First row of the importance-sorted OCEAN table.
top_ocean_trait = ocean_sorted.iloc[0]
trait_name = top_ocean_trait['feature'].title()
print(f"Most important dimension: {trait_name}")
print(f"Importance score: {top_ocean_trait['importance']:.6f}")
print(f"Business meaning: {ocean_descriptions.get(top_ocean_trait['feature'], 'Unknown')}")

_print_section("3. Model Deployment Recommendations")
print("\n".join([
    "Data requirements:",
    "  - Applicants must provide loan purpose description (desc field)",
    "  - Text length recommendation: 50-500 characters",
    "  - Text quality: Authentic, detailed personal statement",
]))
print("\n".join([
    "\nImplementation steps:",
    "  1. Collect applicant loan application descriptions",
    "  2. Use OCEAN feature extraction model to process text",
    "  3. Combine OCEAN features with traditional features",
    "  4. Use full model for credit scoring",
    "  5. Make approval decisions based on predictions and business rules",
]))

_print_section("4. Potential Risks and Limitations")
print("\n".join([
    "Data quality risks:",
    "  - Applicants may provide false or embellished descriptions",
    "  - Text length and quality may vary significantly",
    "  - Different cultural backgrounds may affect expression patterns",
]))
print("\n".join([
    "\nTechnical limitations:",
    "  - Current dictionary-based method may not be as accurate as deep learning models",
    "  - OCEAN extraction depends on text quality",
    "  - Model requires regular updates and validation",
]))

_print_section("5. Future Improvement Directions")
print("\n".join([
    "Feature extraction improvements:",
    "  - Use pre-trained Transformer models (BERT, RoBERTa)",
    "  - Train specialized personality prediction models",
    "  - Combine other text fields (title, purpose)",
]))
print("\n".join([
    "\nModel optimization:",
    "  - Hyperparameter tuning",
    "  - Try other algorithms (LightGBM, CatBoost)",
    "  - Ensemble learning methods",
    "  - Feature engineering and selection",
]))

## Step 7: Final Conclusions

In [None]:
def _print_section(title):
    """Print a section title preceded by a blank line and followed by a dashed rule."""
    print(f"\n{title}")
    print("-" * 80)


heavy_rule = "=" * 80
print(heavy_rule)
print("Final Conclusions")
print(heavy_rule)

_print_section("Research Objective")
print("Evaluate the value of OCEAN personality features in credit risk prediction")

_print_section("Key Findings")
print("1. Model performance comparison:")
baseline_auc = baseline_metrics['roc_auc']
print(f"   - Baseline model ROC-AUC: {baseline_auc:.4f}")
full_auc = full_model_metrics['roc_auc']
print(f"   - Full Model ROC-AUC: {full_auc:.4f}")
# Relative ROC-AUC change of the full model, in percent of the baseline.
print(f"   - Improvement: {(full_auc - baseline_auc)/baseline_auc*100:+.2f}%")

print("\n2. OCEAN feature analysis:")
print(f"   - Feature count: {len(ocean_features)}")
print(f"   - Total contribution: {ocean_total/total*100:.2f}%")
print(f"   - Average importance: {ocean_avg:.6f}")
print(f"   - Most important dimension: {ocean_sorted.iloc[0]['feature'].title()}")

print("\n3. Business value:")
# The savings figures are only itemised when the estimate is positive.
if total_savings > 0:
    business_lines = [
        f"   - Potential savings: ${total_savings:,.2f} (based on test set)",
        f"   - FP improvement: {fp_reduction:+,} cases",
        f"   - FN improvement: {fn_reduction:+,} cases",
    ]
else:
    business_lines = ["   - Business value assessment: Requires further analysis"]
print("\n".join(business_lines))

_print_section("Final Recommendation")

# NOTE(review): the earlier "Business Insights" section grades on
# avg_improvement alone, whereas the strongest recommendation here also
# requires positive estimated savings, so the two sections can disagree
# when avg_improvement > 1 but total_savings <= 0 - confirm which is intended.
if avg_improvement > 1 and total_savings > 0:
    recommendation_lines = [
        "Strongly recommend using full model with OCEAN features:",
        "  - Significant performance improvement",
        "  - Clear business value",
        "  - OCEAN features provide unique predictive information",
    ]
elif avg_improvement > 0:
    recommendation_lines = [
        "Consider using OCEAN features, but weigh costs:",
        "  - Performance improved but not significantly",
        "  - Need to collect and process text data",
        "  - Recommend small-scale pilot test first",
    ]
else:
    recommendation_lines = [
        "Currently not recommended to use OCEAN features:",
        "  - No performance improvement observed",
        "  - Increased model complexity",
        "  - Suggest improving feature extraction method before re-evaluation",
    ]
print("\n".join(recommendation_lines))

_print_section("Research Contributions")
print("\n".join([
    "1. Validated application value of psycholinguistic features in financial risk control",
    "2. Provided complete feature engineering and modeling workflow",
    "3. Established rigorous data leakage prevention mechanism",
    "4. Developed reusable OCEAN feature extraction method",
]))

_print_section("Future Research Directions")
print("\n".join([
    "1. Use advanced NLP models to extract OCEAN features",
    "2. Explore other psychological features (e.g., moral judgment, risk preference)",
    "3. Study OCEAN feature variations across different customer segments",
    "4. Develop real-time personality trait assessment system",
]))

print(f"\n{heavy_rule}")
print("Analysis report generation complete")
print(heavy_rule)

## Step 8: Generate Text Report

In [None]:
# Assemble the final plain-text report from the values computed in the
# previous cells, write it to disk and echo it to the notebook output.

# Share of samples that contain description text, in percent. The figure is
# quoted in two places of the report (dataset information and limitations);
# keeping it in a single constant prevents the two statements from drifting.
# NOTE(review): the value is hard-coded here, not computed in this notebook -
# confirm it against the data preparation step.
DESC_COVERAGE_PCT = 5.58

# Generate text report
report_content = f"""
{'='*80}
OCEAN Personality Features in Credit Risk Prediction
Final Analysis Report
{'='*80}

1. Research Overview
{'-'*80}
This study evaluates the application value of the OCEAN Big Five personality traits
(Openness, Conscientiousness, Extraversion, Agreeableness, Neuroticism) in credit risk
prediction models. We used the Prosper loan dataset to compare XGBoost model performance
with and without OCEAN features.

Dataset Information:
- Total samples: {baseline_metrics['train_size'] + baseline_metrics['test_size']:,}
- Training set: {baseline_metrics['train_size']:,}
- Test set: {baseline_metrics['test_size']:,}
- Samples with description text: {DESC_COVERAGE_PCT:.2f}%

2. Model Performance Comparison
{'-'*80}
Baseline Model (without OCEAN features):
  - Accuracy:  {baseline_metrics['accuracy']:.4f}
  - Precision: {baseline_metrics['precision']:.4f}
  - Recall:    {baseline_metrics['recall']:.4f}
  - F1 Score:  {baseline_metrics['f1_score']:.4f}
  - ROC-AUC:   {baseline_metrics['roc_auc']:.4f}
  - Features:  {baseline_metrics['n_features']}

Full Model (with OCEAN features):
  - Accuracy:  {full_model_metrics['accuracy']:.4f}
  - Precision: {full_model_metrics['precision']:.4f}
  - Recall:    {full_model_metrics['recall']:.4f}
  - F1 Score:  {full_model_metrics['f1_score']:.4f}
  - ROC-AUC:   {full_model_metrics['roc_auc']:.4f}
  - Features:  {full_model_metrics['n_features']}
  - OCEAN features: {full_model_metrics['n_ocean_features']}

Performance Improvement:
  - Average improvement: {avg_improvement:+.2f}%
  - ROC-AUC improvement: {(full_model_metrics['roc_auc'] - baseline_metrics['roc_auc'])/baseline_metrics['roc_auc']*100:+.2f}%

3. OCEAN Feature Importance Analysis
{'-'*80}
OCEAN total contribution: {ocean_total/total*100:.2f}%
OCEAN average importance: {ocean_avg:.6f}
Non-OCEAN average importance: {non_ocean_avg:.6f}

Importance ranking by dimension:
"""

# One line per OCEAN feature, in descending order of importance. The lines
# are joined in a single pass instead of growing the string inside a loop.
report_content += "".join(
    f"  {row['feature'].title():20s}: {row['importance']:.6f}\n"
    for _, row in ocean_sorted.iterrows()
)

report_content += f"""
4. Business Impact Assessment
{'-'*80}
Confusion matrix changes:
  - False Positive change: {cm_diff[0,1]:+,}
  - False Negative change: {cm_diff[1,0]:+,}
  - True Positive change: {cm_diff[1,1]:+,}
  - True Negative change: {cm_diff[0,0]:+,}

Potential business value:
  - Total potential savings: ${total_savings:,.2f}
  - From FP improvement: ${fp_cost_saved:,.2f}
  - From FN improvement: ${fn_cost_saved:,.2f}

5. Conclusions and Recommendations
{'-'*80}
"""

# Conclusion text is selected by the average relative improvement:
# above 1% -> significant, above 0% -> slight, otherwise not improved.
if avg_improvement > 1:
    report_content += """Conclusion: OCEAN personality features significantly improved credit risk prediction model performance

Recommendations:
  - Deploy full model with OCEAN features in production environment
  - Collect applicant loan purpose descriptions as model input
  - Regularly monitor and update OCEAN feature extraction model
  - Consider using more advanced NLP techniques to further improve performance
"""
elif avg_improvement > 0:
    report_content += """Conclusion: OCEAN personality features slightly improved model performance

Recommendations:
  - Consider using OCEAN features, but weigh implementation costs
  - Recommend small-scale A/B testing to validate business value first
  - Explore better feature extraction methods
"""
else:
    report_content += """Conclusion: Current OCEAN features did not significantly improve model performance

Recommendations:
  - Not recommended to use current OCEAN features in production
  - Improve feature extraction method (use deep learning models)
  - Increase training sample size
  - Explore other psychological features
"""

report_content += f"""
6. Research Limitations
{'-'*80}
  - Only used dictionary-based method to extract OCEAN features
  - Only {DESC_COVERAGE_PCT:.2f}% of samples contain description text
  - Did not consider text authenticity and quality
  - Did not conduct stratified analysis for different customer segments

7. Future Work
{'-'*80}
  - Use pre-trained Transformer models (BERT/RoBERTa) to extract features
  - Collect more samples with description text
  - Develop text quality assessment mechanism
  - Research OCEAN feature interpretability
  - Explore other psycholinguistic features

{'='*80}
Report generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
{'='*80}
"""

# Save report
with open('../../FINAL_ANALYSIS_REPORT.txt', 'w', encoding='utf-8') as f:
    f.write(report_content)

print("\nFinal analysis report generated: FINAL_ANALYSIS_REPORT.txt")
print("\n" + report_content)