In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

In [5]:
def load_novelty_data(file_path):
    """Read the novelty-score CSV at ``file_path`` into a DataFrame.

    Prints how many rows were read and returns the loaded DataFrame.
    """
    scores = pd.read_csv(file_path)
    n_rows = len(scores)
    print(f"Loaded data for {n_rows} patents")
    return scores

def compute_novelty_statistics(df):
    """Summarise every column of ``df`` whose name starts with 'novelty_q'.

    Returns a DataFrame with one column per novelty column and one row per
    statistic: mean, median, std, min, max and the 25/75/90/95 % quantiles.
    """
    def _summarise(values):
        # Descriptive statistics for a single score column.
        return {
            'mean': values.mean(),
            'median': values.median(),
            'std': values.std(),
            'min': values.min(),
            'max': values.max(),
            '25%': values.quantile(0.25),
            '75%': values.quantile(0.75),
            '90%': values.quantile(0.90),
            '95%': values.quantile(0.95),
        }

    per_column = {
        name: _summarise(df[name])
        for name in df.columns
        if name.startswith('novelty_q')
    }

    # Tabular form is easier to inspect than a nested dict
    return pd.DataFrame(per_column)

def identify_novel_patents(df, q_col='novelty_q100', threshold=0.95):
    """
    Identify potentially novel patents using a quantile threshold

    Args:
        df: DataFrame with novelty scores
        q_col: Column to use for filtering (default: novelty_q100)
        threshold: Quantile (between 0 and 1) of ``q_col`` used as the cutoff
            (default: 0.95, i.e. keep patents at or above the 95th percentile)

    Returns:
        DataFrame with the patents whose ``q_col`` value is at or above the
        cutoff, sorted by ``q_col`` in descending order
    """
    cutoff = df[q_col].quantile(threshold)
    is_novel = df[q_col] >= cutoff
    novel_patents = df[is_novel].sort_values(by=q_col, ascending=False)

    # Rows at or above the `threshold` quantile are the top (1 - threshold)
    # share of the data, e.g. threshold=0.95 selects the top 5%, not the top 95%.
    top_share = (1 - threshold) * 100
    print(f"Identified {len(novel_patents)} patents with {q_col} >= {cutoff:.4f} (top {top_share:.1f}%)")
    return novel_patents

def classify_patents_by_novelty(df, q_col='novelty_q100'):
    """
    Label every patent as 'Low', 'Medium' or 'High' novelty

    Scores at or above the 90% quantile of ``q_col`` are 'High', scores at or
    above the 50% quantile are 'Medium', everything else is 'Low'.

    Args:
        df: DataFrame with novelty scores (left unmodified)
        q_col: Column to use for classification (default: novelty_q100)

    Returns:
        Copy of ``df`` with an added 'novelty_class' column
    """
    scores = df[q_col]
    median_cutoff = scores.quantile(0.50)
    top_decile_cutoff = scores.quantile(0.90)

    # Work on a copy so the caller's frame stays untouched
    labelled = df.copy()
    labelled['novelty_class'] = 'Low'

    # Apply the cutoffs from lowest to highest so the stronger label wins
    for label, cutoff in (('Medium', median_cutoff), ('High', top_decile_cutoff)):
        labelled.loc[labelled[q_col] >= cutoff, 'novelty_class'] = label

    # Report the size of each class that actually occurs
    class_sizes = labelled['novelty_class'].value_counts()
    print(f"Patent classifications based on {q_col}:")
    for label in ('High', 'Medium', 'Low'):
        if label not in class_sizes:
            continue
        print(f"  {label}: {class_sizes[label]} patents")

    return labelled

def plot_novelty_distributions(
    df,
    output_path='/home/thiesen/Documents/Projekt_EDV-TEK/AP 2 - Entstehung von TEKs/results/edv_tek_diffusion_patent_novelty_distributions.png',
):
    """Plot distributions of novelty scores for different q-values

    Draws one histogram (with KDE) per column whose name starts with
    'novelty_q', marks the 90th percentile and the median, and saves the
    figure to ``output_path``.

    Args:
        df: DataFrame with novelty scores
        output_path: Where to write the PNG. Defaults to the project's
            results directory; pass another path to run on a different machine.

    Returns:
        None. If ``df`` has no 'novelty_q*' columns, nothing is plotted.
    """
    # Extract q-value columns
    q_cols = [col for col in df.columns if col.startswith('novelty_q')]
    if not q_cols:
        # A subplot grid with zero rows cannot be created, so bail out early
        print("No 'novelty_q*' columns found; skipping novelty distributions plot")
        return

    # squeeze=False always returns a 2-D array of axes, so a single column
    # needs no special-casing
    fig, axes = plt.subplots(len(q_cols), 1, figsize=(12, 3*len(q_cols)), squeeze=False)

    # Plot histograms
    for ax, col in zip(axes[:, 0], q_cols):
        q_value = col.replace('novelty_q', '')
        sns.histplot(df[col], ax=ax, kde=True)
        ax.set_title(f'Distribution of Novelty Scores (q={q_value})')
        ax.set_xlabel('Novelty Score')
        ax.set_ylabel('Frequency')

        # Add vertical lines for thresholds
        ax.axvline(df[col].quantile(0.90), color='r', linestyle='--', label='90th percentile')
        ax.axvline(df[col].median(), color='g', linestyle='--', label='Median')
        ax.legend()

    fig.tight_layout()
    fig.savefig(output_path, dpi=300)
    # Close this specific figure so it is neither displayed nor kept in memory
    plt.close(fig)
    print(f"Saved novelty distributions plot to '{os.path.basename(output_path)}'")

def create_interactive_scatter(
    df,
    classified_df,
    output_path="/home/thiesen/Documents/Projekt_EDV-TEK/AP 2 - Entstehung von TEKs/results/edv_tek_diffusion_patent_novelty_analysis.html",
):
    """Create an interactive scatter plot of novelty scores

    Builds a two-panel plotly figure (q100 vs q90 and q100 vs q50), coloured by
    novelty class, and writes it to ``output_path`` as HTML.

    Args:
        df: Unused; kept so existing callers that pass it keep working.
        classified_df: DataFrame with the columns 'patent_id', 'novelty_class',
            'novelty_q100', 'novelty_q90' and 'novelty_q50'.
        output_path: Where to write the HTML file. Defaults to the project's
            results directory.

    Raises:
        KeyError: if ``classified_df`` lacks one of the required columns.
    """
    # Fail fast with a readable message before any figure is built
    required = ['patent_id', 'novelty_class', 'novelty_q100', 'novelty_q90', 'novelty_q50']
    missing = [col for col in required if col not in classified_df.columns]
    if missing:
        raise KeyError(f"classified_df is missing required columns: {missing}")

    # One entry per subplot: (y column, short hover label, subplot title, y-axis title)
    panels = [
        ('novelty_q90', 'q90', "Novelty q100 vs q90", "Novelty q90"),
        ('novelty_q50', 'q50', "Novelty q100 vs q50", "Novelty q50 (Median Distance)"),
    ]

    # Create a figure with subplots
    fig = make_subplots(rows=1, cols=len(panels),
                        subplot_titles=tuple(panel[2] for panel in panels))

    # Define colors for novelty classes
    colors = {'High': 'red', 'Medium': 'orange', 'Low': 'blue'}
    categories = ['High', 'Medium', 'Low']

    # Filter each class once; the same subsets feed every panel
    subsets = {
        category: classified_df[classified_df['novelty_class'] == category]
        for category in categories
    }

    for panel_col, (y_col, y_label, _title, y_axis_title) in enumerate(panels, start=1):
        for category in categories:
            subset = subsets[category]
            fig.add_trace(
                go.Scatter(
                    x=subset['novelty_q100'],
                    y=subset[y_col],
                    mode='markers',
                    name=category,
                    marker=dict(color=colors[category]),
                    text=subset['patent_id'].astype(str),
                    hovertemplate="Patent ID: %{text}<br>q100: %{x:.4f}<br>" + y_label + ": %{y:.4f}",
                    # Only the first panel contributes legend entries, to avoid duplicates
                    showlegend=(panel_col == 1)
                ),
                row=1, col=panel_col
            )

        # Update axes labels
        fig.update_xaxes(title_text="Novelty q100 (Maximum Distance)", row=1, col=panel_col)
        fig.update_yaxes(title_text=y_axis_title, row=1, col=panel_col)

    # Update layout
    fig.update_layout(
        title="Patent Novelty Analysis",
        height=600,
        width=1200,
        legend_title="Novelty Class"
    )

    # Save to HTML file
    fig.write_html(output_path)
    print(f"Saved interactive scatter plot to '{os.path.basename(output_path)}'")

def create_correlation_matrix(
    df,
    output_path='/home/thiesen/Documents/Projekt_EDV-TEK/AP 2 - Entstehung von TEKs/results/edv_tek_diffusion_patent_novelty_correlation.png',
):
    """Create a correlation matrix between different q-values

    Computes the pairwise correlation of all columns whose name starts with
    'novelty_q', draws it as an annotated heatmap and saves it to
    ``output_path``.

    Args:
        df: DataFrame with novelty scores
        output_path: Where to write the PNG. Defaults to the project's
            results directory.

    Returns:
        None. If ``df`` has no 'novelty_q*' columns, nothing is plotted.
    """
    # Extract q-value columns
    q_cols = [col for col in df.columns if col.startswith('novelty_q')]
    if not q_cols:
        # Without score columns the correlation matrix is empty; nothing to draw
        print("No 'novelty_q*' columns found; skipping correlation matrix")
        return

    # Compute correlation matrix
    corr = df[q_cols].corr()

    # Create heatmap on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
    ax.set_title('Correlation Between Different Novelty q-values')
    fig.tight_layout()
    fig.savefig(output_path, dpi=300)
    # Close this specific figure so it is neither displayed nor kept in memory
    plt.close(fig)
    print(f"Saved correlation matrix to '{os.path.basename(output_path)}'")

def main():
    """Run the full novelty analysis: load, summarise, classify, plot, report.

    Reads the novelty scores CSV from the results directory, prints summary
    statistics and the most novel patents, writes the classified patents to a
    separate CSV, and generates the distribution, correlation and scatter plots.
    """
    results_dir = '/home/thiesen/Documents/Projekt_EDV-TEK/AP 2 - Entstehung von TEKs/results'
    scores_path = f'{results_dir}/edv_tek_diffusion_patent_novelty_scores.csv'
    # Output goes to its own file so the input scores CSV is never overwritten
    classified_path = f'{results_dir}/edv_tek_diffusion_patent_novelty_classified_patents.csv'

    # Load data
    df = load_novelty_data(scores_path)

    # Compute statistics
    stats = compute_novelty_statistics(df)
    print("\nNovelty Score Statistics:")
    print(stats)

    # Identify novel patents
    novel_patents = identify_novel_patents(df, threshold=0.95)
    print("\nTop 5 Most Novel Patents:")
    print(novel_patents.head(5)[['patent_id', 'novelty_q100', 'novelty_q90', 'novelty_q50']])

    # Classify patents
    classified_df = classify_patents_by_novelty(df)

    # Save classifications to CSV
    classified_df.to_csv(classified_path, index=False)
    print("Saved classified patents to 'edv_tek_diffusion_patent_novelty_classified_patents.csv'")

    # Create visualizations
    plot_novelty_distributions(df)
    create_correlation_matrix(df)
    create_interactive_scatter(df, classified_df)

    # Print detailed analysis of the most novel patent
    if novel_patents.empty:
        # Nothing met the cutoff (e.g. empty input), so there is no row to report
        print("\nNo novel patents identified; skipping detailed analysis.")
        return

    most_novel_patent = novel_patents.iloc[0]
    print(f"\nMost Novel Patent Analysis:")
    print(f"Patent ID: {most_novel_patent['patent_id']}")
    for col in most_novel_patent.index:
        if col.startswith('novelty_q'):
            # Share of patents with a strictly lower score in this column
            percentile = (df[col] < most_novel_patent[col]).mean() * 100
            print(f"  {col}: {most_novel_patent[col]:.4f} (higher than {percentile:.1f}% of patents)")

# Entry point: run the whole analysis when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Loaded data for 318104 patents

Novelty Score Statistics:
        novelty_q100   novelty_q99   novelty_q95   novelty_q90   novelty_q80  \
mean    7.828048e-01  7.590337e-01  7.273379e-01  7.029529e-01  6.665899e-01   
median  8.136359e-01  8.008227e-01  7.682367e-01  7.397409e-01  6.955283e-01   
std     2.471160e-01  2.258277e-01  2.061745e-01  1.941677e-01  1.796376e-01   
min    -1.192093e-07 -1.192093e-07 -1.192093e-07 -1.192093e-07 -1.192093e-07   
max     1.344882e+00  1.243452e+00  1.236221e+00  1.227182e+00  1.226836e+00   
25%     6.018012e-01  5.993468e-01  5.895924e-01  5.782231e-01  5.557623e-01   
75%     9.872853e-01  9.461754e-01  8.891302e-01  8.434938e-01  7.853377e-01   
90%     1.082509e+00  1.020041e+00  9.617546e-01  9.265894e-01  8.765201e-01   
95%     1.122165e+00  1.050279e+00  1.001528e+00  9.726209e-01  9.349262e-01   

         novelty_q50  
mean    5.820950e-01  
median  5.912667e-01  
std     1.588439e-01  
min    -1.192093e-07  
max     1.226836e+00  
25%