# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

class FleissKappaAnalyzer:
    """
    A Fleiss' Kappa calculator for assessing inter-rater agreement
    among multiple AI models on indigenous plant classification.

    Formula: K = (P₀ - Pₑ) / (1 - Pₑ)
    Where:
      P₀ = observed agreement
      Pₑ = expected agreement by chance

    The attributes ``kappa``, ``p_value`` and ``interpretation`` hold the
    values from the most recent successful ``calculate_fleiss_kappa`` call
    and are ``None`` until then.
    """

    # Rating code -> human-readable label, shared by the report and the
    # detailed heatmap so both describe the categories the same way.
    _LABEL_MAP = {0: 'Medicinal', 1: 'Edible', 2: 'Poisonous', -1: 'No Results'}

    # Order of the labels on the detailed heatmap colorbar; the position of
    # a label in this tuple is the numeric code it is plotted with.
    _DISPLAY_ORDER = ('Medicinal', 'Edible', 'Poisonous', 'No Results')

    def __init__(self):
        self.kappa = None
        self.p_value = None
        self.interpretation = None

    def validate_input(self, ratings):
        """
        Validate the rating matrix and return it as an integer array.

        Parameters:
        -----------
        ratings : numpy array, pandas DataFrame, or list
            2-dimensional, non-empty matrix of integer-valued ratings

        Returns:
        --------
        numpy.ndarray : the ratings converted to ``int``

        Raises:
        -------
        ValueError : wrong container type, wrong dimensionality, empty
            matrix, or values that are not whole numbers
        """
        if not isinstance(ratings, (np.ndarray, pd.DataFrame, list)):
            raise ValueError("Input must be numpy array, pandas DataFrame, or list")

        ratings = np.array(ratings)

        if ratings.ndim != 2:
            raise ValueError("Input must be 2-dimensional matrix")

        if ratings.size == 0:
            raise ValueError("Input matrix cannot be empty")

        # np.mod raises TypeError for non-numeric dtypes (e.g. strings);
        # report that as the same ValueError as any other non-integer input.
        try:
            all_whole = np.all(np.equal(np.mod(ratings, 1), 0))
        except TypeError as exc:
            raise ValueError("All ratings must be integer values") from exc

        if not all_whole:
            raise ValueError("All ratings must be integer values")

        return ratings.astype(int)

    def calculate_fleiss_kappa(self, ratings, categories=None):
        """
        Calculate Fleiss' Kappa for multiple raters.

        Parameters:
        -----------
        ratings : array-like
            Matrix where rows are subjects and columns are raters
        categories : list, optional
            Rating codes to count. Defaults to every integer between the
            smallest and largest observed rating. Must be unique and must
            include every value that occurs in ``ratings``.

        Returns:
        --------
        dict : Contains kappa, p_value, interpretation, and detailed results.
            ``None`` is returned (after printing the reason) when the input
            is invalid.
        """
        try:
            # Input validation and preprocessing
            ratings = self.validate_input(ratings)
            n, k = ratings.shape  # n subjects, k raters

            if n < 2 or k < 2:
                raise ValueError("Need at least 2 subjects and 2 raters")

            # Determine categories automatically if not provided
            if categories is None:
                categories = list(range(int(np.min(ratings)), int(np.max(ratings)) + 1))
            category_codes = np.asarray(categories)

            # Every rating must land in exactly one category, otherwise the
            # per-subject counts do not sum to k and P₀ / Pₑ are meaningless.
            if np.unique(category_codes).size != category_codes.size:
                raise ValueError("Categories must be unique")
            if not np.isin(ratings, category_codes).all():
                raise ValueError("Ratings contain values not listed in categories")

            # Frequency matrix: freq_matrix[i, j] = number of raters that
            # assigned subject i to category j.
            freq_matrix = (
                (ratings[:, :, np.newaxis] == category_codes).sum(axis=1).astype(float)
            )

            # Calculate observed agreement (P₀)
            P0 = np.sum(freq_matrix * (freq_matrix - 1)) / (n * k * (k - 1))

            # Calculate expected agreement (Pₑ)
            p_j = np.sum(freq_matrix, axis=0) / (n * k)
            Pe = np.sum(p_j ** 2)

            # Pₑ == 1 means every rating fell into a single category; the
            # formula would divide by zero, so report perfect agreement.
            if Pe == 1:
                self.kappa = 1.0
            else:
                self.kappa = (P0 - Pe) / (1 - Pe)

            # Calculate statistical significance (reads self.kappa)
            self.p_value = self._calculate_significance(n, k, P0, Pe, p_j)

            # Interpret results
            self.interpretation = self._interpret_kappa(self.kappa)

            return {
                'kappa': self.kappa,
                'p_value': self.p_value,
                'interpretation': self.interpretation,
                'observed_agreement': P0,
                'expected_agreement': Pe,
                'n_subjects': n,
                'n_raters': k,
                'categories': categories
            }

        except (ValueError, TypeError) as e:
            print(f"Error calculating Fleiss' Kappa: {str(e)}")
            return None

    def _calculate_significance(self, n, k, P0, Pe, p_j=None):
        """
        Calculate a two-sided p-value for H0: kappa = 0.

        Parameters:
        -----------
        n, k : int
            Number of subjects and number of raters
        P0, Pe : float
            Observed and chance-expected agreement (P0 is accepted for
            signature compatibility; the test statistic uses self.kappa)
        p_j : array-like, optional
            Overall proportion of ratings in each category. When given, the
            standard error follows Fleiss, Nee & Landis (1979):
              Var = 2 * [(Σ pq)² - Σ pq(q - p)] / [(Σ pq)² * n * k * (k - 1)]
            When omitted, the coarser approximation sqrt(2(1 - Pₑ) / (n k (k - 1)))
            is used.

        Returns:
        --------
        float : p-value from the normal approximation (0.0 when Pₑ == 1)
        """
        if Pe == 1:
            return 0.0

        pairs = n * k * (k - 1)
        if p_j is None:
            variance = (2 * (1 - Pe)) / pairs
        else:
            p = np.asarray(p_j, dtype=float)
            q = 1 - p
            pq_sum = np.sum(p * q)  # equals 1 - Pe, non-zero on this path
            variance = (
                2 * (pq_sum ** 2 - np.sum(p * q * (q - p))) / (pq_sum ** 2 * pairs)
            )

        # Guard against a tiny negative variance from floating-point rounding
        se = np.sqrt(max(variance, 0.0))
        z_score = self.kappa / se if se > 0 else 0
        # sf() is the upper tail; more accurate than 1 - cdf() for large |z|
        p_value = 2 * stats.norm.sf(abs(z_score))

        return p_value

    def _interpret_kappa(self, kappa):
        """Interpret Kappa value using Landis & Koch scale"""
        # NaN fails every comparison below and would otherwise fall through
        # to the "Almost perfect" branch.
        if np.isnan(kappa):
            return "Undefined agreement"
        if kappa < 0:
            return "Poor agreement (less than chance)"
        elif kappa <= 0.2:
            return "Slight agreement"
        elif kappa <= 0.4:
            return "Fair agreement"
        elif kappa <= 0.6:
            return "Moderate agreement"
        elif kappa <= 0.8:
            return "Substantial agreement"
        else:
            return "Almost perfect agreement"

    @staticmethod
    def _consensus_flags(ratings):
        """
        Return one flag per subject: 1.0 if all valid ratings agree, else 0.0.

        Ratings equal to -1 are ignored. A subject with no valid rating is
        flagged 0.0; a subject with a single valid rating is flagged 1.0
        because its set of valid ratings has exactly one member.
        """
        flags = np.zeros(len(ratings))
        for i, subject_ratings in enumerate(ratings):
            distinct_valid = {r for r in subject_ratings if r != -1}
            flags[i] = len(distinct_valid) == 1
        return flags

    def create_agreement_heatmap(self, ratings, plant_names, model_names):
        """
        Create a heatmap visualization of agreement patterns.

        Each plant's row is a single agree/disagree flag repeated once per
        model, drawn green (agree) or red (disagree).
        """
        agreement_flags = self._consensus_flags(ratings)

        # Repeat the flag column once per model so the x tick labels line up
        agreement_display = np.tile(agreement_flags, (len(model_names), 1)).T

        plt.figure(figsize=(12, 8))
        sns.heatmap(agreement_display,
                   xticklabels=model_names,
                   yticklabels=plant_names,
                   cmap=['red', 'green'],
                   # Pin the scale so 0 is always red and 1 always green,
                   # even when every plant has the same flag.
                   vmin=0, vmax=1,
                   cbar_kws={'label': 'Agreement (Red=Disagree, Green=Agree)'})

        plt.title('Inter-Model Agreement Patterns on Indigenous Plant Classification')
        plt.xlabel('AI Models')
        plt.ylabel('Indigenous Plants')
        plt.tight_layout()
        plt.show()

    def create_detailed_heatmap(self, ratings, plant_names, model_names):
        """
        Create a detailed heatmap showing actual classifications.

        Returns:
        --------
        pandas.DataFrame : label for each plant (rows) and model (columns)

        Raises:
        -------
        KeyError : a rating code is not one of 0, 1, 2, -1
        """
        # Convert numerical ratings to descriptive labels for visualization
        rating_labels = [
            [self._LABEL_MAP[r] for r in plant_ratings] for plant_ratings in ratings
        ]

        rating_df = pd.DataFrame(rating_labels,
                               index=plant_names,
                               columns=model_names)

        # Use one fixed label -> code mapping for every column so a label
        # always gets the same colour and matches the colorbar tick labels.
        display_code = {label: code for code, label in enumerate(self._DISPLAY_ORDER)}
        code_df = pd.DataFrame(
            [[display_code[label] for label in row] for row in rating_labels],
            index=plant_names,
            columns=model_names,
        )

        plt.figure(figsize=(14, 10))

        # Create custom colormap (one colour per entry of _DISPLAY_ORDER)
        from matplotlib.colors import ListedColormap
        cmap = ListedColormap(['#2E8B57', '#FFD700', '#DC143C', '#696969'])

        ax = sns.heatmap(code_df,
                   cmap=cmap,
                   # Half-step margins centre each integer code in its band
                   vmin=-0.5, vmax=len(self._DISPLAY_ORDER) - 0.5,
                   xticklabels=model_names,
                   yticklabels=plant_names,
                   cbar_kws={'ticks': [0, 1, 2, 3],
                           'label': 'Classification'})

        # Customize colorbar labels
        cbar = ax.collections[0].colorbar
        cbar.set_ticklabels(list(self._DISPLAY_ORDER))

        plt.title('Detailed AI Model Classifications of Indigenous Plants\n(Actual Data from Table 1)')
        plt.xlabel('AI Models')
        plt.ylabel('Indigenous Plants')
        plt.tight_layout()
        plt.show()

        return rating_df

    def generate_report(self, results, plant_names):
        """
        Generate a text analysis report from a results dict.

        Parameters:
        -----------
        results : dict or None
            Output of ``calculate_fleiss_kappa``
        plant_names : list
            Accepted for interface compatibility; not used in the report

        Returns:
        --------
        str : the formatted report, or a failure message when ``results``
            is ``None``
        """
        if results is None:
            return "Analysis failed - invalid input data"

        # Describe the categories that were actually analysed; codes without
        # a known label are shown as their number.
        category_labels = [
            self._LABEL_MAP.get(code, str(code)) for code in results['categories']
        ]

        report = f"""
        FLEISS' KAPPA ANALYSIS REPORT - INDIGENOUS PLANT CLASSIFICATION
        {'=' * 60}
        
        Dataset Summary:
        - Number of indigenous plants: {results['n_subjects']}
        - Number of AI models: {results['n_raters']}
        - Categories: {category_labels}
        
        Agreement Statistics:
        - Fleiss' Kappa: {results['kappa']:.3f}
        - Observed Agreement (P₀): {results['observed_agreement']:.3f}
        - Expected Agreement (Pₑ): {results['expected_agreement']:.3f}
        - Statistical Significance: p = {results['p_value']:.4f}
        
        Interpretation:
        - {results['interpretation']}
        - This indicates {('significant' if results['p_value'] < 0.05 else 'non-significant')} inter-model reliability
        - The models show {results['interpretation'].lower()} in classifying indigenous plants
        
        RESEARCH IMPLICATIONS:
        • This kappa value demonstrates the level of consistency across AI models
        • Low agreement highlights bias and lack of standardized IKS knowledge
        • Supports the need for integrating Indigenous Knowledge Systems into AI training
        """

        return report

# ACTUAL DATA FROM TABLE 1 - Converted to numerical format
def prepare_actual_classification_data():
    """
    Return the Table 1 classifications in numerical form for Fleiss' Kappa.

    Encoding: 0 = Medicinal, 1 = Edible, 2 = Poisonous, -1 = No results/Not accurate

    Returns a pair ``(actual_classifications, plant_names)``: a list with one
    ``[ChatGPT, Gemini, Mistral AI]`` code list per plant, and the list of
    plant names in the same order. Fresh lists are built on every call.
    """
    # One row per plant: (name, (ChatGPT, Gemini, Mistral AI))
    table_1 = [
        ("Aloe ferox", (0, 0, 1)),                 # Medicinal, Medicinal, Edible
        ("African ginger", (0, 0, 0)),             # All Medicinal
        ("Wild rosemary", (-1, -1, -1)),           # All No results
        ("Devil's claw", (0, 0, 0)),               # All Medicinal
        ("African wormwood", (0, 0, 0)),           # All Medicinal
        ("Pepperbark tree", (-1, -1, -1)),         # All Not accurate (treated as disagreement)
        ("Pineapple flower", (0, 0, 0)),           # All Medicinal
        ("Spekboom", (-1, -1, -1)),                # All Not accurate
        ("False horsewood", (2, -1, -1)),          # Poisonous, No results, Not accurate
        ("Sand raisin", (-1, -1, 1)),              # No results, Not accurate, Edible
        ("Mountain nettle", (-1, -1, -1)),         # All Not accurate
        ("Acacia", (0, 2, 2)),                     # Medicinal, Poisonous, Poisonous
        ("River karee", (0, -1, -1)),              # Medicinal, No results, Not accurate
        ("Kudu lily", (0, -1, 0)),                 # Medicinal, Not accurate, Medicinal
        ("Waterberg raisin", (-1, -1, -1)),        # All Not accurate
        ("Sweet wild garlic", (-1, -1, -1)),       # All No results
        ("Cyrtanthus sanguineus", (-1, -1, 1)),    # Not accurate, Not accurate, Edible
        ("Ruttya fruticosa", (-1, 0, -1)),         # No results, Medicinal, Not accurate
        ("Sesamum trilobum", (-1, -1, 1)),         # No results, No results, Edible
        ("Aloe hahnii", (-1, 0, -1)),              # No results, Medicinal, Not accurate
    ]

    plant_names = [name for name, _ in table_1]
    actual_classifications = [list(codes) for _, codes in table_1]

    return actual_classifications, plant_names

# MAIN EXECUTION
if __name__ == "__main__":
    # Analyzer plus the Table 1 data it operates on
    analyzer = FleissKappaAnalyzer()
    ratings, plant_names = prepare_actual_classification_data()
    model_names = ["ChatGPT", "Gemini", "Mistral AI"]

    banner = "ANALYZING ACTUAL AI CLASSIFICATION DATA FROM TABLE 1"
    print(banner)
    print("=" * 55)

    # Kappa over the four rating codes, with -1 ("no result") as a category
    results = analyzer.calculate_fleiss_kappa(ratings, categories=[-1, 0, 1, 2])

    # Everything below needs a successful analysis
    if results:
        report = analyzer.generate_report(results, plant_names)
        print(report)

        # Visualizations
        print("\nGenerating Agreement Visualization...")
        analyzer.create_agreement_heatmap(ratings, plant_names, model_names)

        print("\nGenerating Detailed Classification Heatmap...")
        detailed_df = analyzer.create_detailed_heatmap(ratings, plant_names, model_names)

        # Summary table for the dissertation: (metric, formatted value) pairs
        summary_rows = [
            ('Fleiss Kappa', f"{results['kappa']:.3f}"),
            ('P-value', f"{results['p_value']:.4f}"),
            ('Observed Agreement', f"{results['observed_agreement']:.3f}"),
            ('Expected Agreement', f"{results['expected_agreement']:.3f}"),
            ('Interpretation', results['interpretation']),
        ]
        results_df = pd.DataFrame({
            'Metric': [metric for metric, _ in summary_rows],
            'Value': [value for _, value in summary_rows],
        })

        print("\nRESULTS SUMMARY FOR DISSERTATION:")
        print(results_df)

        # Agreement rate: a plant counts when its non-(-1) ratings are all
        # the same value and at least one such rating exists
        total_agreements = 0
        for plant_ratings in ratings:
            valid_ratings = [r for r in plant_ratings if r != -1]
            total_agreements += bool(valid_ratings) and len(set(valid_ratings)) == 1

        agreement_rate = total_agreements / len(ratings)
        print("\nAdditional Metrics:")
        print(f"• Overall Agreement Rate: {agreement_rate:.1%} ({total_agreements}/{len(ratings)} plants)")
        print(f"• Number of Plants with Full Consensus: {total_agreements}")
        print(f"• Number of Plants with Disagreement: {len(ratings) - total_agreements}")

        # Per-plant table for the appendix; code + 1 indexes the label list,
        # so -1 maps to 'No Results'
        class_labels = ['No Results', 'Medicinal', 'Edible', 'Poisonous']
        detailed_results = []
        for i, plant in enumerate(plant_names):
            row = ratings[i]
            distinct_valid = {r for r in row if r != -1}
            detailed_results.append({
                'Plant_Name': plant,
                'ChatGPT': class_labels[row[0] + 1],
                'Gemini': class_labels[row[1] + 1],
                'Mistral_AI': class_labels[row[2] + 1],
                'Consensus': 'Yes' if len(distinct_valid) == 1 else 'No'
            })

        detailed_df = pd.DataFrame(detailed_results)
        detailed_df.to_csv('ai_model_classification_analysis.csv', index=False)
        print("\nDetailed results saved to 'ai_model_classification_analysis.csv'")
   

# Captured notebook output: ModuleNotFoundError: No module named 'pandas'