In [2]:
import pandas as pd
# Load the DeepSeek R1 distill results file, then report the number of rows
# and how many rows fall under each value of the `problem_type` column.
df = pd.read_csv(
    '/Users/serenapei/llmination-reasoning/results_deepseek-r1-distill-llama-70b.csv'
)
print("Total samples: {}".format(len(df)))
print("Problem types: {}".format(df['problem_type'].value_counts()))

Total samples: 604
Problem types: problem_type
large_numbers         121
impossible_context    121
ambiguous             121
paradox               121
irrelevant_info       120
Name: count, dtype: int64


In [11]:
#!/usr/bin/env python3
"""
Simplified DeepSeek R1 Model Analysis - Focused on Memorization vs Reasoning Analysis
"""

import pandas as pd

class DeepSeekAnalyzer:
    """Print memorization / computation summaries for a model-results CSV.

    The CSV is loaded with ``pd.read_csv``, the known metric columns are
    coerced to numeric, and two boolean flag columns are added:

    * ``high_memorization`` -- ``memorization_pct`` > ``memorization_threshold``
    * ``high_computation``  -- ``computation_pct``  > ``computation_threshold``

    Args:
        csv_path: Anything ``pd.read_csv`` accepts (path or file-like object).
        memorization_threshold: Cut-off for the ``high_memorization`` flag.
            Defaults to 30, the value previously hard-coded.
        computation_threshold: Cut-off for the ``high_computation`` flag.
            Defaults to 50, the value previously hard-coded.

    Raises:
        KeyError: If ``memorization_pct`` or ``computation_pct`` is missing
            from the CSV (raised by the column lookup in ``prepare_data``).
    """

    # Columns converted with pd.to_numeric when they exist in the CSV.
    NUMERIC_COLS = ('confidence', 'response_time', 'memorization_pct',
                    'reasoning_pct', 'exploration_pct', 'uncertainty_pct',
                    'computation_pct', 'total_sentences')

    def __init__(self, csv_path, memorization_threshold=30, computation_threshold=50):
        """Load the CSV and derive the flag columns."""
        self.memorization_threshold = memorization_threshold
        self.computation_threshold = computation_threshold
        self.df = pd.read_csv(csv_path)
        self.prepare_data()

    @staticmethod
    def _pct(part, whole):
        """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
        return (part / whole) * 100 if whole else 0.0

    def prepare_data(self):
        """Coerce metric columns to numeric and add the two boolean flags."""
        for col in self.NUMERIC_COLS:
            if col in self.df.columns:
                # errors='coerce' turns unparseable cells into NaN; NaN compares
                # False against the thresholds below, so such rows are "low".
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce')

        self.df['high_memorization'] = self.df['memorization_pct'] > self.memorization_threshold
        self.df['high_computation'] = self.df['computation_pct'] > self.computation_threshold

    def analyze_behavior_patterns(self):
        """Print the memorization breakdown and the high/low computation table.

        Output goes to stdout; nothing is returned. An empty results file is
        reported with zero counts instead of raising ZeroDivisionError.
        """
        total = len(self.df)

        print("\n--- Memorization vs Reasoning Analysis ---")
        high_mem_cases = self.df[self.df['high_memorization']]
        n_high_mem = len(high_mem_cases)
        print(f"Cases with high memorization (>{self.memorization_threshold}%): "
              f"{n_high_mem} ({self._pct(n_high_mem, total):.1f}%)")

        if n_high_mem > 0:
            print("\nProblem types with highest memorization:")
            high_mem_by_type = high_mem_cases['problem_type'].value_counts()
            for prob_type, count in high_mem_by_type.items():
                # Share is relative to the high-memorization subset, not all rows.
                pct = self._pct(count, n_high_mem)
                print(f"  {prob_type}: {count} ({pct:.1f}%)")

        print("\n--- Computation Correlation Analysis ---")
        n_high_comp = int(self.df['high_computation'].sum())
        print(f"High computation cases (>{self.computation_threshold}%): "
              f"{n_high_comp} ({self._pct(n_high_comp, total):.1f}%)")

        if total == 0:
            # Nothing to group; skip the aggregate table for an empty file.
            print("\nNo samples available for the high vs low computation comparison.")
            return

        comp_stats = self.df.groupby('high_computation').agg({
            'response_time': ['mean', 'std'],
            'confidence': ['mean', 'std'],
            'reasoning_pct': 'mean'
        }).round(2)
        print("\nComparison of high vs low computation cases:")
        print(comp_stats)

# Main execution
if __name__ == "__main__":
    # TODO: point this at the results CSV of whichever model should be analysed.
    analyzer = DeepSeekAnalyzer(
        '/Users/serenapei/llmination-reasoning/results_gemini-2.0-flash.csv'
    )
    # Prints the memorization and computation summaries to stdout.
    analyzer.analyze_behavior_patterns()


--- Memorization vs Reasoning Analysis ---
Cases with high memorization (>30%): 27 (4.5%)

Problem types with highest memorization:
  ambiguous: 8 (29.6%)
  large_numbers: 7 (25.9%)
  paradox: 5 (18.5%)
  irrelevant_info: 4 (14.8%)
  impossible_context: 3 (11.1%)

--- Computation Correlation Analysis ---
High computation cases (>50%): 120 (19.8%)

Comparison of high vs low computation cases:
                 response_time       confidence        reasoning_pct
                          mean   std       mean    std          mean
high_computation                                                    
False                     6.38  3.09      94.25  13.94         54.64
True                      5.00  2.81      95.44  14.07         21.17


In [17]:
# Revision of the analyzer above: also reports mean memorization_pct for high vs. low computation cases
#!/usr/bin/env python3
"""
Simplified DeepSeek R1 Model Analysis - Focused on Memorization vs Reasoning Analysis
Now with memorization comparison between high/low computation cases
"""

import pandas as pd

class DeepSeekAnalyzer:
    """Print memorization / computation summaries for a model-results CSV.

    The CSV is loaded with ``pd.read_csv``, the known metric columns are
    coerced to numeric, and two boolean flag columns are added:

    * ``high_memorization`` -- ``memorization_pct`` > ``memorization_threshold``
    * ``high_computation``  -- ``computation_pct``  > ``computation_threshold``

    Args:
        csv_path: Anything ``pd.read_csv`` accepts (path or file-like object).
        memorization_threshold: Cut-off for the ``high_memorization`` flag.
            Defaults to 30, the value previously hard-coded.
        computation_threshold: Cut-off for the ``high_computation`` flag.
            Defaults to 50, the value previously hard-coded.

    Raises:
        KeyError: If ``memorization_pct`` or ``computation_pct`` is missing
            from the CSV (raised by the column lookup in ``prepare_data``).
    """

    # Columns converted with pd.to_numeric when they exist in the CSV.
    NUMERIC_COLS = ('confidence', 'response_time', 'memorization_pct',
                    'reasoning_pct', 'exploration_pct', 'uncertainty_pct',
                    'computation_pct', 'total_sentences')

    def __init__(self, csv_path, memorization_threshold=30, computation_threshold=50):
        """Load the CSV and derive the flag columns."""
        self.memorization_threshold = memorization_threshold
        self.computation_threshold = computation_threshold
        self.df = pd.read_csv(csv_path)
        self.prepare_data()

    @staticmethod
    def _pct(part, whole):
        """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
        return (part / whole) * 100 if whole else 0.0

    def prepare_data(self):
        """Coerce metric columns to numeric and add the two boolean flags."""
        for col in self.NUMERIC_COLS:
            if col in self.df.columns:
                # errors='coerce' turns unparseable cells into NaN; NaN compares
                # False against the thresholds below, so such rows are "low".
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce')

        self.df['high_memorization'] = self.df['memorization_pct'] > self.memorization_threshold
        self.df['high_computation'] = self.df['computation_pct'] > self.computation_threshold

    def analyze_behavior_patterns(self):
        """Print the memorization breakdown and the high/low computation table.

        Output goes to stdout; nothing is returned. An empty results file is
        reported with zero counts instead of raising ZeroDivisionError, and a
        file with only high- or only low-computation rows reports "n/a" for
        the absent group instead of raising KeyError.
        """
        total = len(self.df)

        print("\n--- Memorization vs Reasoning Analysis ---")
        high_mem_cases = self.df[self.df['high_memorization']]
        n_high_mem = len(high_mem_cases)
        print(f"Cases with high memorization (>{self.memorization_threshold}%): "
              f"{n_high_mem} ({self._pct(n_high_mem, total):.1f}%)")

        if n_high_mem > 0:
            print("\nProblem types with highest memorization:")
            high_mem_by_type = high_mem_cases['problem_type'].value_counts()
            for prob_type, count in high_mem_by_type.items():
                # Share is relative to the high-memorization subset, not all rows.
                pct = self._pct(count, n_high_mem)
                print(f"  {prob_type}: {count} ({pct:.1f}%)")

        print("\n--- Computation Correlation Analysis ---")
        n_high_comp = int(self.df['high_computation'].sum())
        print(f"High computation cases (>{self.computation_threshold}%): "
              f"{n_high_comp} ({self._pct(n_high_comp, total):.1f}%)")

        if total == 0:
            # Nothing to group; skip the aggregate table for an empty file.
            print("\nNo samples available for the high vs low computation comparison.")
            return

        # Per-group aggregates, including mean memorization percentage.
        comp_stats = self.df.groupby('high_computation').agg({
            'response_time': ['mean', 'std'],
            'confidence': ['mean', 'std'],
            'reasoning_pct': 'mean',
            'memorization_pct': 'mean'
        }).round(2)

        print("\nComparison of high vs low computation cases:")
        print(comp_stats)

        print("\nMemorization by computation intensity:")
        for flag, label in ((False, "Low"), (True, "High")):
            # A group is absent from the index when no row has that flag value;
            # looking it up unconditionally would raise KeyError.
            if flag in comp_stats.index:
                avg_mem = comp_stats.loc[flag, ('memorization_pct', 'mean')]
                print(f"{label} computation cases avg memorization: {avg_mem}%")
            else:
                print(f"{label} computation cases avg memorization: n/a (no such cases)")
# Main execution
if __name__ == "__main__":
    # TODO: point this at the results CSV of whichever model should be analysed.
    analyzer = DeepSeekAnalyzer(
        '/Users/serenapei/llmination-reasoning/results_llama3-70b-8192.csv'
    )
    # Prints the memorization and computation summaries to stdout.
    analyzer.analyze_behavior_patterns()


--- Memorization vs Reasoning Analysis ---
Cases with high memorization (>30%): 47 (7.8%)

Problem types with highest memorization:
  large_numbers: 13 (27.7%)
  ambiguous: 10 (21.3%)
  impossible_context: 9 (19.1%)
  irrelevant_info: 8 (17.0%)
  paradox: 7 (14.9%)

--- Computation Correlation Analysis ---
High computation cases (>50%): 117 (19.3%)

Comparison of high vs low computation cases:
                 response_time       confidence        reasoning_pct  \
                          mean   std       mean    std          mean   
high_computation                                                       
False                     5.42  2.84      92.16  18.54         66.58   
True                      5.43  3.09      91.02  22.01         13.76   

                 memorization_pct  
                             mean  
high_computation                   
False                        5.36  
True                         3.66  

Memorization by computation intensity:
Low computation cases