In [4]:
import sqlglot
from sqlglot import exp
from sql_nameguard.analyze import SQLAnalyzer

class SSCSCalculator:
    def __init__(self):
        # Configuration for weights
        self.weights = {
            exp.Join: 1,
            exp.Where: 1,
            exp.Group: 1,
            exp.Having: 1,
            exp.Order: 1,
            exp.Case: 2,           # Branching logic = higher load
            exp.Window: 2,         # Window functions are complex
            exp.Connector: 1,      # AND / OR
            exp.Subquery: 1        # Base penalty for existence of subquery
        }
        
        # Configuration for Semantic Penalty
        self.semantic_weight = 0.5  # Alpha in the formula
        self.min_alias_length = 3
        self.generic_aliases = {'temp', 'data', 't', 'x', 'val', 'obj', 'row'}
        self.analyzer = SQLAnalyzer()

    def calculate(self, sql: str):
        """
        Parses SQL and returns the SSCS score along with a detailed breakdown.
        """
        try:
            parsed = sqlglot.parse_one(sql)
        except Exception as e:
            return {"error": f"Parse Error: {e}"}

        # 1. Isolate CTEs and Main Query
        ctes = []
        main_query = parsed

        # If there is a WITH clause, extract CTEs
        if parsed.find(exp.CTE):
            # We treat CTEs as independent "functions" for complexity
            # Note: sqlglot stores CTEs in the 'with' arg of the main expression
            ctes = parsed.find_all(exp.CTE)
                # We analyze the main query as if the CTEs are just tables
                # (The complexity of defining the CTE is handled separately)


        
        
        # 2. Calculate Structural Complexity (C_struct)
        # Sum of CTE complexities + Main Query complexity
        struct_score = 0
        component_scores = []
        sscs_scores = {}

        # Analyze CTEs (Depth starts at 0 for each, promoting modularity)
        for cte in ctes:
            cte_score = self._compute_structural_score(cte.this, depth=0)
            struct_score += cte_score
            component_scores.append(f"CTE '{cte.alias}': {cte_score}")
            cte_penalty, _ = self._compute_semantic_penalty(cte.this)
            cte_sscs = cte_score * (1 + cte_penalty)
            sscs_scores[cte.alias] = {
                "SSCS": round(cte_sscs, 2),
                "Structural Score": round(cte_score, 2),
                "Semantic Penalty": round(cte_penalty, 2)
            }

        # Analyze Main Query (Depth starts at 0)
        # We explicitly exclude the WITH clause from traversal to avoid double counting
        main_score = self._compute_structural_score(main_query, depth=0, exclude_node=exp.With)
        struct_score += main_score
        component_scores.append(f"Main Query: {main_score}")
        main_penalty, _ = self._compute_semantic_penalty(main_query)
        main_sscs = main_score * (1 + main_penalty)

        # 3. Calculate Semantic Penalty (P_sem)
        # We look at all aliases across the entire parsed tree globally
        semantic_penalty, alias_stats = self._compute_semantic_penalty(parsed)

        main_sscs = main_score * (1 + semantic_penalty)

        # 4. Final Formula: SSCS = C_struct * (1 + P_sem)
        final_sscs = struct_score * (1 + semantic_penalty)

        sscs_scores["final SELECT"] = {
            "SSCS": round(main_sscs, 2),
            "Structural Score": round(main_score, 2),
            "Semantic Penalty": round(semantic_penalty, 2)
        }

        sscs_scores["overall"] = {
            "SSCS": round(final_sscs, 2),
            "Structural Score": round(struct_score, 2),
            "Semantic Penalty": round(semantic_penalty, 2)
        }

        return {
            "sscs_scores": sscs_scores,
            "breakdown": component_scores,
            "alias_analysis": alias_stats
        }
    
    def _compute_structural_score(self, node, depth, exclude_node=None):
        """
        Recursive visitor to calculate complexity weights based on AST nodes.
        Increases depth penalty for nested subqueries.
        """
        score = 0
        
        # If this node is the one we want to exclude (e.g. the CTE definitions block), stop recursion
        if exclude_node and isinstance(node, exclude_node):
            return 0

        # Apply Weight if node type is in our config
        if type(node) in self.weights:
            base_weight = self.weights[type(node)]
            # Formula: Weight + Depth Penalty
            # We add depth to the weight. Deeper logic is heavier.
            score += base_weight + (0.5 * depth)

        # Check for nesting triggers
        # If we enter a Subquery (SELECT inside FROM/WHERE), increment depth
        next_depth = depth
        if isinstance(node, exp.Subquery):
            next_depth += 1
        
        # Recursively visit children
        # sqlglot's args.values() gives us lists of children or single children
        for child_list in node.args.values():
            if isinstance(child_list, list):
                for child in child_list:
                    if isinstance(child, exp.Expression):
                        score += self._compute_structural_score(child, next_depth, exclude_node)
            elif isinstance(child_list, exp.Expression):
                score += self._compute_structural_score(child_list, next_depth, exclude_node)
                
        return score
    
    def _compute_semantic_penalty(self, root_node, threshold=0.7):
        """
        Uses semantic similarity analysis to compute penalty for poorly named aliases.
        
        Converts the parsed AST back to SQL, analyzes it with the SQLAnalyzer,
        and penalizes aliases with low similarity scores between the alias name
        and the code they represent.
        """
        # Convert parsed node back to SQL string for analyzer
        sql_str = root_node.sql()
        
        # Get semantic analysis from the analyzer
        analysis = self.analyzer.analyze(sql_str, threshold=threshold)
        
        if not analysis:
            return 0.0, {"total": 0, "low_similarity": [], "average_similarity": 0.0}
        
        # Calculate penalty based on similarity scores
        # Lower similarity = higher penalty
        similarities = [item['similarity'] for item in analysis]
        low_similarity_items = [item for item in analysis if item['similarity'] < threshold]
        
        # Penalty formula: average the "badness" (1 - similarity) for items below threshold
        if low_similarity_items:
            avg_badness = sum(1.0 - item['similarity'] for item in low_similarity_items) / len(analysis)
            penalty = self.semantic_weight * avg_badness
        else:
            penalty = 0.0
        
        avg_similarity = sum(similarities) / len(similarities) if similarities else 0.0
        
        return penalty, {
            "total": len(analysis), 
            "low_similarity_count": len(low_similarity_items),
            "low_similarity_examples": [
                {
                    "alias": item['alias'], 
                    "similarity": round(item['similarity'], 3),
                    "type": item['type']
                } 
                for item in low_similarity_items[:5]
            ],
            "average_similarity": round(avg_similarity, 3)
        }

In [5]:
# --- Example Usage ---

complex_sql = """
WITH revenue_cte AS (
    SELECT 
        c.id, 
        sum(o.amount) as total_rev
    FROM customers c
    JOIN orders o ON c.id = o.customer_id
    GROUP BY 1
),
risky_users AS (
    SELECT 
        id 
    FROM revenue_cte r
    WHERE r.total_rev > 10000 
      AND (CASE WHEN r.total_rev > 50000 THEN 1 ELSE 0 END) = 1
)
SELECT 
    t1.id, 
    t1.total_rev
FROM revenue_cte t1
LEFT JOIN (
    SELECT user_id, count(*) as c FROM logs GROUP BY 1
) t2 ON t1.id = t2.user_id
WHERE t1.id IN (SELECT id FROM risky_users)
"""

calc = SSCSCalculator()
result = calc.calculate(complex_sql)

Alias 'total_rev' appears to be a poor name for the code (similarity=0.460)

code:
SUM(o.amount)

Alias 'c' appears to be a poor name for the code (similarity=0.414)

code:
customers AS c

Alias 'o' appears to be a poor name for the code (similarity=0.484)

code:
orders AS o

Alias 'r' appears to be a poor name for the code (similarity=0.356)

code:
revenue_cte AS r

Alias 'revenue_cte' appears to be a poor name for the code (similarity=0.414)

code:
SELECT c.id, SUM(o.amount) AS total_rev FROM customers AS c JOIN orders AS o ON c.id = o.customer_id GROUP BY 1

Alias 'risky_users' appears to be a poor name for the code (similarity=0.270)

code:
SELECT id FROM revenue_cte AS r WHERE r.total_rev > 10000 AND (CASE WHEN r.total_rev > 50000 THEN 1 ELSE 0 END) = 1

Alias 'c' appears to be a poor name for the code (similarity=0.304)

code:
COUNT(*)

Alias 'total_rev' appears to be a poor name for the code (similarity=0.460)

code:
SUM(o.amount)

Alias 't1' appears to be a poor name for the co

In [7]:
result['sscs_scores']

{'revenue_cte': {'SSCS': 2.55,
  'Structural Score': 2.0,
  'Semantic Penalty': 0.27},
 'risky_users': {'SSCS': 3.97,
  'Structural Score': 3.0,
  'Semantic Penalty': 0.32},
 'final SELECT': {'SSCS': 5.84,
  'Structural Score': 4.5,
  'Semantic Penalty': 0.3},
 'overall': {'SSCS': 12.33, 'Structural Score': 9.5, 'Semantic Penalty': 0.3}}