# Analyzing differences between the tools
This script calculates the differences between the outputs of two tools. You can control which tools are compared by changing the `TOOL_A` and `TOOL_B` variables.

In [None]:
from helpers.sqlite_helpers import sql_query_to_pd

# Pair of tools being compared. Both names are interpolated into table and
# column names inside the SQL queries of this notebook (for example
# `{TOOL_A}_result` and `{TOOL_A}_{TOOL_B}_equivalency_per_commit`).
# NOTE(review): presumably the database must contain the pair-specific tables
# for the chosen names in this exact order -- verify against the schema.
(TOOL_A, TOOL_B) = ('jdime', 'last_merge')
# Alternative pairs: keep exactly one assignment active to switch the comparison.
# (TOOL_A, TOOL_B) = ('spork', 'mergiraf')
# (TOOL_A, TOOL_B) = ('last_merge', 'mergiraf')

## Situation 1 - Both tools integrate without conflicts and their outputs are syntactically equivalent

In [None]:
# Scenario ids for which both tools report SUCCESS_WITHOUT_CONFLICTS and the
# pair's equivalency table has outputs_equivalent = 1.
# The SQL text is kept in a variable because the visualization cells embed it
# as a subquery.
# NOTE(review): outputs_equivalent and the *_result columns are unqualified;
# presumably outputs_equivalent comes from `epc` and the *_result columns from
# `gepc` -- confirm against the schema.
scenarios_in_which_both_tools_integrated_without_conflict_and_outputs_are_equivalent_query = f"""
  SELECT
    gepc.scenario_id
  FROM
    global_executions_per_commit_filtered gepc
  JOIN
    {TOOL_A}_{TOOL_B}_equivalency_per_commit epc
  ON
    gepc.scenario_id = epc.scenario_id
  WHERE
    outputs_equivalent = 1
    AND {TOOL_A}_result = 'SUCCESS_WITHOUT_CONFLICTS'
    AND {TOOL_B}_result = 'SUCCESS_WITHOUT_CONFLICTS'
"""

# Run the query; being the last expression of the cell, its result is shown as
# the cell output.
sql_query_to_pd(scenarios_in_which_both_tools_integrated_without_conflict_and_outputs_are_equivalent_query)

## Situation 2 - Tools integrate without conflicts, but there are syntactic differences

In [None]:
# Scenario ids for which both tools report SUCCESS_WITHOUT_CONFLICTS but the
# pair's equivalency table has outputs_equivalent = 0.
# Same join and result filters as the "outputs are equivalent" query; only the
# outputs_equivalent value differs.
# The SQL text is kept in a variable because the visualization cells embed it
# as a subquery.
scenarios_in_which_both_tools_integrated_without_conflict_and_outputs_are_different_query = f"""
  SELECT
    gepc.scenario_id
  FROM
    global_executions_per_commit_filtered gepc
  JOIN
    {TOOL_A}_{TOOL_B}_equivalency_per_commit epc
  ON
    gepc.scenario_id = epc.scenario_id
  WHERE
    outputs_equivalent = 0
    AND {TOOL_A}_result = 'SUCCESS_WITHOUT_CONFLICTS'
    AND {TOOL_B}_result = 'SUCCESS_WITHOUT_CONFLICTS'
"""

# Run the query; being the last expression of the cell, its result is shown as
# the cell output.
sql_query_to_pd(scenarios_in_which_both_tools_integrated_without_conflict_and_outputs_are_different_query)

## Situation 3 - With conflicts and the conflicts are equal

In [None]:
# Rows of global_executions (scenario, file and both tools' results) where the
# two tools' results for that row differ, restricted to scenarios for which
# global_executions_per_commit_filtered reports SUCCESS_WITH_CONFLICTS for
# both tools.
# NOTE(review): presumably global_executions holds one row per file of a
# scenario, so a differing row means the tools disagree on that file --
# confirm against the schema.
# The SQL text is reused as a subquery by the situation 3 and situation 4
# queries.
scenarios_in_which_tools_found_conflict_on_different_files_query = f"""
  SELECT
    scenario_id,
    file_path,
    {TOOL_A}_result,
    {TOOL_B}_result
  FROM
    global_executions
  WHERE
    scenario_id IN (
      SELECT
        gepc.scenario_id
      FROM
        global_executions_per_commit_filtered gepc
      WHERE
        {TOOL_A}_result = 'SUCCESS_WITH_CONFLICTS'
        AND {TOOL_B}_result = 'SUCCESS_WITH_CONFLICTS'
    )
    AND
    {TOOL_A}_result != {TOOL_B}_result
"""
# Keep the result in a variable and echo it as the cell output.
scenarios_in_which_tools_found_conflict_on_different_files = sql_query_to_pd(scenarios_in_which_tools_found_conflict_on_different_files_query)
scenarios_in_which_tools_found_conflict_on_different_files

In [None]:
# Scenarios from the pair's conflicts-information table that satisfy all of:
#   * both tools ended with SUCCESS_WITH_CONFLICTS in
#     global_executions_per_commit_filtered;
#   * the scenario is NOT returned by the different-files query (embedded
#     below as an exclusion subquery);
#   * the number of distinct files equals sum(all_conflicts_match).
# NOTE(review): presumably all_conflicts_match is a 0/1 flag, so the HAVING
# clause means "every file's conflicts match" -- confirm against the schema.
# NOTE(review): the SELECT list shows count(file_path) while HAVING compares
# count(distinct file_path); the two only agree if the table has one row per
# file -- confirm.
# NOTE(review): if sum(all_conflicts_match) can be NULL for a scenario, the
# comparison is NULL and the scenario is dropped here -- confirm whether that
# can happen.
scenarios_in_which_both_tools_found_conflict_and_they_are_equal_query = f"""
  SELECT
    scenario_id,
    count(file_path),
    sum(all_conflicts_match)
  FROM
    {TOOL_A}_{TOOL_B}_global_conflicts_information
  WHERE
    scenario_id IN (
      SELECT
        gepc.scenario_id
      FROM
        global_executions_per_commit_filtered gepc
      WHERE
        {TOOL_A}_result = 'SUCCESS_WITH_CONFLICTS'
        AND {TOOL_B}_result = 'SUCCESS_WITH_CONFLICTS'
    )
    AND scenario_id NOT IN (SELECT scenario_id FROM ({scenarios_in_which_tools_found_conflict_on_different_files_query}))
    GROUP BY
      scenario_id
    HAVING
      count(distinct file_path) == sum(all_conflicts_match)
"""

# Run the query; being the last expression of the cell, its result is shown as
# the cell output.
sql_query_to_pd(scenarios_in_which_both_tools_found_conflict_and_they_are_equal_query)

## Situation 4 - With conflicts but the conflicts are different

In [None]:
# Scenarios from the pair's conflicts-information table for which both tools
# ended with SUCCESS_WITH_CONFLICTS and the number of distinct files differs
# from sum(all_conflicts_match).
# This query is not executed in this cell; it is only embedded in the union
# query defined right after it.
# Unlike the "conflicts are equal" query, there is no exclusion of the
# different-files scenarios here.
# NOTE(review): presumably all_conflicts_match is a 0/1 flag, so the HAVING
# clause means "at least one file's conflicts do not match" -- confirm against
# the schema.
scenarios_in_which_both_tools_found_conflict_on_the_same_file_but_they_are_different_query = f"""
  SELECT
    scenario_id,
    count(file_path),
    sum(all_conflicts_match)
  FROM
    {TOOL_A}_{TOOL_B}_global_conflicts_information
  WHERE
    scenario_id IN (
      SELECT
        gepc.scenario_id
      FROM
        global_executions_per_commit_filtered gepc
      WHERE
        {TOOL_A}_result = 'SUCCESS_WITH_CONFLICTS'
        AND {TOOL_B}_result = 'SUCCESS_WITH_CONFLICTS'
    )
    GROUP BY
      scenario_id
    HAVING
      count(distinct file_path) != sum(all_conflicts_match)
"""

# Situation 4 scenario ids: the union of
#   * scenarios returned by the same-file-but-different query, and
#   * scenarios returned by the different-files query.
# UNION (not UNION ALL) removes duplicate ids, so a scenario returned by both
# subqueries is listed once.
scenarios_in_which_both_tools_found_conflict_but_they_are_different_query = f"""
  SELECT
    scenario_id
  FROM
    ({scenarios_in_which_both_tools_found_conflict_on_the_same_file_but_they_are_different_query})
  UNION
  SELECT
    scenario_id
  FROM
    ({scenarios_in_which_tools_found_conflict_on_different_files_query})
"""

# Keep the result in a variable and echo it as the cell output.
scenarios_in_which_both_tools_found_conflict_but_they_are_different = sql_query_to_pd(scenarios_in_which_both_tools_found_conflict_but_they_are_different_query)
scenarios_in_which_both_tools_found_conflict_but_they_are_different

## Situation 5 - Disagreement about the existence of conflicts

In [None]:
# Scenarios in global_executions_per_commit_filtered whose two tool results
# are not the same value, together with project, merge SHA and both results.
# NOTE(review): the filter is a plain inequality, so any pair of distinct
# result values qualifies. If results other than SUCCESS_WITH_CONFLICTS /
# SUCCESS_WITHOUT_CONFLICTS can appear in this table, those rows are included
# as well -- confirm what the "filtered" table can contain.
scenarios_with_distinct_results_query = f"""
  SELECT
    scenario_id, project, merge_sha, {TOOL_A}_result, {TOOL_B}_result
  FROM
    global_executions_per_commit_filtered gepc
  WHERE
    {TOOL_A}_result != {TOOL_B}_result
"""

# Run the query; being the last expression of the cell, its result is shown as
# the cell output.
sql_query_to_pd(scenarios_with_distinct_results_query)

In [None]:
# For each tool, count the scenarios with distinct results in which that tool
# reported SUCCESS_WITH_CONFLICTS. Because the two results differ in every
# such scenario, the other tool reported something else.
# The `tool` column is the upper-cased tool name, computed in Python when the
# f-string is built.
# The two one-row SELECTs are combined with UNION; no ORDER BY is given.
count_of_distinct_results_by_tool_query = f"""
  SELECT
    '{TOOL_A.upper()}' as tool,
    count(scenario_id) as exclusive_conflicts_count
  FROM
    ({scenarios_with_distinct_results_query})
  WHERE
    {TOOL_A}_result = 'SUCCESS_WITH_CONFLICTS'
  UNION
  SELECT
    '{TOOL_B.upper()}' as tool,
    count(scenario_id) as exclusive_conflicts_count
  FROM
    ({scenarios_with_distinct_results_query})
  WHERE
    {TOOL_B}_result = 'SUCCESS_WITH_CONFLICTS'
"""

# Run the query; being the last expression of the cell, its result is shown as
# the cell output.
sql_query_to_pd(count_of_distinct_results_by_tool_query)

## Condensed visualization

In [None]:
# SQL expression giving the share (in %, rounded to 2 decimals) of the rows
# being aggregated relative to the number of scenarios in
# global_executions_per_commit_filtered.
# Note: this name is assigned again, with a different expression, in the
# granular visualization cell.
scenarios_percentage_subquery = f"""ROUND(COUNT(*) * 100.0 / (SELECT COUNT(scenario_id) FROM global_executions_per_commit_filtered), 2) as Percentage"""

# Two-row summary built from the queries defined in the earlier cells:
#   * 'Same results'      = situation 1 (no conflicts, equivalent outputs)
#                           UNION situation 3 (conflicts that are equal);
#   * 'Different results' = situation 2 (no conflicts, different outputs)
#                           UNION situation 5 (distinct results)
#                           UNION situation 4 (conflicts that differ).
# The inner UNIONs remove duplicate scenario ids before they are counted.
# The outer UNION has no ORDER BY.
sql_query_to_pd(f"""
  SELECT
    'Same results' as Situation, COUNT(*) as Total, {scenarios_percentage_subquery}
  FROM
    (
      SELECT scenario_id FROM ({scenarios_in_which_both_tools_integrated_without_conflict_and_outputs_are_equivalent_query})
      UNION
      SELECT scenario_id FROM ({scenarios_in_which_both_tools_found_conflict_and_they_are_equal_query})
    )
  UNION
  SELECT
    'Different results' as Situation, COUNT(*) as Total, {scenarios_percentage_subquery}
  FROM
    (
      SELECT scenario_id FROM ({scenarios_in_which_both_tools_integrated_without_conflict_and_outputs_are_different_query})
      UNION
      SELECT scenario_id FROM ({scenarios_with_distinct_results_query})
      UNION
      SELECT scenario_id FROM ({scenarios_in_which_both_tools_found_conflict_but_they_are_different_query})
    )
""")

## Granular visualization

In [None]:
# SQL expression giving the share (in %, rounded to 2 decimals) of distinct
# scenario ids in the current group relative to the number of scenarios in
# global_executions_per_commit_filtered.
# Note: this rebinds the name used by the condensed visualization cell, this
# time counting distinct scenario ids instead of rows.
scenarios_percentage_subquery = f"""ROUND(COUNT(distinct scenario_id) * 100.0 / (SELECT COUNT(scenario_id) FROM global_executions_per_commit_filtered), 2) as Percentage"""

# One (situation, scenario_id) row per scenario returned by each of the five
# situation queries defined in the earlier cells. The numeric prefix in each
# label lets the rows be ordered by situation with a plain ORDER BY on the
# label.
# UNION removes duplicate (situation, scenario_id) pairs only; a scenario
# returned by two different situation queries would appear once under each
# label.
scenario_resume_query = f"""
  SELECT
    '1. No conflicts and syntactically equivalent' as situation, scenario_id
  FROM
    ({scenarios_in_which_both_tools_integrated_without_conflict_and_outputs_are_equivalent_query})
  UNION
  SELECT
    '2. No conflicts but syntactically different' as situation, scenario_id
  FROM
    ({scenarios_in_which_both_tools_integrated_without_conflict_and_outputs_are_different_query})
  UNION
  SELECT
    '3. With conflicts and equal/subset textually' as situation, scenario_id
  FROM
    ({scenarios_in_which_both_tools_found_conflict_and_they_are_equal_query})
  UNION
  SELECT
    '4. With conflicts but not equal/subset textually' as situation, scenario_id
  FROM
    ({scenarios_in_which_both_tools_found_conflict_but_they_are_different_query})
  UNION
  SELECT
    '5. Disagreement on the existence of conflicts' as situation, scenario_id
  FROM
    ({scenarios_with_distinct_results_query})
"""

# Per-situation totals and percentages: groups the labelled
# (situation, scenario_id) rows of scenario_resume_query by situation, counts
# the scenario ids in each group, appends the percentage expression held in
# scenarios_percentage_subquery, and orders the rows by the situation label.
visualization_query = f"""
  SELECT
    situation,
    COUNT(scenario_id) as Total,
    {scenarios_percentage_subquery}
  FROM
    ({scenario_resume_query})
  GROUP BY
    situation
  ORDER BY
    situation
"""
# Run the query; being the last expression of the cell, its result is shown as
# the cell output.
sql_query_to_pd(visualization_query)