In [1]:
# Does not need to be executed if
# ~/.ipython/profile_default/ipython_config.py
# exists and contains:
# c.InteractiveShell.ast_node_interactivity = 'all'

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# # 1.0 Read files
# file_1 = 'file_1_1.txt'
# file_2 = 'file_1_2.txt'

# def shorten_lines(file):    
#     line_templates = {
#         "A changed line":'C',
#         "A line to delete":'D',
#         "A line to insert":'I',
#         "A line to change": 'TC',
#         "A line that stays":'S'}

#     shortened_lines = []
#     with open(file) as f1:
#         f1_lines = f1.readlines()
#         for line in f1_lines:
#             line_split = line.split(": ")
#             shortened_lines.append(line_templates[line_split[0]] + line_split[1].strip())
#     return shortened_lines


# file_1_shortened = shorten_lines(file_1)
# file_2_shortened = shorten_lines(file_2)
# print(file_1_shortened)
# print(file_2_shortened)

In [3]:
# Working Space
import re

def valid_file_lines(file_lines):
    # 4.1 If we have more than one comma, return false
    if len(re.findall("[,]",file_lines)) > 1:
        return False
    if file_lines.__contains__(','):
        n1, n2 = file_lines.split(',')
        # 4.2 If we have any non-numeric, return False
        if not n1.isnumeric() or not n2.isnumeric():
            return False
        # 4.3 If the 2nd number is smaller than the first, return False
        if int(n2) <= int(n1):
            return False
    return True


def check_input_lines(lines):
    valid_input = True
    lcs_size = 0
    file_1_prev = 0
    file_2_prev = 0
    
    # 0. Check for empty file
    if len(lines) == 0:
        return False
    
    for line in lines:
        
        # 1. Check for spaces in the line 
        # print(line.strip('\n'))
        if line.__contains__(' '):
            return False
        
        # 2. Check if we have a valid command
        command = re.findall("[A-z]",line)
        # print(command)
        if len(command) != 1 or not command[0] in ['a', 'd', 'c']:
            return False
        
        # 3. Retrieve 2x variables for the first file lines, command, and second file lines
        command = command[0]
        first_file_lines, second_file_lines = line.split(command)
        second_file_lines = second_file_lines.strip('\n')
        # print(first_file_lines + " " + command + " " + second_file_lines)
        
        # 4. Need to check if line numbers are valid
        if not valid_file_lines(first_file_lines) or not valid_file_lines(second_file_lines):
            return False
        
        # 5. Depending on the command we need to check different things
        # 5.1 Check for add commandd is of the format: [1]+[a]+[2.1,2.2]
        if command == 'a' : 
            if first_file_lines.__contains__(','):
                return False
        
        # 5.3 Check for delete commandd is of the format: [1.1,1.2]+[d]+[1]
        if command == 'd' : 
            if second_file_lines.__contains__(','):
                return False
            
        # 6. Check if the distances between previous commands is constant (i.e. checking for the 'same' lines)
        file_1_command_start, file_1_command_end = get_command_positions(first_file_lines, command, 1)
        file_2_command_start, file_2_command_end = get_command_positions(second_file_lines, command, 2)    
            
        # print(f'File 1: Prev = {file_1_prev} Command Start = {file_1_command_start}, Command End = {file_1_command_end}')
        # print(f'File 2: Prev = {file_2_prev} Command Start = {file_2_command_start}, Command End = {file_2_command_end}')
        # print(f'F1: Start - Prev = {file_1_command_start - file_1_prev}')
        # print(f'F2: Start - Prev = {file_2_command_start - file_2_prev}')
        
        if (file_1_command_start - file_1_prev) != (file_2_command_start - file_2_prev):
            return False
        else:
            # The difference is the lines that stay the same --> i.e. contribute to the # of common lines
            lcs_size += file_1_command_start - file_1_prev

        # Start of 2nd command is not strictly greater than the end of the previous command    
        if line != lines[0] and (file_1_command_start == file_1_prev or file_2_command_start == file_2_prev):
            # print('Start of 2nd command is not strictly greater than the end of the previous command')
            return False

        file_1_prev = file_1_command_end
        file_2_prev = file_2_command_end
        # print(f'LCS Implied = {lcs_size}')
        # print()
            
    return True
    
def get_command_positions(line_commands, command, file):
    # Initiailise the start and end positions depending on if we have a single line or a set (tuple) of lines
    if line_commands.__contains__(','):
        file_command_start = int(line_commands.split(',')[0])
        file_command_end = int(line_commands.split(',')[1])
    else:
        file_command_start = int(line_commands)
        file_command_end = int(line_commands)
        
    # Need to adjust start/end position based on what file and command we're working with 
    if command == "a" and file == 2:
        file_command_start -= 1
    if command == "c":
        file_command_start -= 1
    if command == "d" and file == 1:
        file_command_start -= 1
    return file_command_start, file_command_end
     
class DiffCommandsError(Exception):
    pass

class DiffCommands:
    def __init__(self, filename):
        with open(filename, 'r') as f:
            self.lines = f.readlines()
        if not check_input_lines(self.lines):
            raise DiffCommandsError('Cannot possibly be the commands for the diff of two files')
            
    def __str__(self):
        # TODO: last line has a '\n' that may need to be ignored
        return ''.join(self.lines).strip('\n')
    

class OriginalNewFiles:
    def __init__(self, original_filename, new_filename):
        with open(original_filename, 'r') as f1, open(new_filename, 'r') as f2:
            self.first_file = f1.readlines()
            self.second_file = f2.readlines()
            
    def print_first_file(self):
        print(''.join(self.first_file).strip('\n'))
    
    def print_second_file(self):
        print(''.join(self.second_file).strip('\n'))
    
    
 
    
    def output_diff(self, diff_file):
        for line in diff_file.lines:
            print(line.strip('\n'))
            command = re.findall("[A-z]",line)[0]
            first_file_lines, second_file_lines = line.split(command)
            second_file_lines = second_file_lines.strip('\n')

            file_1_command_start, file_1_command_end = get_command_positions(first_file_lines, command, 1)    
            file_2_command_start, file_2_command_end = get_command_positions(second_file_lines, command, 2)    

            if command == 'a':
                for i in range(file_2_command_start, file_2_command_end):
                    print(">", self.second_file[i].strip('\n'))

            if command == 'd':
                for i in range(file_1_command_start, file_1_command_end):
                    print("<", self.first_file[i].strip('\n'))

            if command == 'c':
                for i in range(file_1_command_start, file_1_command_end):
                    print("<", self.first_file[i].strip('\n'))
                print("---")
                for i in range(file_2_command_start, file_2_command_end):
                    print(">", self.second_file[i].strip('\n'))
    
    def output_unmodified(self, diff_file, command_skipped, file_to_output):
        prev = 0
        file = (self.first_file, self.second_file)[file_to_output - 1]
        
        for line in diff_file.lines:
            # print(line.strip('\n'))
            command = re.findall("[A-z]",line)[0]
            if command == command_skipped:
                continue
            file_lines = line.split(command)[file_to_output - 1]
            command_start, command_end = get_command_positions(file_lines, command, file_to_output)    
            # print(f'File Stats: Prev = {prev} Command Start = {command_start}, Command End = {command_end}')
            for i in range(prev, command_start):
                print(file[i].strip('\n'))
            prev = command_end
            print("...")

        for i in range(prev, len(file)):
            print(file[i].strip('\n'))

    def output_unmodified_from_original(self, diff_file):
        self.output_unmodified(diff_file, 'a', 1)
    
    def output_unmodified_from_new(self, diff_file):
        self.output_unmodified(diff_file, 'd', 2)
        
    def compute_transformation_table(self):
        deletion_cost = 1
        insertion_cost = 1
        substitution_cost = 2
        F_1 = len(self.first_file) + 1
        F_2 = len(self.second_file) + 1
        table = [[(0, []) for _ in range(F_2)] for _ in range(F_1)]

        for i in range(1, F_1):
            table[i][0] = i, ['-']
        for j in range(1, F_2):
            table[0][j] = j, ['|']

        d = {}

        for i in range(1, F_1):
            for j in range(1, F_2):
                d['-'] = table[i - 1][j][0] + deletion_cost
                d['|'] = table[i][j - 1][0] + insertion_cost
                d['/'] = table[i - 1][j - 1][0] if self.first_file[i - 1] == self.second_file[j - 1] else table[i - 1][j - 1][0] + substitution_cost
                minimal_cost = min(d.values())        
                table[i][j] = minimal_cost, [x for x in d if d[x] == minimal_cost]
        return table

    def compute_levenshtein_distance(self):
#         deletion_cost = 1
#         insertion_cost = 1
#         substitution_cost = 2
#         F_1 = len(self.first_file) + 1
#         F_2 = len(self.second_file) + 1
#         table = [[(0, []) for _ in range(F_2)] for _ in range(F_1)]

#         for i in range(1, F_1):
#             table[i][0] = i, ['-']
#         for j in range(1, F_2):
#             table[0][j] = j, ['|']

#         d = {}

#         for i in range(1, F_1):
#             for j in range(1, F_2):
#                 d['-'] = table[i - 1][j][0] + deletion_cost
#                 d['|'] = table[i][j - 1][0] + insertion_cost
#                 d['/'] = table[i - 1][j - 1][0] if self.first_file[i - 1] == self.second_file[j - 1] else table[i - 1][j - 1][0] + substitution_cost
#                 minimal_cost = min(d.values())        
#                 table[i][j] = minimal_cost, [x for x in d if d[x] == minimal_cost]

#         return table[len(self.first_file)][len(self.second_file)][0]
        return self.compute_transformation_table()[len(self.first_file)][len(self.second_file)][0]

    def compute_diff_file_cost(self, diff_file):
        cost = 0
        for line in diff_file.lines:
            command = re.findall("[A-z]",line)[0]
            first_file_lines, second_file_lines = line.split(command)
            second_file_lines = second_file_lines.strip('\n')

            file_1_command_start, file_1_command_end = get_command_positions(first_file_lines, command, 1)    
            file_2_command_start, file_2_command_end = get_command_positions(second_file_lines, command, 2)    

            if command == 'd':
                cost += file_1_command_end - file_1_command_start

            if command == 'a':
                cost += file_2_command_end - file_2_command_start    

            if command == 'c':
                cost += file_1_command_end - file_1_command_start
                cost += file_2_command_end - file_2_command_start    
            
        return cost
    
    def is_a_possible_diff(self, diff_file):
        if self.compute_diff_file_cost(diff_file) == self.compute_levenshtein_distance():
            return True
        else:
            return False
        
    def get_all_diff_commands():
        print("Testing get_all_diff_commands")
        return ["Diff_Files"]


In [4]:
# Wrong5, Wrong6, Wrong7
diff_1 = DiffCommands('diff_1.txt')
diff_2 = DiffCommands('diff_2.txt')
diff_3 = DiffCommands('diff_3.txt')
# DiffCommands('wrong_4.txt')
# DiffCommands('wrong_5.txt')
# DiffCommands('wrong_6.txt')
# DiffCommands('wrong_7.txt')

In [5]:
pair_of_files = OriginalNewFiles('file_1_1.txt', 'file_1_2.txt')

In [6]:
def compute_levenshtein_distance(self):
    return self.compute_transformation_table()[len(self.first_file)][len(self.second_file)][0]

def get_all_diff_commands(self):
    table = self.compute_transformation_table()
    backtraces = [[table[i][j][1] for j in range(len(self.second_file) + 1)
              ] for i in range(len(self.first_file) + 1)
             ]
    return backtraces

In [7]:
def compute_alignments(self, i, j, steps=[]):
    backtraces = get_all_diff_commands(pair_of_files)
    if i == j == 0:
        yield '', '', ''
    if '/' in backtraces[i][j]:
        for pair in compute_alignments(pair_of_files, i - 1, j - 1, steps):
            
            if self.first_file[i - 1] == self.second_file[j - 1]:
#                 print("s = ", end="")
#                 print("f1:" + self.first_file[i - 1], end="")
                pass
            else:
#                 print("c = ", end="")
#                 print("f1:" + self.first_file[i - 1].strip('\n') + "          " + "f2:" + self.second_file[j - 1], end="")
                
#                 steps.append(("C",self.first_file[i - 1], self.second_file[j-1]))
                steps.append(("C",i, j))
                
            yield pair[0] + self.first_file[i - 1], pair[1] + self.second_file[j - 1], steps
            
    if '-' in backtraces[i][j]:
        for pair in compute_alignments(pair_of_files, i - 1, j, steps):
#             print("d = ", end="")
#             print("f1:" + self.first_file[i - 1], end="")
            
#             steps.append(("D",self.first_file[i - 1], self.second_file[j-1]))
            steps.append(("D",i, j))
            
            yield pair[0] + self.first_file[i - 1], pair[1] + '_', steps
            
    if '|' in backtraces[i][j]:
        for pair in compute_alignments(pair_of_files, i, j - 1, steps):
#             print("a = ", end="")
#             print("f1:                             " + "f2:" + self.second_file[j - 1], end="")
#             steps.append(("A",self.first_file[i - 1], self.second_file[j-1]))
            steps.append(("A",i, j))
            yield pair[0] + '_', pair[1] + self.second_file[j - 1], steps
    steps = []
            

In [8]:
generator_list = compute_alignments(pair_of_files, len(pair_of_files.first_file), len(pair_of_files.second_file))

line1, line2, line3 = next(generator_list)


In [9]:
def convert_to_diff(first_start, first_prev, second_start, second_prev, prev_command):
    if first_start == first_prev:
        first_range = first_start
    else:
        first_range = f'{first_start},{first_prev}'

    if second_start == second_prev:
        second_range = second_start
    else: 
        second_range = f'{second_start},{second_prev}'
    
    return f'{first_range}{prev_command}{second_range}'
    
def convert_steps_to_diff_command(line_actions):
    current_command = ''
    first_start = 0
    first_prev = 0
    second_start = 0
    second_prev = 0
    first_range = None
    second_range = None
    prev_command = ''
    for line in line_actions:
        if prev_command == '':
            # Kick off the loop by setting the first command
            prev_command = line[0]
            
            first_start = line[1]
            first_prev = first_start
            
            second_start = line[2]
            second_prev = second_start
        else:
            if prev_command != line[0]:
                print(convert_to_diff(first_start, first_prev, second_start, second_prev, prev_command))
                prev_command = line[0]
                first_start = line[1]
                first_prev = line[1]
                second_start = line[2]
                second_prev = line[2]
            else:
                first_prev = line[1]
                second_prev = line[2]
    print(convert_to_diff(first_start, first_prev, second_start, second_prev, prev_command))
        
                
convert_steps_to_diff_command(line3)

1,2D0
3A2
5C4
7A7,8
10,13D10,12
14,16C13,15
