In [3]:
import difflib

def check_similarity(file1, file2, threshold=0.7):
    """
    Find identical or similar lines between two text files.

    Every non-blank line of ``file1`` is compared with every non-blank line
    of ``file2`` using ``difflib.SequenceMatcher``. Lines are stripped of
    leading/trailing whitespace before comparison. Blank (or whitespace-only)
    lines are ignored: two empty strings always score 1.0, so keeping them
    would report every pair of blank lines as an identical match.

    Args:
        file1 (str): Path to the first .txt file (read as UTF-8).
        file2 (str): Path to the second .txt file (read as UTF-8).
        threshold (float): Minimum similarity ratio, from 0.0 to 1.0, for a
            pair of lines to be reported (default is 0.7).

    Returns:
        list: A list of tuples (line_file1, line_file2, similarity) for
        similar or identical lines, where similarity is a ratio between
        0.0 and 1.0. Results are ordered by position in file1, then by
        position in file2.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If either file is not valid UTF-8.
    """
    # Read both files, keeping only the stripped, non-empty lines.
    with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
        lines1 = [text for text in (raw.strip() for raw in f1) if text]
        lines2 = [text for text in (raw.strip() for raw in f2) if text]

    similar_lines = []

    # Compare each line in file1 with each line in file2.
    for line1 in lines1:
        for line2 in lines2:
            matcher = difflib.SequenceMatcher(None, line1, line2)
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(); if either is already below the threshold the pair
            # cannot qualify, so the expensive ratio() call is skipped.
            if matcher.real_quick_ratio() < threshold:
                continue
            if matcher.quick_ratio() < threshold:
                continue
            similarity = matcher.ratio()
            if similarity >= threshold:
                similar_lines.append((line1, line2, similarity))

    return similar_lines


# Example usage: point these two paths at the text files to compare.
file1_path = 'MLR_Selected_query.txt'  # first input file
file2_path = 'MLR_snowballing.txt'  # second input file

# Collect every pair of lines that meets the default similarity threshold.
result = check_similarity(file1_path, file2_path)

# Report the outcome: either a "nothing found" notice or one entry per match.
if not result:
    print("No identical or similar lines found.")
else:
    print("Identical or similar lines were found:")
    for line1, line2, similarity in result:
        # The similarity is a 0-1 ratio; the format spec renders it as a percentage.
        print(f"Line 1: {line1}\nLine 2: {line2}\nSimilarity: {similarity:.2%}\n")


Identical or similar lines were found:
Line 1: Title
Line 2: Title
Similarity: 100.00%

Line 1: Architecting Digital Twins
Line 2: Architecting Digital Twins
Similarity: 100.00%

Line 1: Cloud-Based Battery Digital Twin Middleware Using Model-Based Development
Line 2: Cloud-Based Battery Digital Twin Middleware Using Model-Based Development
Similarity: 100.00%

