In [1]:
import pandas as pd
from sqlalchemy import create_engine
import difflib
from difflib import ndiff, unified_diff
from nltk import sent_tokenize
import re

#### Step 1: Read data from the database, run simple checks, and write the suspicious content to a file.

In [None]:
def clean_text(text):
    """Normalize one speaker-text string for comparison.

    Applies, in order: removal of the literal ``]]></Body>`` markup
    remnant, replacement of every newline with a space, and collapsing of
    any run of two or more whitespace characters into a single space.
    Leading and trailing whitespace is stripped from the result.

    Args:
        text: the raw speaker text.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement) pairs; order matters because the newline
    # replacement can create the double spaces that the last rule collapses.
    # Stripping stray \xef / \xc3 characters was tried here previously and is
    # currently disabled.
    substitutions = (
        (r"]]></Body>", ""),
        (r"\n", " "),
        (r"\s{2,}", " "),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

engine = create_engine("postgresql://aaz.chicagobooth.edu/postgres")

for round in range(10):
    df = pd.read_sql_query("SELECT * FROM speaker_data_merged_zjy ORDER BY RANDOM() LIMIT 100000", con=engine)

    df["speaker_text.y.clean"] = df["speaker_text.x"].map(clean_text)
    df["speaker_text.x.clean"] = df["speaker_text.y"].map(clean_text)

    tmp = []
    fname = '/home/zjy/result_' + str(round) + '.txt'
    f = open(fname, "a")

    for i in range(len(df)):
        sents_x = sent_tokenize(df["speaker_text.y.clean"][i])
        sents_y = sent_tokenize(df["speaker_text.x.clean"][i])
        if sents_x == sents_y:
            continue

        for diff in unified_diff(sents_x, sents_y):
            if diff and type(diff) == unicode:
                if diff.startswith("+") or diff.startswith("-"):
                    #print i
                    tmpdiff = diff.split()
                    tmpdiff = ' '.join(tmpdiff)
                    f.write(tmpdiff.encode('utf-8'))
                    # f.read().decode('utf8')
    
    f.close()

    print 'End of round', round


#### Step 2: Double-check the result files and report any differences.

In [79]:
from os import listdir
fpath = '/home/zjy/git/compare_result/'
for fname in listdir(fpath):
    fname = fpath + fname
    flg = 0
    
    # read file
    f = open(fname, 'r')
    content = f.read().decode('utf-8')
    f.close()
    
    if len(content) == 0:
        print 'No difference. Empty file:', fname
        continue
        
    tmp_content = re.split('\.\-|\.\+|\?\-|\?\+', content)

    # rm the first +/- sign
    tmp_content[0] = tmp_content[0][1:]
    # rm the last sign
    tmp_content[-1] = tmp_content[-1][:len(tmp_content[-1])-1]

    if len(tmp_content) % 2 != 0:
        flg = 1
        print 'Error: odd sentences in file', fname
        print tmp_content

    n = len(tmp_content) / 2
    for i in range(n):
        if tmp_content[i * 2] != tmp_content[i * 2 + 1]:
            flg = 1
            print 'Attn: difference detected in file', fname
            print tmp_content[i * 2]
            print tmp_content[i * 2 + 1]
    if flg == 0:
        print 'No difference:', fname


No difference. Empty file: /home/zjy/git/compare_result/result_3.txt
No difference: /home/zjy/git/compare_result/result_5.txt
No difference: /home/zjy/git/compare_result/result_8.txt
No difference: /home/zjy/git/compare_result/result_9.txt
No difference. Empty file: /home/zjy/git/compare_result/result_0.txt
No difference. Empty file: /home/zjy/git/compare_result/result_4.txt
No difference: /home/zjy/git/compare_result/result_2.txt
No difference. Empty file: /home/zjy/git/compare_result/result_6.txt
No difference. Empty file: /home/zjy/git/compare_result/result_7.txt
No difference. Empty file: /home/zjy/git/compare_result/result_1.txt
