In [22]:
import os
import pandas as pd
import pprint
from functools import reduce

In [29]:
# Tab-separated alignment files, listed in the order they are loaded.
file_paths = [
    'expert_fully_aligned.txt',
    'expert_partially_aligned.txt',
    'automatic_fully_aligned.txt',
]

def read_data(file_paths):
    """Load the ``Wiki`` sentence column from each tab-separated alignment file.

    Every file is parsed as a headerless TSV whose three columns are named
    ``Wiki``, ``SimpleWiki`` and ``score``. Only ``Wiki`` is kept, and
    tokenisation artefacts in its text are normalised with a fixed sequence
    of regular-expression substitutions.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the files to read. Each path is printed as it is processed.

    Returns
    -------
    list of pandas.DataFrame
        One single-column (``Wiki``) frame per path, in input order.
        An empty input yields an empty list.
    """
    # (pattern, replacement) pairs, applied in this order. The double-backtick
    # rule must run before the single-backtick rule, otherwise an opening
    # "`` " would be half-consumed by the latter.
    cleanup_rules = [
        (r"\s+(?=[),'.;%:])", ""),  # drop whitespace before closing punctuation
        (r"`` ", "''"),             # opening double backtick + space -> ''
        (r"\(\s+", "("),            # drop whitespace after an opening parenthesis
        (r"`\s+", "'"),             # single backtick + whitespace -> '
        (r"\\/", "/"),              # escaped slash "\/" -> "/"
    ]

    dataframes = []

    for file_path in file_paths:
        print(file_path)
        df = pd.read_csv(file_path, sep="\t", header=None,
                         names=["Wiki", "SimpleWiki", "score"])
        df = df.drop(columns=["SimpleWiki", "score"])

        wiki = df["Wiki"]
        for pattern, replacement in cleanup_rules:
            # regex=True must be explicit: since pandas 2.0 the default is a
            # literal match, which would silently skip every rule above.
            wiki = wiki.str.replace(pattern, replacement, regex=True)
        df["Wiki"] = wiki

        dataframes.append(df)

    return dataframes

In [30]:
# Load every alignment file; the list holds one DataFrame per entry of
# file_paths, in the same order.
dataframes = read_data(file_paths)  # [df1, df2, df3]
dataframes

expert_fully_aligned.txt
expert_partially_aligned.txt
automatic_fully_aligned.txt


[                                                   Wiki
 0     Under conditions of high humidity, the rate of...
 1     The lack of oxygen above 2,400 metres (8,000 f...
 2     The human body can adapt to high altitude by b...
 3     For example, hemoglobin and myoglobin contain ...
 4     Schistosomiasis, caused by one genus of tremat...
 ...                                                 ...
 2262  Involved medical researchers at the University...
 2263  Her condition is similar to many other conditi...
 2264  Velazquez has a condition that is so rare that...
 2265  Two polio vaccines are used throughout the wor...
 2266  Therefore, interruption of person to person tr...
 
 [2267 rows x 1 columns],
                                                    Wiki
 0     Because humans perceive the rate of heat trans...
 1     Creatine has the ability to increase muscle st...
 2     The Mayo Clinic states that creatine has been ...
 3     A study, involving 18 vegetarians and 24 non-v...
 4 

In [36]:
# Show every row of the third frame (loaded from automatic_fully_aligned.txt)
# whose text is exactly this sentence.
# NOTE(review): the saved output of this cell was produced out of order
# (execution count 36): it shows the "origin" column, which is only inserted
# by a cell further down. A fresh top-to-bottom run would not show it here.
dataframes[2].loc[dataframes[2].Wiki=='Lower urinary tract infection is also referred to as a bladder infection.']

Unnamed: 0,origin,Wiki
156,wiki_auto,Lower urinary tract infection is also referred...
157,wiki_auto,Lower urinary tract infection is also referred...
158,wiki_auto,Lower urinary tract infection is also referred...
159,wiki_auto,Lower urinary tract infection is also referred...
160,wiki_auto,Lower urinary tract infection is also referred...
161,wiki_auto,Lower urinary tract infection is also referred...
162,wiki_auto,Lower urinary tract infection is also referred...
163,wiki_auto,Lower urinary tract infection is also referred...
164,wiki_auto,Lower urinary tract infection is also referred...
165,wiki_auto,Lower urinary tract infection is also referred...


In [31]:
# Origin label for each loaded file, index-aligned with file_paths.
file_origin_names = [
    'wiki_fully',
    'wiki_partially',
    'wiki_auto',
]

In [32]:
# Tag every frame with the label of the file it came from, as the first column.
for i, df in enumerate(dataframes):
    origin_name = file_origin_names[i]
    if "origin" in df.columns:
        # Re-running the cell: DataFrame.insert would raise ValueError because
        # the column already exists, so overwrite its values instead.
        df["origin"] = origin_name
    else:
        df.insert(0, "origin", origin_name)

dataframes

[          origin                                               Wiki
 0     wiki_fully  Under conditions of high humidity, the rate of...
 1     wiki_fully  The lack of oxygen above 2,400 metres (8,000 f...
 2     wiki_fully  The human body can adapt to high altitude by b...
 3     wiki_fully  For example, hemoglobin and myoglobin contain ...
 4     wiki_fully  Schistosomiasis, caused by one genus of tremat...
 ...          ...                                                ...
 2262  wiki_fully  Involved medical researchers at the University...
 2263  wiki_fully  Her condition is similar to many other conditi...
 2264  wiki_fully  Velazquez has a condition that is so rare that...
 2265  wiki_fully  Two polio vaccines are used throughout the wor...
 2266  wiki_fully  Therefore, interruption of person to person tr...
 
 [2267 rows x 2 columns],
               origin                                               Wiki
 0     wiki_partially  Because humans perceive the rate of heat trans..

In [33]:
# Stack the per-file frames in list order. A chained outer merge on all shared
# columns only behaves like a concatenation by accident, and pandas documents
# that outer merges sort the join keys lexicographically (applied consistently
# since pandas 2.2). That would reorder rows by origin and change which
# duplicate a later keep-first de-duplication retains.
merged_df = pd.concat(dataframes, ignore_index=True)
print(len(merged_df))
merged_df

9212


Unnamed: 0,origin,Wiki
0,wiki_fully,"Under conditions of high humidity, the rate of..."
1,wiki_fully,"The lack of oxygen above 2,400 metres (8,000 f..."
2,wiki_fully,The human body can adapt to high altitude by b...
3,wiki_fully,"For example, hemoglobin and myoglobin contain ..."
4,wiki_fully,"Schistosomiasis, caused by one genus of tremat..."
...,...,...
9207,wiki_auto,The deletion may range from 5 million to 16 mi...
9208,wiki_auto,Privacy is the ability of an individual or gro...
9209,wiki_auto,All countries have laws which in some way limi...
9210,wiki_auto,Some cancer cells also have abnormal numbers o...


In [34]:
# Collapse rows that share the same Wiki text.
# Row counts from the saved run: 9212 before, 8618 after, so 594 rows removed.
merged_df = merged_df.drop_duplicates(subset=['Wiki'])
print(len(merged_df))
merged_df


8618


Unnamed: 0,origin,Wiki
0,wiki_fully,"Under conditions of high humidity, the rate of..."
1,wiki_fully,"The lack of oxygen above 2,400 metres (8,000 f..."
2,wiki_fully,The human body can adapt to high altitude by b...
3,wiki_fully,"For example, hemoglobin and myoglobin contain ..."
4,wiki_fully,"Schistosomiasis, caused by one genus of tremat..."
...,...,...
9207,wiki_auto,The deletion may range from 5 million to 16 mi...
9208,wiki_auto,Privacy is the ability of an individual or gro...
9209,wiki_auto,All countries have laws which in some way limi...
9210,wiki_auto,Some cancer cells also have abnormal numbers o...


In [35]:
# Look the same sentence up again in the merged frame.
merged_df[merged_df['Wiki'].eq('Lower urinary tract infection is also referred to as a bladder infection.')]

Unnamed: 0,origin,Wiki
1466,wiki_fully,Lower urinary tract infection is also referred...
