In [1]:
import pandas as pd
from io import StringIO

In [2]:
source_io = ('PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked\n'
             '892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q\n'
             '893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S\n'
             '894,2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q\n'
             '895,3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S\n'
             '896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S\n'
             '897,3,"Svensson, Mr. Johan Cervin",male,14,0,0,7538,9.225,,S\n'
             '898,3,"Connolly, Miss. Kate",female,30,0,0,330972,7.6292,,Q\n'
             '899,2,"Caldwell, Mr. Albert Francis",male,26,1,1,248738,29,,S\n'
             '900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18,0,0,2657,7.2292,,C\n')

target_io = ('PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked\n'
             '892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q\n'
             '893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S\n'
             '894,2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q\n'
             '895,3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S\n'
             '896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S\n'
             )

In [3]:
compare_columns = ['PassengerId','Pclass', 'Name', 'Sex', 'Age', 'SibSp']

In [4]:
source = pd.read_csv(StringIO(source_io))
target = pd.read_csv(StringIO(target_io))

In [5]:
first_df_suffix = "_source"  
second_df_suffix = "_target"  

In [6]:
join_df = source.merge(target,how ="outer", on= compare_columns,suffixes=(first_df_suffix, second_df_suffix),indicator=True)


In [7]:
join_df =join_df[join_df['_merge']== 'both']

In [8]:
join_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch_source,Ticket_source,Fare_source,Cabin_source,Embarked_source,Parch_target,Ticket_target,Fare_target,Cabin_target,Embarked_target,_merge
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0.0,330911.0,7.8292,,Q,both
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0.0,363272.0,7.0,,S,both
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0.0,240276.0,9.6875,,Q,both
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0.0,315154.0,8.6625,,S,both
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1.0,3101298.0,12.2875,,S,both


In [9]:
join_df= join_df.reindex(columns=compare_columns + sorted([column for column in join_df.columns if column not in compare_columns]))
 

In [10]:
join_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Cabin_source,Cabin_target,Embarked_source,Embarked_target,Fare_source,Fare_target,Parch_source,Parch_target,Ticket_source,Ticket_target,_merge
0,892,3,"Kelly, Mr. James",male,34.5,0,,,Q,Q,7.8292,7.8292,0,0.0,330911,330911.0,both
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,,,S,S,7.0,7.0,0,0.0,363272,363272.0,both
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,,,Q,Q,9.6875,9.6875,0,0.0,240276,240276.0,both
3,895,3,"Wirz, Mr. Albert",male,27.0,0,,,S,S,8.6625,8.6625,0,0.0,315154,315154.0,both
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,,,S,S,12.2875,12.2875,1,1.0,3101298,3101298.0,both


In [11]:
[column for column in join_df.columns.values  if column not in compare_columns and column[0]!='_']

['Cabin_source',
 'Cabin_target',
 'Embarked_source',
 'Embarked_target',
 'Fare_source',
 'Fare_target',
 'Parch_source',
 'Parch_target',
 'Ticket_source',
 'Ticket_target']

In [12]:
common_columns= list(set(source.columns) & set(target.columns))
common_columns

['Fare',
 'Embarked',
 'Age',
 'Cabin',
 'Pclass',
 'Parch',
 'Sex',
 'SibSp',
 'Ticket',
 'Name',
 'PassengerId']

In [13]:
columns_to_compare = [(column+"_" + 'source' ,column+ "_" + 'target','compared_'+column)
                              for column in set(common_columns) - set(compare_columns) ]

In [14]:
columns_to_compare

[('Fare_source', 'Fare_target', 'compared_Fare'),
 ('Embarked_source', 'Embarked_target', 'compared_Embarked'),
 ('Cabin_source', 'Cabin_target', 'compared_Cabin'),
 ('Parch_source', 'Parch_target', 'compared_Parch'),
 ('Ticket_source', 'Ticket_target', 'compared_Ticket')]

In [15]:
for source_col,target_col,result_col in columns_to_compare:
    join_df[result_col] = join_df[source_col] == join_df[target_col]


In [16]:
join_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Cabin_source,Cabin_target,Embarked_source,Embarked_target,...,Parch_source,Parch_target,Ticket_source,Ticket_target,_merge,compared_Fare,compared_Embarked,compared_Cabin,compared_Parch,compared_Ticket
0,892,3,"Kelly, Mr. James",male,34.5,0,,,Q,Q,...,0,0.0,330911,330911.0,both,True,True,False,True,True
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,,,S,S,...,0,0.0,363272,363272.0,both,True,True,False,True,True
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,,,Q,Q,...,0,0.0,240276,240276.0,both,True,True,False,True,True
3,895,3,"Wirz, Mr. Albert",male,27.0,0,,,S,S,...,0,0.0,315154,315154.0,both,True,True,False,True,True
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,,,S,S,...,1,1.0,3101298,3101298.0,both,True,True,False,True,True


In [24]:
summary = pd.DataFrame()
summary_source = source.describe().T
summary_target = target.describe().T

In [25]:
describe_functions= ['count']

In [26]:
summary_columns = set(summary_source.columns) & set(summary_target.columns)
for column in summary_columns:
    if column in describe_functions:
        summary['source_'+column] = summary_source[column]
        summary['target_'+column] = summary_target[column]
        summary['variance_in_'+column] =summary['source_'+column] -summary['target_'+column]

In [27]:
summary

Unnamed: 0,source_count,target_count,variance_in_count
PassengerId,9.0,5.0,4.0
Pclass,9.0,5.0,4.0
Age,9.0,5.0,4.0
SibSp,9.0,5.0,4.0
Parch,9.0,5.0,4.0
Ticket,9.0,5.0,4.0
Fare,9.0,5.0,4.0
Cabin,0.0,0.0,0.0
