# In [None]:
import pandas as pd
import numpy as np
import re

def search_match(row):
    """Tell whether a row matches on both actor and year.

    ``row`` is indexed by ``'actorName'``, ``'imdbActors'`` and
    ``'matchYear'``. Each entry of ``row['imdbActors']`` is tested with
    ``in`` against the actor name, so for string entries a substring hit
    counts as a match.

    Returns the string "Yes" when at least one entry contains the actor
    name and ``row['matchYear']`` equals 'Yes'; returns "No" otherwise.
    """
    # Keep the entries that contain the actor name; every entry is visited.
    entries_with_actor = [
        entry for entry in row['imdbActors'] if row['actorName'] in entry
    ]
    if not entries_with_actor:
        return "No"
    # The year flag is only consulted once an actor match has been found.
    return "Yes" if row['matchYear'] == 'Yes' else "No"

def match_actor(row):
    """Tell whether the row's actor name occurs in any IMDb actor entry.

    ``row`` is indexed by ``'actorName'`` and ``'imdbActors'``. Each entry
    of ``row['imdbActors']`` is tested with ``in`` against the actor name,
    so for string entries a substring hit counts as a match.

    Returns the string "Yes" when at least one entry contains the actor
    name, and "No" otherwise (including when there are no entries).
    """
    # Keep the entries that contain the actor name; every entry is visited.
    entries_with_actor = [
        entry for entry in row['imdbActors'] if row['actorName'] in entry
    ]
    return "Yes" if entries_with_actor else "No"


# Column names for the header-less, tab-separated input file.
cols = ["yagoID", "actorName", "predicate", "yagoTitle", "temporalPred", "yagoValidDate", "imdbID", "imdbTitle", "imdbRelease", "imdbYear", "imdbActors"]
df = pd.read_csv('imdb/actedIn_1-5205.csv', header=None, sep='\t', names=cols)
print("Tamanho df {}".format(len(df)))

# Turn the comma-separated imdbActors string of each row into a list.
df['imdbActors'] = df['imdbActors'].str.split(',')

# Parse the IMDb release date and render it back as a 'YYYY-MM-DD' string.
df['imdbDate'] = pd.to_datetime(df['imdbRelease'], yearfirst=True).dt.strftime('%Y-%m-%d')

# Remove '#' characters and runs of two or more '-' from the YAGO date.
# Non-string values (e.g. NaN for a missing date) are passed through
# unchanged: this step runs before any row is dropped, and the regex
# substitution raises TypeError on anything that is not a string.
p = re.compile(r'[#]+')
d = re.compile(r'[-]+-')
df['yagoDate'] = [
    d.sub('', p.sub('', x)) if isinstance(x, str) else x
    for x in df['yagoValidDate']
]

# Drop rows without an actor name or without an IMDb actor list, then
# replace '_' with ' ' in the actor names.
df = df.dropna(subset=['actorName', 'imdbActors'])
df['actorName'] = [name.replace('_', ' ') for name in df['actorName']]
print("Tamanho df depois de dropna {}".format(len(df)))

# 1 where the cleaned YAGO date string equals the IMDb date string, else 0.
# This must run before the trailing '-' is stripped from yagoDate below.
df['matchDate'] = np.where(df['imdbDate'] == df['yagoDate'], 1, 0)

# "Yes"/"No": does the YAGO actor appear in the IMDb actor list?
df['matchActor'] = df.apply(match_actor, axis=1)

# "Yes"/"No": is the YAGO year equal to the IMDb year?
df['yagoDate'] = df['yagoDate'].str.rstrip('-')
# Cast to str first: when every value is a plain number pandas infers a
# numeric column, which has no .str accessor. Only the first four
# characters are kept as the year.
df['imdbYear'] = df['imdbYear'].astype(str).str[:4].astype(np.int64)
df['matchYear'] = np.where(pd.DatetimeIndex(df['yagoDate']).year == df['imdbYear'], "Yes", "No")

# "Yes"/"No": do both the actor and the year match?
df['matchBoth'] = df.apply(search_match, axis=1)

print('Number of same dates {}'.format(df['matchDate'].sum()))
print('Number of same years {}'.format((df['matchYear'] == 'Yes').sum()))
print('Number of same actors {}'.format((df['matchActor'] == 'Yes').sum()))
print('Number of matches actor & year: {}'.format((df['matchBoth'] == 'Yes').sum()))

# Write the comparison columns to a tab-separated file.
df1 = df[['yagoID', 'actorName','yagoTitle', 'imdbTitle', 'imdbActors', 'yagoDate', 'imdbDate', 'matchDate', 'matchActor', 'matchYear', 'matchBoth']]
df1.to_csv('outputs/matches/actedIn_compare.csv', sep='\t', index=False)