In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
with open('player_dbpedia.xml', 'r') as f:
    player_dbpedia = f.read()

In [3]:
# parsing the data
# leagues, positions, teams, awards are not correctly read
df_dbpedia = pd.read_xml(player_dbpedia).drop(columns=["birthPlace", "leagues", "positions", "teams", "awards"])
df_dbpedia = df_dbpedia.drop_duplicates(subset=["name", "birthDate"])

In [4]:
len(df_dbpedia)

4154

In [6]:
df_dbpedia.head()

Unnamed: 0,id,name,birthDate,height,weight,startYear,endYear
0,DBpedia_id_1,Erik Martin (basketball),1971-05-26,200.66,99.79,1993-01-01,2002-01-01
1,DBpedia_id_2,Aleks Marić,1984-10-22,210.82,124.74,2008-01-01,2017-01-01
2,DBpedia_id_3,Aaron Miles (basketball),1983-04-13,185.42,79.38,2005-01-01,2015-01-01
3,DBpedia_id_4,Scoonie Penn,1977-01-09,180.34,81.65,2000-01-01,2011-01-01
4,DBpedia_id_5,Elston Turner,1959-06-10,195.58,86.18,1981-01-01,1995-01-01


In [7]:
with open('player_stat.xml', 'r') as f:
    player_stat = f.read()

In [8]:
# parsing the data
df_stat = pd.read_xml(player_stat).drop(columns=["position", "college"])

In [9]:
len(df_stat)

4550

In [11]:
df_stat.head()

Unnamed: 0,id,name,year_start,year_end,height,weight,birth_date
0,player_stat_id_1,Alaa Abdelnaby,1991,1995,185.93,108.84,1968-06-24
1,player_stat_id_2,Zaid Abdul-Aziz,1969,1978,210.31,106.58,1946-04-07
2,player_stat_id_3,Kareem Abdul-Jabbar,1970,1989,219.46,102.04,1947-04-16
3,player_stat_id_4,Mahmoud Abdul-Rauf,1991,2001,185.93,73.47,1969-03-09
4,player_stat_id_5,Tariq Abdul-Wahad,1998,2003,201.17,101.13,1974-11-03


In [12]:
# use a subset of the data or the size of the cartesian product will be too large
df_dbpedia_sample = df_dbpedia.sample(n=2000, random_state=1)
df_stat_sample = df_stat.sample(n=2000, random_state=1)

In [13]:
# cartesian product of datasets
df = pd.merge(df_dbpedia_sample.assign(key=1), df_stat_sample.assign(key=1), on='key').drop('key', axis=1)

In [14]:
len(df)

4000000

In [15]:
df.head()

Unnamed: 0,id_x,name_x,birthDate,height_x,weight_x,startYear,endYear,id_y,name_y,year_start,year_end,height_y,weight_y,birth_date
0,DBpedia_id_6744,Vaughn Duggins,1987-07-10,,86.0,2011-01-01,,player_stat_id_2721,Ken Menke,1950,1950,182.88,76.19,1922-10-02
1,DBpedia_id_6744,Vaughn Duggins,1987-07-10,,86.0,2011-01-01,,player_stat_id_788,Joe Cooke,1971,1971,192.02,79.37,1948-08-14
2,DBpedia_id_6744,Vaughn Duggins,1987-07-10,,86.0,2011-01-01,,player_stat_id_3133,Charlie Paulk,1969,1972,207.26,99.32,1946-06-14
3,DBpedia_id_6744,Vaughn Duggins,1987-07-10,,86.0,2011-01-01,,player_stat_id_1724,Mark Hendrickson,1997,2000,210.31,99.77,1974-06-23
4,DBpedia_id_6744,Vaughn Duggins,1987-07-10,,86.0,2011-01-01,,player_stat_id_730,Bob Cluggish,1947,1947,185.93,106.58,1917-09-18


In [16]:
def sim(str1, str2):   
    # calculate the minimum edit distance
    len1 = len(str1) + 1
    len2 = len(str2) + 1
    d = np.ndarray(shape=(len1, len2), dtype=int, buffer=np.zeros((len1, len2)))
    for i in range(len1):
        d[i, 0] = i
    for i in range(len2):
        d[0, i] = i
    for i in range(1, len1):
        for j in range(1, len2):
            if str1[i - 1] == str2[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + 1)
    
    # similarity
    return 1 - d[len1 - 1][len2 - 1] / max(len1, len2)

In [17]:
%%time

df["sim"] = df.apply(lambda x: sim(x['name_x'].lower().replace(" (basketball)", ""), x['name_y'].lower()), axis=1)

CPU times: user 19min 32s, sys: 17.7 s, total: 19min 50s
Wall time: 19min 35s


In [18]:
df["sim_group"] = df["sim"].round(1)

In [19]:
df["sim_group"].value_counts()

0.2    2123150
0.3     885991
0.1     717546
0.4     235303
0.5      31011
0.6       5184
0.7       1367
0.8        249
1.0        122
0.0         59
0.9         18
Name: sim_group, dtype: int64

In [20]:
# matching record pairs and some corner case matches
df1 = df[df["sim_group"] == 1.0].sample(n=60, random_state=1)
df1["result"] = True
df1["is_corner_case"] = None

In [21]:
# non matching record pairs
df2 = df[df["sim_group"] <= 0.2].sample(n=100, random_state=1)
df2["result"] = False
df2["is_corner_case"] = False

In [28]:
# corner case matches and non matches (mostly non matches)
df3 = df[(df["sim_group"] >= 0.7) & (df["sim_group"] <= 0.8)].sample(n=22, random_state=1)
df3["result"] = None
df3["is_corner_case"] = True

In [31]:
# corner case matches and non matches (mostly matches)
df4 = df[df["sim_group"] == 0.9].copy()
df4["result"] = None
df4["is_corner_case"] = True

In [32]:
df_gold_standard = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [33]:
df_gold_standard.head()

Unnamed: 0,id_x,name_x,birthDate,height_x,weight_x,startYear,endYear,id_y,name_y,year_start,year_end,height_y,weight_y,birth_date,sim,sim_group,result,is_corner_case
0,DBpedia_id_6652,Cliff Levingston,1961-01-04,203.2,95.26,1982-01-01,1995-01-01,player_stat_id_2390,Cliff Levingston,1983,1995,207.26,95.24,1961-01-04,1.0,1.0,True,
1,DBpedia_id_8628,Kay Felder,1995-03-29,175.26,79.83,2016-01-01,,player_stat_id_1218,Kay Felder,2017,2018,179.83,79.82,1995-03-29,1.0,1.0,True,
2,DBpedia_id_7954,Scotty Hopson,1989-08-08,200.66,92.53,2011-01-01,,player_stat_id_1824,Scotty Hopson,2014,2014,204.22,92.52,1989-08-08,1.0,1.0,True,
3,DBpedia_id_5555,Kent Bazemore,1989-07-01,193.04,88.45,2012-01-01,,player_stat_id_251,Kent Bazemore,2013,2018,198.12,91.16,1989-07-01,1.0,1.0,True,
4,DBpedia_id_5039,Rakeem Christmas,1991-12-01,205.74,113.4,2015-01-01,,player_stat_id_704,Rakeem Christmas,2016,2017,210.31,113.38,1991-12-01,1.0,1.0,True,


In [34]:
# rename the column names
df_gold_standard.rename(columns={
    "id_x": "player_dbpedia_id", "name_x": "player_dbpedia_name", "birthDate": "player_dbpedia_birthdate",
    "height_x": "player_dbpedia_height", "weight_x": "player_dbpedia_weight", 
    "startYear": "player_dbpedia_start_year", "endYear": "player_dbpedia_end_year",
    "id_y": "player_stat_id", "name_y": "player_stat_name", "birth_date": "player_stat_birthdate",
    "height_y": "player_stat_height", "weight_y": "player_stat_weight", 
    "year_start": "player_stat_start_year", "year_end": "player_stat_end_year"}, inplace=True)

In [35]:
# reorder the columns
order = [
    "player_stat_id", "player_dbpedia_id", "player_stat_name", "player_dbpedia_name",
    "player_stat_birthdate", "player_dbpedia_birthdate", 
    "player_stat_height", "player_dbpedia_height", "player_stat_weight", "player_dbpedia_weight",
    "player_stat_start_year", "player_dbpedia_start_year", "player_stat_end_year", "player_dbpedia_end_year",
    "sim", "sim_group", "result", "is_corner_case"
]
    
df_gold_standard = df_gold_standard[order]

In [36]:
df_gold_standard.head()

Unnamed: 0,player_stat_id,player_dbpedia_id,player_stat_name,player_dbpedia_name,player_stat_birthdate,player_dbpedia_birthdate,player_stat_height,player_dbpedia_height,player_stat_weight,player_dbpedia_weight,player_stat_start_year,player_dbpedia_start_year,player_stat_end_year,player_dbpedia_end_year,sim,sim_group,result,is_corner_case
0,player_stat_id_2390,DBpedia_id_6652,Cliff Levingston,Cliff Levingston,1961-01-04,1961-01-04,207.26,203.2,95.24,95.26,1983,1982-01-01,1995,1995-01-01,1.0,1.0,True,
1,player_stat_id_1218,DBpedia_id_8628,Kay Felder,Kay Felder,1995-03-29,1995-03-29,179.83,175.26,79.82,79.83,2017,2016-01-01,2018,,1.0,1.0,True,
2,player_stat_id_1824,DBpedia_id_7954,Scotty Hopson,Scotty Hopson,1989-08-08,1989-08-08,204.22,200.66,92.52,92.53,2014,2011-01-01,2014,,1.0,1.0,True,
3,player_stat_id_251,DBpedia_id_5555,Kent Bazemore,Kent Bazemore,1989-07-01,1989-07-01,198.12,193.04,91.16,88.45,2013,2012-01-01,2018,,1.0,1.0,True,
4,player_stat_id_704,DBpedia_id_5039,Rakeem Christmas,Rakeem Christmas,1991-12-01,1991-12-01,210.31,205.74,113.38,113.4,2016,2015-01-01,2017,,1.0,1.0,True,


In [38]:
df_gold_standard.to_csv("gs_stat_2_dbpedia.csv", index=False)