In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unicodedata
import re

### Reading in the 100x100 Trail data

In [None]:
# Load the 100x100 Trail results export into a DataFrame.
TORX_100x100_df = pd.read_excel('Database Data/TORX_df_100x100_trail.xlsx')


In [None]:
# Preview the first rows of the freshly loaded 100x100 Trail frame.
TORX_100x100_df.head()

### DUV Data

In [None]:
# Load the DUV results export.
TORX_duv_df = pd.read_excel('Database Data/TORX_duv_df.xlsx')

# Parse the finishing time into a timedelta; unparseable values become NaT.
TORX_duv_df['Performance'] = pd.to_timedelta(TORX_duv_df['Performance'], errors='coerce')

# Parse the start date using the fixed export format.
TORX_duv_df['Start Date'] = pd.to_datetime(
    TORX_duv_df['Start Date'],
    format='%Y-%m-%d %H:%M:%S'
)

# Prefix the performance columns with their source, mirroring the ITRA frame.
# The previous mapping listed the key "Performance" twice, so Python kept only
# the last entry: the timedelta column ended up named "DUV_Performance_Seconds"
# and no "DUV_Performance" column was ever created.
TORX_duv_df = TORX_duv_df.rename(columns={"Performance": "DUV_Performance",
                                          "Performance_Seconds": "DUV_Performance_Seconds",
                                         })

# Later cells read "DUV_Performance_Seconds"; if the export carried no
# "Performance_Seconds" column, derive it from the timedelta so it always exists.
if "DUV_Performance_Seconds" not in TORX_duv_df.columns:
    TORX_duv_df["DUV_Performance_Seconds"] = TORX_duv_df["DUV_Performance"].dt.total_seconds()

TORX_duv_df.head()

In [None]:
# Column dtypes and non-null counts of the DUV frame.
TORX_duv_df.info()

### ITRA data

In [None]:
# Load the ITRA results export; 'Start Date' is read as text so that it can be
# parsed with an explicit format just below.
TORX_itra_no_DNF = pd.read_excel(
    'Database Data/TORX_itra_no_DNF.xlsx',
    dtype={'Start Date': 'string'},
)

# Parse the start date using the fixed export format.
TORX_itra_no_DNF['Start Date'] = pd.to_datetime(TORX_itra_no_DNF['Start Date'],
                                                format='%Y-%m-%d %H:%M:%S')

# Parse the finishing time into a timedelta; unparseable values become NaT.
TORX_itra_no_DNF['Performance'] = pd.to_timedelta(
    TORX_itra_no_DNF['Performance'],
    errors='coerce',
)

# Prefix the performance columns with their source.
itra_column_names = {
    "Performance": "ITRA_Performance",
    "Performance_Seconds": "ITRA_Performance_Seconds",
}
TORX_itra_no_DNF = TORX_itra_no_DNF.rename(columns=itra_column_names)

TORX_itra_no_DNF.head(50)

In [None]:
# Column dtypes and non-null counts of the ITRA frame.
TORX_itra_no_DNF.info()

In [None]:
# Distinct values of the ITRA 'Status' column (later cells keep only 'Finished').
TORX_itra_no_DNF['Status'].unique()

## Defining Variables

In [None]:
# The three result tables, always listed in this order:
# 100x100 Trail, DUV, ITRA.
datasets = [
    TORX_100x100_df,
    TORX_duv_df,
    TORX_itra_no_DNF,
]

# Races and editions compared across the sources.
# NOTE: a separate `year` list used to be defined here. It repeated '2023'
# instead of listing '2024' and was never read before the loops below rebind
# `year`, so `years` is now the single source of truth.
races = ['TOR450', 'TOR330']
years = ['2021', '2022', '2023', '2024']

### Cleaning Year and Name

In [None]:
for df in datasets:
    # Years are compared as strings everywhere below ('2021', '2022', ...).
    df['Year'] = df['Year'].astype('str')

    df['Name'] = (
        df['Name']
        # Hyphenated name parts become separate tokens (literal match, no regex).
        .str.replace('-', ' ', regex=False)
        # Collapse any run of whitespace into a single blank.
        .str.replace(r"\s+", " ", regex=True)
        # Drop a leading/trailing blank: later cells split names on ' ' and an
        # edge blank would yield an empty token and a wrong part count.
        .str.strip()
    )

### Examining which dataset does not have the same number of rows

In [None]:
# For every race/edition pair, count the rows each source holds and report
# whether ITRA agrees with DUV and with 100x100 Trail.
for race in races:
    for year in years:
        print(race, year, '\n')

        # ITRA: keep only the plain 'Finished' status, which leaves out runners
        # recorded as finishing at Rifugio Frassati or Bosses.
        itra_mask = (
            (TORX_itra_no_DNF['Race'] == race)
            & (TORX_itra_no_DNF['Year'] == year)
            & (TORX_itra_no_DNF['Status'] == 'Finished')
        )
        itra_df = TORX_itra_no_DNF[itra_mask]
        itra_num_rows = len(itra_df)
        print("ITRA rows:", itra_num_rows)

        # DUV: every row of this race and edition.
        duv_mask = (TORX_duv_df['Race'] == race) & (TORX_duv_df['Year'] == year)
        duv_df = TORX_duv_df[duv_mask]
        duv_num_rows = len(duv_df)
        print("DUV rows:", duv_num_rows)

        # 100x100 Trail: rows of this race and edition whose 'Status' is True.
        trail_mask = (
            (TORX_100x100_df['Race'] == race)
            & (TORX_100x100_df['Year'] == year)
            & (TORX_100x100_df['Status'] == True)
        )
        trail_df = TORX_100x100_df[trail_mask]
        trail_num_rows = len(trail_df)
        print("100x100 rows:", trail_num_rows)

        # Agreement of ITRA with each of the other two sources.
        print(itra_num_rows == duv_num_rows)
        print(itra_num_rows == trail_num_rows)
        print('*' * 20)

#### Finding similar names for all finishers in 2022

In [None]:
# Edition inspected in detail.
race = 'TOR330'
year = '2022'

# ITRA: plain 'Finished' rows only (leaves out runners recorded as finishing
# at Rifugio Frassati or Bosses).
itra_selected = (
    (TORX_itra_no_DNF['Race'] == race)
    & (TORX_itra_no_DNF['Year'] == year)
    & (TORX_itra_no_DNF['Status'] == 'Finished')
)
itra_df = TORX_itra_no_DNF[itra_selected]

# DUV: every row of this race and edition.
duv_selected = (TORX_duv_df['Race'] == race) & (TORX_duv_df['Year'] == year)
duv_df = TORX_duv_df[duv_selected]

# 100x100 Trail: rows of this race and edition whose 'Status' is True.
trail_selected = (
    (TORX_100x100_df['Race'] == race)
    & (TORX_100x100_df['Year'] == year)
    & (TORX_100x100_df['Status'] == True)
)
trail_df = TORX_100x100_df[trail_selected]

# Shapes of the three subsets, in ITRA / DUV / 100x100 order.
for subset in (itra_df, duv_df, trail_df):
    print(subset.shape)

In [None]:
# itra_df.to_csv('itra.csv', index =  False)
# duv_df.to_csv('duv.csv', index =  False)
# trail_df.to_csv('trail.csv', index =  False)

### Cleaning Names

In [None]:
def _name_variations(name):
    """Return the alternative spellings used to match `name` across datasets.

    The name is split on single blanks and handled by its number of parts:

    * 2 parts ('A B'): the name itself and the swapped order 'B A'.
    * 3 parts where the last part repeats the first ('A B A'): 'A B'.
    * 3 parts matching the single-letter pattern ('D Angelo Yuri'): the first
      two parts are glued together, giving 'DAngelo Yuri' and 'Yuri DAngelo'.
    * any other 3 parts ('A B C'): 'A B' and 'A C'.
    * anything else (1 part, 4 or more parts): the name unchanged.

    Parameters
    ----------
    name : str
        A runner name as stored in a dataset's 'Name' column.

    Returns
    -------
    list of str
        The spellings to compare against other datasets.
    """
    parts = name.split(' ')

    if len(parts) == 2:
        first, second = parts
        return [name, f"{second} {first}"]

    if len(parts) == 3:
        first, second, third = parts

        if third == first:
            return [f"{first} {second}"]

        # Same pattern as before, checked with re.search instead of building a
        # one-element Series for every name.
        if re.search(r"\b[A-Za-z]\s[A-Za-z]+\s[A-Za-z]+\b", name):
            # Glue only the first two parts. The earlier str.replace removed
            # every occurrence of "<first> " and could also swallow the blank
            # after a middle part ending in the same letter
            # ('d Fred Bob' -> 'dFredBob').
            glued = f"{first}{second}"
            return [f"{glued} {third}", f"{third} {glued}"]

        return [f"{first} {second}", f"{first} {third}"]

    # One part, or four and more: no alternative spelling is derived. Part
    # indices are only read inside the branches above, so a single-token name
    # no longer raises IndexError.
    return [name]


dataset_names = ['TORX_100x100', 'TORX_duv', 'TORX_itra_no_DNF']  # Corresponding names of the datasets

# name -> {'variations': [...], 'dataset': <dataset the name was last seen in>}
name_issues_with_dataset = {}

for dataset, dataset_name in zip(datasets, dataset_names):
    unique_name = dataset['Name'].unique()
    print(f"Processing {dataset_name}, Unique Names: {len(unique_name)}")

    for name in unique_name:
        # Missing names (NaN) cannot be split; report and skip them instead of
        # aborting the whole cell.
        if not isinstance(name, str):
            print(f"Issue: {name}\nError: name is not a string\n")
            continue

        name_issues_with_dataset[name] = {
            'variations': _name_variations(name),
            'dataset': dataset_name,
        }

# Convert name issues with dataset to DataFrame for easier tracking
name_issues_df = pd.DataFrame([{
    'Name': k,
    'Variations': v['variations'],
    'Dataset': v['dataset']
} for k, v in name_issues_with_dataset.items()])

print("Name Issues DataFrame with Dataset Info:")
print(name_issues_df)


In [None]:
# Put each name's variation list into alphabetical order, echoing every sorted
# list as it is written back.
for row_label in name_issues_df.index:
    ordered = sorted(name_issues_df.at[row_label, 'Variations'])
    print(ordered)

    name_issues_df.at[row_label, 'Variations'] = ordered

In [None]:
# Step 1: look up the variation list recorded for a name
def normalize_name(name,  name_issues_df ):
    """Return the variation list stored for `name` in `name_issues_df`.

    The first row whose 'Name' equals `name` supplies the result; when no row
    matches, a one-element list holding `name` itself is returned.
    """
    hits = name_issues_df.loc[name_issues_df['Name'] == name, 'Variations']
    if hits.empty:
        # Unknown name: it is its own only variation.
        return [name]
    return hits.iloc[0]

# Step 2: attach to every row of every dataset the variation list of its name.
for frame in datasets:
    frame['Name Variations'] = frame['Name'].map(
        lambda raw_name: normalize_name(raw_name, name_issues_df)
    )

TORX_100x100_df.head()



### Testing Testing Testing

### Sabrina Verjee

In [None]:
# Step 3: match ITRA and 100x100 rows for a single runner as a spot check
def testing_df(name):
    """Print the merged ITRA / 100x100 rows for runners whose name contains `name`.

    Rows of `TORX_itra_no_DNF` and `TORX_100x100_df` whose 'Name' contains
    `name` are paired. A pair is merged when the two rows share at least one
    entry in 'Name Variations' and have the same 'Year'; on shared column names
    the 100x100 value overrides the ITRA one. Duplicate
    ('Name', 'Year', 'Start Date') rows are dropped before printing.

    Parameters
    ----------
    name : str
        Text looked for in the 'Name' column. It is handed to `str.contains`
        unchanged, so it is interpreted as a regular expression.

    Returns
    -------
    None
        The merged frame is printed only.

    Notes
    -----
    A later cell of this notebook redefines `testing_df` with a different
    signature; this version is only valid until that cell runs.
    """
    # na=False: rows with a missing name are left out instead of producing a
    # NaN mask, which boolean indexing rejects.
    itra_name_df = TORX_itra_no_DNF[TORX_itra_no_DNF['Name'].str.contains(name, na=False)]
    trail_name_df = TORX_100x100_df[TORX_100x100_df['Name'].str.contains(name, na=False)]

    # A fresh list per call. The previous module-level list kept growing, so a
    # second call also printed the rows matched by the first one.
    merged_rows = []

    for _, row_itra in itra_name_df.iterrows():
        for _, row_trail in trail_name_df.iterrows():
            shares_variation = any(
                variation in row_trail['Name Variations']
                for variation in row_itra['Name Variations']
            )
            if shares_variation and row_trail['Year'] == row_itra['Year']:
                merged_rows.append({**row_itra.to_dict(), **row_trail.to_dict()})

    # Step 4: Create a merged DataFrame
    merged_df = pd.DataFrame(merged_rows)

    # With no match the frame has no columns and drop_duplicates(subset=...)
    # would raise KeyError, so de-duplicate only when rows exist.
    if merged_rows:
        merged_df = merged_df.drop_duplicates(subset=['Name', 'Year', 'Start Date'])

    # Display the merged DataFrame
    print(merged_df)


testing_df('Verjee')


In [None]:
# Run the same name-based spot check for runners whose name contains 'Tierney'.
testing_df('Tierney')

### Merging 

In [None]:
# Step 3: Iterate through the names in Dataset1 and Dataset2 to find matches
merged_rows = []
subname_list= []

def testing_df(itra_name_df,trail_name_df):
    i=0
    for idx1, row_itra in itra_name_df.iterrows():
        i = i + 1
        print(i)

        for idx2, row_trail in trail_name_df .iterrows():
            # Check if the name from itra_verjee_df is in the variations of trail_verjee_df and vice versa
            if any(name in row_trail['Name Variations'] for name in row_itra['Name Variations']):

                if  row_trail['Year'] ==  row_itra['Year']  :
#                     print('Same year:', row_trail['Year'], row_itra['Year'], '\n', '*'*20)
    #                 print('row_itra',row_itra)
    #                 print('row_trail',row_trail)

                    # Merge the row if a match is found
                    merged_row = {**row_itra.to_dict(), **row_trail.to_dict()}
                    merged_rows.append(merged_row)
    
                else:
                    pass
            else:
                print(row_trail['Name'])
                subname_list.append(row_trail['Name'])

#     print(merged_rows)
    # Step 4: Create a merged DataFrame
    merged_df = pd.DataFrame(merged_rows)

    merged_df = merged_df.drop_duplicates(subset=['Name','Year','Start Date'])


    # Display the merged DataFrame
    print(merged_df)
    return merged_df, subname_list


### Testing all editions in `years` (2021–2024)

In [None]:
print(years)

# ITRA rows of the editions in `years` that carry the plain 'Finished' status.
itra_keep = TORX_itra_no_DNF['Year'].isin(years) & (TORX_itra_no_DNF['Status'] == 'Finished')
sub_TORX_itra_no_DNF = TORX_itra_no_DNF[itra_keep]

# 100x100 rows of the same editions, without TOR130, whose 'Status' is True.
trail_keep = (
    TORX_100x100_df['Year'].isin(years)
    & (TORX_100x100_df['Race'] != 'TOR130')
    & (TORX_100x100_df['Status'] == True)
)
sub_TORX_100x100_df = TORX_100x100_df[trail_keep]

# Shapes of both subsets, ITRA first.
for subset in (sub_TORX_itra_no_DNF, sub_TORX_100x100_df):
    print(subset.shape)

In [None]:
# Match the ITRA subset against the 100x100 subset; keeps the merged frame and
# the name list returned by testing_df.
merged_df, subname_list = testing_df(sub_TORX_itra_no_DNF, sub_TORX_100x100_df)

In [None]:
# Number of entries in the name list returned by testing_df.
len(subname_list)

In [None]:
# ### who is missing?

# merged_df_names = merged_df['Name'].unique() 
# sub_names =sub_TORX_100x100_df['Name'].unique()
# print(len(merged_df_names))
# print(len(sub_names))

# subname_list = []
# for sub_name in sub_names:
#     if sub_name in  merged_df_names:
#         pass
#     else:
#         subname_list.append(sub_name)
        
# print(len(subname_list))
        
        
# sub_TORX_100x100_df[sub_TORX_100x100_df['Name'].isin(subname_list)]

###  Datasets from 2021 onwards (post-2020)

In [None]:
# Restrict each source to the editions listed in `years`.
post_2020_TORX_100x100_df, post_2020_TORX_duv_df, post_2020_TORX_itra_no_DNF = (
    frame[frame['Year'].isin(years)]
    for frame in (TORX_100x100_df, TORX_duv_df, TORX_itra_no_DNF)
)

# Preview each restricted frame, in 100x100 / DUV / ITRA order.
for recent in (post_2020_TORX_100x100_df, post_2020_TORX_duv_df, post_2020_TORX_itra_no_DNF):
    print(recent.head())

In [None]:
# Function to clean names with three parts
def cleaning_name_with_3(name):
    """Collapse a three-part name to two parts; return any other value unchanged.

    The name is split on single blanks. Only names with exactly three parts are
    rewritten:

    * last part equal to the first ('A B A'): 'A B';
    * a single ASCII letter followed by two alphabetic parts
      ('D Angelo Yuri'): the letter is glued to the second part, 'DAngelo Yuri';
    * any other three parts ('A B C'): the last part is dropped, 'A B'.

    Parameters
    ----------
    name : str
        Runner name. Non-string values (e.g. NaN for a missing name) are
        returned as they are.

    Returns
    -------
    str
        The cleaned name, or the input itself when no rule applies.
    """
    # Missing values and other non-strings cannot be split; pass them through.
    if not isinstance(name, str):
        return name

    name_split = name.split(' ')

    # Check the length before touching any part: indexing the second part first
    # raised IndexError for single-token names.
    if len(name_split) != 3:
        return name

    name1, name2, name3 = name_split

    if name3 == name1:
        return f"{name1} {name2}"

    # Regex pattern to identify names with an initial
    if re.match(r"^[A-Za-z]\s[A-Za-z]+\s[A-Za-z]+$", name):
        # Glue only the leading initial. str.replace removed every "<initial> "
        # occurrence and could also swallow the blank after a middle part
        # ending in the same letter ('d Fred Bob' -> 'dFredBob').
        return f"{name1}{name2} {name3}"

    # Default: keep the first two parts.
    return f"{name1} {name2}"

# Overwrite every dataset's 'Name' column with its cleaned form.
for frame in datasets:
    frame['Name'] = frame['Name'].map(cleaning_name_with_3)


In [None]:
# Display the ITRA frame after the three-part-name cleanup.
TORX_itra_no_DNF

In [None]:
# Outer-join the DUV and ITRA results on three columns ('Name', 'Race' and
# 'Year') so that rows present in only one source are kept.
merged_df = TORX_duv_df.merge(
    TORX_itra_no_DNF,
    how='outer',
    on=['Name', 'Race', 'Year'],
)

# Shapes of both inputs and of the merged result.
for frame in (TORX_duv_df, TORX_itra_no_DNF, merged_df):
    print(frame.shape)

In [None]:
# Display the outer-merged DUV/ITRA frame.
merged_df

In [None]:
# Names whose merged row lacks the performance value from ITRA or from DUV,
# i.e. runners found in only one of the two sources.
missing_one_side = (
    merged_df['ITRA_Performance_Seconds'].isna()
    | merged_df['DUV_Performance_Seconds'].isna()
)
sorted(merged_df.loc[missing_one_side, 'Name'].unique())

In [None]:
# Look up one runner's ITRA rows by exact name.
TORX_itra_no_DNF[TORX_itra_no_DNF['Name'] == 'Corsini Simone']