In [27]:
import pandas as pd
import csv
from datetime import datetime, timedelta


# Open the CSV file for reading and a new CSV file for writing
with open('results.csv', 'r', newline='', encoding='utf-8') as infile, open('processed_results.csv', 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Read and process each row
    for row in reader:
        # Check if the row has extra fields
        if len(row) > 8:
            # Truncate the row to expected number of fields
            row = row[:8]
        
        # Write the processed row to the new CSV file
        writer.writerow(row)

# Read the processed CSV file into a DataFrame
df = pd.read_csv('processed_results.csv')

# cleaning
df = df[df.isnull().sum(axis=1) < 1]

olympics_schedule = {
    'Rio': {'2016': {'start': '2016-08-05', 'end': '2016-08-21'}},
    'Beijing': {'2008': {'start': '2008-08-08', 'end': '2008-08-24'}},
    'Athens': {'2004': {'start': '2004-08-13', 'end': '2004-08-29'}},
    'Sydney': {'2000': {'start': '2000-09-15', 'end': '2000-09-24'}},
    'Atlanta': {'1996': {'start': '1996-07-19', 'end': '1996-08-04'}},
    'Barcelona': {'1992': {'start': '1992-07-25', 'end': '1992-08-09'}},
    'Seoul': {'1988': {'start': '1988-09-17', 'end': '1988-09-26'}},
    'Los Angeles': {'1984': {'start': '1984-07-28', 'end': '1984-08-12'}},
    'Moscow': {'1980': {'start': '1980-07-19', 'end': '1980-08-03'}},
    'Montreal': {'1976': {'start': '1976-07-17', 'end': '1976-08-01'}},
    'Munich': {'1972': {'start': '1972-08-26', 'end': '1972-09-10'}},
    'Mexico City': {'1968': {'start': '1968-10-12', 'end': '1968-10-27'}},
    'Tokyo': {'1964': {'start': '1964-10-10', 'end': '1964-10-24'}},
    'Rome': {'1960': {'start': '1960-08-25', 'end': '1960-09-11'}},
    'Melbourne': {'1956': {'start': '1956-11-22', 'end': '1956-12-01'}},
    'Helsinki': {'1952': {'start': '1952-07-19', 'end': '1952-08-03'}},
    'London': {'1948': {'start': '1948-07-29', 'end': '1948-08-07'}},
    'Berlin': {'1936': {'start': '1936-08-01', 'end': '1936-08-16'}},
    'Los Angeles': {'1932': {'start': '1932-07-30', 'end': '1932-08-14'}},
    'Amsterdam': {'1928': {'start': '1928-07-28', 'end': '1928-08-12'}},
    'Paris': {'1924': {'start': '1924-07-05', 'end': '1924-07-14'}},
    'Antwerp': {'1920': {'start': '1920-08-14', 'end': '1920-08-29'}},
    'Stockholm': {'1912': {'start': '1912-07-06', 'end': '1912-07-15'}},
    'London': {'1908': {'start': '1908-04-27', 'end': '1908-05-06'}},
    'St. Louis': {'1904': {'start': '1904-07-01', 'end': '1904-07-10'}},
    'Paris': {'1900': {'start': '1900-05-14', 'end': '1900-05-28'}},
    'Athens': {'1896': {'start': '1896-04-06', 'end': '1896-04-15'}}
}

track_field_schedule = {
    'Day 1': {'events': ['Marathon (men)', '100m (men)', 'Long Jump (men)']},
    'Day 2': {'events': ['20km Race Walk (men)', 'Shot Put (men)', 'Decathlon (men)']},
    'Day 3': {'events': ['400m Hurdles (men)', 'Triple Jump (men)', 'High Jump (women)']},
    'Day 4': {'events': ['3000m Steeplechase (men)', 'Pole Vault (men)', 'Discus Throw (women)']},
    'Day 5': {'events': ['800m (men)', '110m Hurdles (men)', 'Hammer Throw (women)']},
    'Day 6': {'events': ['200m (men)', '400m (women)', 'Javelin Throw (men)']},
    'Day 7': {'events': ['10,000m (men)', '400m Hurdles (women)', 'Long Jump (women)']},
    'Day 8': {'events': ['20km Race Walk (women)', '100m Hurdles (women)', 'Shot Put (women)']},
    'Day 9': {'events': ['1500m (men)', '5000m (women)', 'Triple Jump (women)']},
    'Day 10': {'events': ['4x100m Relay (men)', '4x400m Relay (women)', 'Marathon (women)']},
    'Day 11': {'events': ['4x400m Relay (men)', 'High Jump (men)', 'Discus Throw (men)']},
    'Day 12': {'events': ['800m (women)', '200m (women)', 'Pole Vault (women)']},
    'Day 13': {'events': ['4x100m Relay (women)', '5000m (men)', 'Javelin Throw (women)']},
    'Day 14': {'events': ['1500m (women)', '4x100m Relay (mixed)', 'Hammer Throw (men)']},
    'Day 15': {'events': ['4x400m Relay (mixed)', 'Decathlon (men)', '4x100m Medley Relay (mixed)']}
}

def estimate_event_dates(olympics_dates, track_field_schedule):
    event_dates = {}
    for olympics, years in olympics_dates.items():
        for year, dates in years.items():
            start_date = datetime.strptime(dates['start'], '%Y-%m-%d')
            end_date = datetime.strptime(dates['end'], '%Y-%m-%d')
            total_days = (end_date - start_date).days + 1
            events_per_day = len(track_field_schedule) // total_days
            remaining_events = len(track_field_schedule) % total_days
            day_counter = 1
            event_counter = 0
            current_date = start_date
            while current_date <= end_date:
                events_on_day = events_per_day
                if remaining_events > 0:
                    events_on_day += 1
                    remaining_events -= 1
                event_dates.setdefault(olympics, {}).setdefault(year, {})[f'Day {day_counter}'] = []
                for _ in range(events_on_day):
                    event_dates[olympics][year][f'Day {day_counter}'].append(track_field_schedule[event_counter])
                    event_counter += 1
                current_date += timedelta(days=1)
                day_counter += 1
                
    return event_dates

# Example usage:
estimated_dates = estimate_event_dates(track_field_schedule)
print(estimated_dates)


# Function to get the date of an event based on its location, year, and day
def get_event_date(location, year, day):
    start_date = datetime.strptime(olympics_dates[location][year]['start'], '%Y-%m-%d')
    return (start_date + timedelta(days=int(day.split()[1]) - 1)).strftime('%Y-%m-%d')

# Add 'Date' column to the DataFrame
df['Date'] = df.apply(lambda row: get_event_date(row['Location'], str(row['Year']), row['Day']), axis=1)

# Save the updated DataFrame to CSV
df.to_csv('processed_results_with_date.csv', index=False)

# Print the DataFrame
df

NameError: name 'olympics' is not defined

In [28]:
with open('results.csv', 'r', newline='', encoding='utf-8') as infile, open('processed_results.csv', 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    for row in reader:
        if len(row) > 8:
            row = row[:8]
        
        writer.writerow(row)

df = pd.read_csv('processed_results.csv')
df = df.dropna()


In [29]:
df

Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result
0,M,10000M Men,Rio,2016,G,Mohamed FARAH,USA,25:05.17
1,M,10000M Men,Rio,2016,S,Paul Kipngetich TANUI,KEN,27:05.64
2,M,10000M Men,Rio,2016,B,Tamirat TOLA,ETH,27:06.26
3,M,10000M Men,Beijing,2008,G,Kenenisa BEKELE,ETH,27:01.17
4,M,10000M Men,Beijing,2008,S,Sileshi SIHINE,ETH,27:02.77
...,...,...,...,...,...,...,...,...
2389,W,Triple Jump Women,Athens,2004,S,Hrysopiyi DEVETZI,GRE,15.25
2390,W,Triple Jump Women,Athens,2004,B,Tatyana LEBEDEVA,RUS,15.14
2391,W,Triple Jump Women,Atlanta,1996,G,Inessa KRAVETS,UKR,15.33
2392,W,Triple Jump Women,Atlanta,1996,S,Inna LASOVSKAYA,RUS,14.98


In [None]:
import pandas as pd
import csv
from datetime import datetime, timedelta

# open the csv file for reading and a new csv file for writing
with open('results.csv', 'r', newline='', encoding='utf-8') as infile, open('processed_results.csv', 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # read and process each row
    for row in reader:
        # check if the row has extra fields
        if len(row) > 8:
            # truncate the row to expected number of fields
            row = row[:8]
        
        # write the processed row to the new csv file
        writer.writerow(row)

# read the processed csv file into a dataframe
df = pd.read_csv('processed_results.csv')

# cleaning
df = df[df.isnull().sum(axis=1) < 1]

track_field_schedule = {
    # day and event schedule goes here
}

def estimate_event_dates(olympics_dates):
    event_dates = {}
    for location, years in olympics_dates.items():
        for year, dates in years.items():
            start_date = datetime.strptime(dates['start'], '%Y-%m-%d')
            end_date = datetime.strptime(dates['end'], '%Y-%m-%d')
            total_days = (end_date - start_date).days + 1
            events_per_day = len(track_field_schedule) // total_days
            remaining_events = len(track_field_schedule) % total_days
            
            day_counter = 1
            event_counter = 0
            current_date = start_date
            while current_date <= end_date:
                events_on_day = events_per_day
                if remaining_events > 0:
                    events_on_day += 1
                    remaining_events -= 1
                event_dates.setdefault(location, {}).setdefault(year, {})[f'Day {day_counter}'] = []
                for _ in range(events_on_day):
                    if event_counter < len(track_field_schedule):
                        day_key = f'Day {event_counter + 1}'
                        event_dates[location][year][f'Day {day_counter}'].extend(track_field_schedule[day_key]['events'])
                        event_counter += 1
                current_date += timedelta(days=1)
                day_counter += 1
    return event_dates

# estimate the event dates
estimated_dates = estimate_event_dates(track_field_schedule)
print(estimated_dates)

def get_event_date(location, year, day):
    start_date = datetime.strptime(track_field_schedule[location][year]['start'], '%Y-%m-%d')
    return (start_date + timedelta(days=int(day.split()[1]) - 1)).strftime('%Y-%m-%d')

# add 'date' column to the dataframe
df['Date'] = df.apply(lambda row: get_event_date(row['Location'], str(row['Year']), row['Day']), axis=1)

# save the updated dataframe to csv
df.to_csv('processed_results_with_date.csv', index=False)

# print the dataframe
print(df)


In [37]:
import pandas as pd
import csv
from datetime import datetime, timedelta

# open the csv file for reading and a new csv file for writing
with open('results.csv', 'r', newline='', encoding='utf-8') as infile, open('processed_results.csv', 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # read and process each row
    for row in reader:
        # check if the row has extra fields
        if len(row) > 8:
            # truncate the row to expected number of fields
            row = row[:8]
        
        # write the processed row to the new csv file
        writer.writerow(row)

# read the processed csv file into a dataframe
df = pd.read_csv('processed_results.csv')

# cleaning
df = df[df.isnull().sum(axis=1) < 1]

olympics_schedule = {
    'Rio': {'2016': {'start': '2016-08-05', 'end': '2016-08-21'}},
    'Beijing': {'2008': {'start': '2008-08-08', 'end': '2008-08-24'}},
    'Athens': {'2004': {'start': '2004-08-13', 'end': '2004-08-29'}},
    'Sydney': {'2000': {'start': '2000-09-15', 'end': '2000-09-24'}},
    'Atlanta': {'1996': {'start': '1996-07-19', 'end': '1996-08-04'}},
    'Barcelona': {'1992': {'start': '1992-07-25', 'end': '1992-08-09'}},
    'Seoul': {'1988': {'start': '1988-09-17', 'end': '1988-09-26'}},
    'Los Angeles': {'1984': {'start': '1984-07-28', 'end': '1984-08-12'}},
    'Moscow': {'1980': {'start': '1980-07-19', 'end': '1980-08-03'}},
    'Montreal': {'1976': {'start': '1976-07-17', 'end': '1976-08-01'}},
    'Munich': {'1972': {'start': '1972-08-26', 'end': '1972-09-10'}},
    'Mexico City': {'1968': {'start': '1968-10-12', 'end': '1968-10-27'}},
    'Tokyo': {'1964': {'start': '1964-10-10', 'end': '1964-10-24'}},
    'Rome': {'1960': {'start': '1960-08-25', 'end': '1960-09-11'}},
    'Melbourne': {'1956': {'start': '1956-11-22', 'end': '1956-12-01'}},
    'Helsinki': {'1952': {'start': '1952-07-19', 'end': '1952-08-03'}},
    'London': {'1948': {'start': '1948-07-29', 'end': '1948-08-07'}},
    'Berlin': {'1936': {'start': '1936-08-01', 'end': '1936-08-16'}},
    'Los Angeles': {'1932': {'start': '1932-07-30', 'end': '1932-08-14'}},
    'Amsterdam': {'1928': {'start': '1928-07-28', 'end': '1928-08-12'}},
    'Paris': {'1924': {'start': '1924-07-05', 'end': '1924-07-14'}},
    'Antwerp': {'1920': {'start': '1920-08-14', 'end': '1920-08-29'}},
    'Stockholm': {'1912': {'start': '1912-07-06', 'end': '1912-07-15'}},
    'London': {'1908': {'start': '1908-04-27', 'end': '1908-05-06'}},
    'St. Louis': {'1904': {'start': '1904-07-01', 'end': '1904-07-10'}},
    'Paris': {'1900': {'start': '1900-05-14', 'end': '1900-05-28'}},
    'Athens': {'1896': {'start': '1896-04-06', 'end': '1896-04-15'}}
}

track_field_schedule = {
    'Day 1': {'events': ['Marathon (men)', '100m (men)', 'Long Jump (men)']},
    'Day 2': {'events': ['20km Race Walk (men)', 'Shot Put (men)', 'Decathlon (men)']},
    'Day 3': {'events': ['400m Hurdles (men)', 'Triple Jump (men)', 'High Jump (women)']},
    'Day 4': {'events': ['3000m Steeplechase (men)', 'Pole Vault (men)', 'Discus Throw (women)']},
    'Day 5': {'events': ['800m (men)', '110m Hurdles (men)', 'Hammer Throw (women)']},
    'Day 6': {'events': ['200m (men)', '400m (women)', 'Javelin Throw (men)']},
    'Day 7': {'events': ['10,000m (men)', '400m Hurdles (women)', 'Long Jump (women)']},
    'Day 8': {'events': ['20km Race Walk (women)', '100m Hurdles (women)', 'Shot Put (women)']},
    'Day 9': {'events': ['1500m (men)', '5000m (women)', 'Triple Jump (women)']},
    'Day 10': {'events': ['4x100m Relay (men)', '4x400m Relay (women)', 'Marathon (women)']},
    'Day 11': {'events': ['4x400m Relay (men)', 'High Jump (men)', 'Discus Throw (men)']},
    'Day 12': {'events': ['800m (women)', '200m (women)', 'Pole Vault (women)']},
    'Day 13': {'events': ['4x100m Relay (women)', '5000m (men)', 'Javelin Throw (women)']},
    'Day 14': {'events': ['1500m (women)', '4x100m Relay (mixed)', 'Hammer Throw (men)']},
    'Day 15': {'events': ['4x400m Relay (mixed)', 'Decathlon (men)', '4x100m Medley Relay (mixed)']}
}

def estimate_event_dates(olympics_dates, track_field_schedule):
    event_dates = {}
    for olympics, years in olympics_dates.items():
        for year, dates in years.items():
            start_date = datetime.strptime(dates['start'], '%Y-%m-%d')
            end_date = datetime.strptime(dates['end'], '%Y-%m-%d')
            total_days = (end_date - start_date).days + 1
            day_events = list(track_field_schedule.items())  # get items as list of tuples
            current_date = start_date
            day_counter = 0  # to iterate over the events
            while current_date <= end_date and day_counter < len(day_events):
                event_day, events = day_events[day_counter]
                event_dates.setdefault(olympics, {}).setdefault(year, {})[event_day] = events['events']
                day_counter += 1
                current_date += timedelta(days=1)
    return event_dates

estimated_dates = estimate_event_dates(olympics_schedule, track_field_schedule)
# function to get the date of an event based on its location, year, and day
def get_event_date(location, year, day):
    try:
        start_date = datetime.strptime(olympics_dates[location][year]['start'], '%Y-%m-%d')
        return (start_date + timedelta(days=int(day.split()[1]) - 1)).strftime('%Y-%m-%d')
    except KeyError:
        # Return a message or handle the error as you see fit
        return f'Date not found for {location} {year}'

df['Day'] = 'Day 1'

# add 'Date' column to the dataframe
df['Date'] = df.apply(lambda row: get_event_date(row['Location'], str(row['Year']), row['Day']), axis=1)

# save the updated dataframe to csv
df.to_csv('processed_results_with_date.csv', index=False)

# print the dataframe
print(df)


     Gender              Event Location  Year Medal                   Name  \
0         M         10000M Men      Rio  2016     G          Mohamed FARAH   
1         M         10000M Men      Rio  2016     S  Paul Kipngetich TANUI   
2         M         10000M Men      Rio  2016     B           Tamirat TOLA   
3         M         10000M Men  Beijing  2008     G        Kenenisa BEKELE   
4         M         10000M Men  Beijing  2008     S         Sileshi SIHINE   
...     ...                ...      ...   ...   ...                    ...   
2389      W  Triple Jump Women   Athens  2004     S      Hrysopiyi DEVETZI   
2390      W  Triple Jump Women   Athens  2004     B       Tatyana LEBEDEVA   
2391      W  Triple Jump Women  Atlanta  1996     G         Inessa KRAVETS   
2392      W  Triple Jump Women  Atlanta  1996     S        Inna LASOVSKAYA   
2393      W  Triple Jump Women  Atlanta  1996     B       Sarka KASPARKOVA   

     Nationality    Result    Day                            Da

In [46]:
olympics_schedule = {
    'Rio': {'2016': {'start': '2016-08-05', 'end': '2016-08-21'}},
    'Beijing': {'2008': {'start': '2008-08-08', 'end': '2008-08-24'}},
    'Athens': {'2004': {'start': '2004-08-13', 'end': '2004-08-29'}},
    'Sydney': {'2000': {'start': '2000-09-15', 'end': '2000-09-24'}},
    'Atlanta': {'1996': {'start': '1996-07-19', 'end': '1996-08-04'}},
    'Barcelona': {'1992': {'start': '1992-07-25', 'end': '1992-08-09'}},
    'Seoul': {'1988': {'start': '1988-09-17', 'end': '1988-09-26'}},
    'Los Angeles': {'1984': {'start': '1984-07-28', 'end': '1984-08-12'}},
    'Moscow': {'1980': {'start': '1980-07-19', 'end': '1980-08-03'}},
    'Montreal': {'1976': {'start': '1976-07-17', 'end': '1976-08-01'}},
    'Munich': {'1972': {'start': '1972-08-26', 'end': '1972-09-10'}},
    'Mexico City': {'1968': {'start': '1968-10-12', 'end': '1968-10-27'}},
    'Tokyo': {'1964': {'start': '1964-10-10', 'end': '1964-10-24'}},
    'Rome': {'1960': {'start': '1960-08-25', 'end': '1960-09-11'}},
    'Melbourne': {'1956': {'start': '1956-11-22', 'end': '1956-12-01'}},
    'Helsinki': {'1952': {'start': '1952-07-19', 'end': '1952-08-03'}},
    'London': {'1948': {'start': '1948-07-29', 'end': '1948-08-07'}},
    'Berlin': {'1936': {'start': '1936-08-01', 'end': '1936-08-16'}},
    'Los Angeles': {'1932': {'start': '1932-07-30', 'end': '1932-08-14'}},
    'Amsterdam': {'1928': {'start': '1928-07-28', 'end': '1928-08-12'}},
    'Paris': {'1924': {'start': '1924-07-05', 'end': '1924-07-14'}},
    'Antwerp': {'1920': {'start': '1920-08-14', 'end': '1920-08-29'}},
    'Stockholm': {'1912': {'start': '1912-07-06', 'end': '1912-07-15'}},
    'London': {'1908': {'start': '1908-04-27', 'end': '1908-05-06'}},
    'St. Louis': {'1904': {'start': '1904-07-01', 'end': '1904-07-10'}},
    'Paris': {'1900': {'start': '1900-05-14', 'end': '1900-05-28'}},
    'Athens': {'1896': {'start': '1896-04-06', 'end': '1896-04-15'}}
}

track_field_schedule = [
    'Marathon (men)', '100m (men)', 'Long Jump (men)',
    '20km Race Walk (men)', 'Shot Put (men)', 'Decathlon (men)',
    '400m Hurdles (men)', 'Triple Jump (men)', 'High Jump (women)',
    '3000m Steeplechase (men)', 'Pole Vault (men)', 'Discus Throw (women)',
    '800m (men)', '110m Hurdles (men)', 'Hammer Throw (women)',
    '200m (men)', '400m (women)', 'Javelin Throw (men)',
    '10,000m (men)', '400m Hurdles (women)', 'Long Jump (women)',
    '20km Race Walk (women)', '100m Hurdles (women)', 'Shot Put (women)',
    '1500m (men)', '5000m (women)', 'Triple Jump (women)',
    '4x100m Relay (men)', '4x400m Relay (women)', 'Marathon (women)',
    '4x400m Relay (men)', 'High Jump (men)', 'Discus Throw (men)',
    '800m (women)', '200m (women)', 'Pole Vault (women)',
    '4x100m Relay (women)', '5000m (men)', 'Javelin Throw (women)',
    '1500m (women)', '4x100m Relay (mixed)', 'Hammer Throw (men)',
    '4x400m Relay (mixed)', 'Decathlon (men)', '4x100m Medley Relay (mixed)'
]

Day
Day not found    2135
Name: count, dtype: int64

In [48]:
df['Event'].value_counts()

Event
Marathon Men                84
100M Men                    80
110M Hurdles Men            79
400M Men                    79
1500M Men                   79
800M Men                    79
Decathlon Men               76
200M Men                    75
400M Hurdles Men            71
5000M Men                   69
10000M Men                  69
3000M Steeplechase Men      68
Long Jump Men               63
High Jump Men               62
Discus Throw Men            60
Shot Put Men                60
Pole Vault Men              59
100M Women                  58
50Km Race Walk Men          57
Triple Jump Men             54
Hammer Throw Men            54
Discus Throw Women          53
200M Women                  51
Javelin Throw Women         50
20Km Race Walk Men          48
Javelin Throw Men           47
800M Women                  43
High Jump Women             42
Shot Put Women              41
400M Women                  39
Long Jump Women             35
100M Hurdles Women          34
15

In [58]:
olympics_schedule = {
    'Rio': {'2016': {'start': '2016-08-05', 'end': '2016-08-21'}},
    'Beijing': {'2008': {'start': '2008-08-08', 'end': '2008-08-24'}},
    'Athens': {'2004': {'start': '2004-08-13', 'end': '2004-08-29'}},
    'Sydney': {'2000': {'start': '2000-09-15', 'end': '2000-09-24'}},
    'Atlanta': {'1996': {'start': '1996-07-19', 'end': '1996-08-04'}},
    'Barcelona': {'1992': {'start': '1992-07-25', 'end': '1992-08-09'}},
    'Seoul': {'1988': {'start': '1988-09-17', 'end': '1988-09-26'}},
    'Los Angeles': {'1984': {'start': '1984-07-28', 'end': '1984-08-12'}},
    'Moscow': {'1980': {'start': '1980-07-19', 'end': '1980-08-03'}},
    'Montreal': {'1976': {'start': '1976-07-17', 'end': '1976-08-01'}},
    'Munich': {'1972': {'start': '1972-08-26', 'end': '1972-09-10'}},
    'Mexico City': {'1968': {'start': '1968-10-12', 'end': '1968-10-27'}},
    'Tokyo': {'1964': {'start': '1964-10-10', 'end': '1964-10-24'}},
    'Rome': {'1960': {'start': '1960-08-25', 'end': '1960-09-11'}},
    'Melbourne': {'1956': {'start': '1956-11-22', 'end': '1956-12-01'}},
    'Helsinki': {'1952': {'start': '1952-07-19', 'end': '1952-08-03'}},
    'London': {'1948': {'start': '1948-07-29', 'end': '1948-08-07'}},
    'Berlin': {'1936': {'start': '1936-08-01', 'end': '1936-08-16'}},
    'Los Angeles': {'1932': {'start': '1932-07-30', 'end': '1932-08-14'}},
    'Amsterdam': {'1928': {'start': '1928-07-28', 'end': '1928-08-12'}},
    'Paris': {'1924': {'start': '1924-07-05', 'end': '1924-07-14'}},
    'Antwerp': {'1920': {'start': '1920-08-14', 'end': '1920-08-29'}},
    'Stockholm': {'1912': {'start': '1912-07-06', 'end': '1912-07-15'}},
    'London': {'1908': {'start': '1908-04-27', 'end': '1908-05-06'}},
    'St. Louis': {'1904': {'start': '1904-07-01', 'end': '1904-07-10'}},
    'Paris': {'1900': {'start': '1900-05-14', 'end': '1900-05-28'}},
    'Athens': {'1896': {'start': '1896-04-06', 'end': '1896-04-15'}}
}

track_field_schedule = [
    'Marathon Men', '100M Men', 'Long Jump Men', '20Km Race Walk Men', 'Shot Put Men',
    'Decathlon Men', '400M Hurdles Men', 'Triple Jump Men', 'High Jump Women', '3000M Steeplechase Men',
    'Pole Vault Men', 'Discus Throw Women', '800M Men', '110M Hurdles Men', 'Hammer Throw Women',
    '200M Men', '400M Women', 'Javelin Throw Men', '10000M Men', '400M Hurdles Women',
    'Long Jump Women', '20Km Race Walk Women', '100M Hurdles Women', 'Shot Put Women', '1500M Men',
    '5000M Women', 'Triple Jump Women', 'Marathon Women', 'High Jump Men', 'Discus Throw Men',
    '800M Women', '200M Women', 'Pole Vault Women', '5000M Men', 'Javelin Throw Women',
    '1500M Women', 'Hammer Throw Men', 'Decathlon Men'
]

from datetime import datetime, timedelta
import pandas as pd
import requests

# modify the given function to calculate event dates and return day number only
def estimate_event_day_number(olympics_dates, track_field_schedule):
    event_day_numbers = {}
    for location, years in olympics_dates.items():
        for year, dates in years.items():
            start_date = datetime.strptime(dates['start'], '%Y-%m-%d')
            end_date = datetime.strptime(dates['end'], '%Y-%m-%d')
            total_days = (end_date - start_date).days + 1
            events_per_day = len(track_field_schedule) // total_days
            remaining_events = len(track_field_schedule) % total_days
            
            event_counter = 0
            for day in range(total_days):
                events_on_day = events_per_day + (1 if remaining_events > 0 else 0)
                remaining_events -= 1 if remaining_events > 0 else 0
                for _ in range(events_on_day):
                    if event_counter < len(track_field_schedule):
                        event_day_numbers[track_field_schedule[event_counter]] = day + 1
                        event_counter += 1
    return event_day_numbers

# generate a dictionary for event lookup
olympics_event_lookup = estimate_event_day_number(olympics_schedule, track_field_schedule)

# function to add 'Day' and 'Date' columns and update temperature
def update_dataframe_with_event_date_and_temp(df, olympics_schedule, olympics_event_lookup):
    weather_cache = {}  # to minimize API calls
    
    def get_temperature(location, date):
        if (location, date) in weather_cache:
            return weather_cache[(location, date)]
        # API call logic here, using location and date. Store result in weather_cache and return it
        api_key = '8b465fd822a465952655ef5bf0ca24b9'
        response = requests.get(f'http://api.openweathermap.org/data/2.5/weather?q={location}&appid={api_key}&units=metric')
        if response.status_code == 200:
            temperature = response.json()['main']['temp']
            weather_cache[(location, date)] = temperature
            return temperature
        else:
            return 'N/A'  # In case the API call fails
    
    def get_actual_event_date(location, year, day_number):
        day_number = int(day_number)
        start_date = datetime.strptime(olympics_schedule[location][year]['start'], '%Y-%m-%d')
        event_date = start_date + timedelta(days=day_number - 1)
        return event_date.strftime('%Y-%m-%d')
    
    df['Day'] = df['Event'].apply(lambda event: olympics_event_lookup.get(event, 'N/A'))
    df['Date'] = df.apply(lambda row: get_actual_event_date(row['Location'], str(row['Year']), row['Day']), axis=1)
    df['Temperature'] = df.apply(lambda row: get_temperature(row['Location'], row['Date']), axis=1)
    
    return df

# open the csv file for reading and a new csv file for writing
with open('results.csv', 'r', newline='', encoding='utf-8') as infile, open('processed_results.csv', 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # read and process each row
    for row in reader:
        # check if the row has extra fields
        if len(row) > 8:
            # truncate the row to expected number of fields
            row = row[:8]
        
        # write the processed row to the new csv file
        writer.writerow(row)

# read the processed csv file into a dataframe
df = pd.read_csv('processed_results.csv')

# cleaning
df = df[df.isnull().sum(axis=1) < 1]

# Update the dataframe with the 'Day', 'Date', and 'Temperature' columns
df_updated = update_dataframe_with_event_date_and_temp(df, olympics_schedule, olympics_event_lookup)

# Save the updated dataframe
df_updated.to_csv('final_results.csv', index=False)

KeyError: '1984'

In [90]:
from datetime import datetime, timedelta
import pandas as pd
from numpy import random

# given data
olympics_schedule = {
    'Rio': {'2016': {'start': '2016-08-05', 'end': '2016-08-21'}},
    'Beijing': {'2008': {'start': '2008-08-08', 'end': '2008-08-24'}},
    'Athens': {'2004': {'start': '2004-08-13', 'end': '2004-08-29'}},
    'Sydney': {'2000': {'start': '2000-09-15', 'end': '2000-09-24'}},
    'Atlanta': {'1996': {'start': '1996-07-19', 'end': '1996-08-04'}},
    'Barcelona': {'1992': {'start': '1992-07-25', 'end': '1992-08-09'}},
    'Seoul': {'1988': {'start': '1988-09-17', 'end': '1988-09-26'}},
    'Los Angeles': {'1984': {'start': '1984-07-28', 'end': '1984-08-12'}},
    'Moscow': {'1980': {'start': '1980-07-19', 'end': '1980-08-03'}},
    'Montreal': {'1976': {'start': '1976-07-17', 'end': '1976-08-01'}},
    'Munich': {'1972': {'start': '1972-08-26', 'end': '1972-09-10'}},
    'Mexico City': {'1968': {'start': '1968-10-12', 'end': '1968-10-27'}},
    'Tokyo': {'1964': {'start': '1964-10-10', 'end': '1964-10-24'}},
    'Rome': {'1960': {'start': '1960-08-25', 'end': '1960-09-11'}},
    'Melbourne': {'1956': {'start': '1956-11-22', 'end': '1956-12-01'}},
    'Helsinki': {'1952': {'start': '1952-07-19', 'end': '1952-08-03'}},
    'London': {'1948': {'start': '1948-07-29', 'end': '1948-08-07'}},
    'Berlin': {'1936': {'start': '1936-08-01', 'end': '1936-08-16'}},
    'Los Angeles': {'1932': {'start': '1932-07-30', 'end': '1932-08-14'}},
    'Amsterdam': {'1928': {'start': '1928-07-28', 'end': '1928-08-12'}},
    'Paris': {'1924': {'start': '1924-07-05', 'end': '1924-07-14'}},
    'Antwerp': {'1920': {'start': '1920-08-14', 'end': '1920-08-29'}},
    'Stockholm': {'1912': {'start': '1912-07-06', 'end': '1912-07-15'}},
    'London': {'1908': {'start': '1908-04-27', 'end': '1908-05-06'}},
    'St. Louis': {'1904': {'start': '1904-07-01', 'end': '1904-07-10'}},
    'Paris': {'1900': {'start': '1900-05-14', 'end': '1900-05-28'}},
    'Athens': {'1896': {'start': '1896-04-06', 'end': '1896-04-15'}}
}

track_field_schedule = [
    'Marathon Men', '100M Men', 'Long Jump Men', '20Km Race Walk Men', 'Shot Put Men',
    'Decathlon Men', '400M Hurdles Men', 'Triple Jump Men', 'High Jump Women', '3000M Steeplechase Men',
    'Pole Vault Men', 'Discus Throw Women', '800M Men', '110M Hurdles Men', 'Hammer Throw Women',
    '200M Men', '400M Women', 'Javelin Throw Men', '10000M Men', '400M Hurdles Women',
    'Long Jump Women', '20Km Race Walk Women', '100M Hurdles Women', 'Shot Put Women', '1500M Men',
    '5000M Women', 'Triple Jump Women', 'Marathon Women', 'High Jump Men', 'Discus Throw Men',
    '800M Women', '200M Women', 'Pole Vault Women', '5000M Men', 'Javelin Throw Women',
    '1500M Women', 'Hammer Throw Men', 'Decathlon Men'
]


def estimate_event_day(start_date, end_date, events):
    ''' Estimates the day of each event based on the total duration of the Olympics. '''
    start_date = datetime.strptime(start_date, '%Y-%m-%d')
    end_date = datetime.strptime(end_date, '%Y-%m-%d')
    total_days = (end_date - start_date).days + 1
    events_per_day = len(events) // total_days
    additional_events = len(events) % total_days

    event_day_map = {}
    event_index = 0
    for day in range(total_days):
        for _ in range(events_per_day + (1 if day < additional_events else 0)):
            if event_index < len(events):
                event_name = events[event_index].lower().strip()
                event_day_map[event_name] = day + 1
                event_index += 1
    return event_day_map

def prepare_olympics_data(olympics_schedule, track_field_schedule):
    ''' Prepares and saves the Olympics data with estimated event days and dates. '''
    processed_data = []
    for location, years in olympics_schedule.items():
        for year, dates in years.items():
            event_day_map = estimate_event_day(dates['start'], dates['end'], track_field_schedule)
            for event, day in event_day_map.items():
                event_date = datetime.strptime(dates['start'], '%Y-%m-%d') + timedelta(days=day - 1)
                processed_data.append([location.lower().strip(), year, event.lower().strip(), day, event_date.strftime('%Y-%m-%d')])
    df = pd.DataFrame(processed_data, columns=['Location', 'Year', 'Event', 'Day', 'Date'])
    df.to_csv('olympics_events_schedule.csv', index=False)
    return df

df = prepare_olympics_data(olympics_schedule, track_field_schedule)

# Load the processed results CSV
results_df = pd.read_csv('processed_results.csv')

# Adjust for consistent formatting
results_df['Location'] = results_df['Location'].apply(lambda x: x.lower().strip())
results_df['Event'] = results_df['Event'].apply(lambda x: x.lower().strip())
results_df['Year'] = results_df['Year'].astype(str)

# Create the event_date_map
event_date_map = {f"{row['Location']}_{row['Year']}_{row['Event']}": row['Date'] for index, row in df.iterrows()}

def map_event_to_date(row):
    ''' Maps each event to its date. '''
    identifier = f"{row['Location']}_{row['Year']}_{row['Event']}"
    return event_date_map.get(identifier)

results_df['Date'] = results_df.apply(map_event_to_date, axis=1)

def assign_dates_to_missing_events(row):
    ''' Assigns dates to events without a date based on a normal distribution. '''
    if pd.isna(row['Date']):
        location = row['Location'].title()
        year = row['Year']
        
        # Check if the location and year are in the olympics_schedule
        if location in olympics_schedule and year in olympics_schedule[location]:
            start_date = datetime.strptime(olympics_schedule[location][year]['start'], '%Y-%m-%d')
            end_date = datetime.strptime(olympics_schedule[location][year]['end'], '%Y-%m-%d')
            total_days = (end_date - start_date).days + 1

            mean = total_days / 2
            std_dev = total_days / 6
            random_day = int(random.normal(mean, std_dev, 1)[0])
            random_day = max(1, min(random_day, total_days))
            
            new_date = start_date + timedelta(days=random_day - 1)
            return new_date.strftime('%Y-%m-%d')
        else:
            return None
    else:
        return row['Date']

weather_cache = {}

def get_temperature(location, date):
    if (location, date) in weather_cache:
        return weather_cache[(location, date)]
    # API call logic here, using location and date. Store result in weather_cache and return it
    api_key = '8b465fd822a465952655ef5bf0ca24b9'
    response = requests.get(f'http://api.openweathermap.org/data/2.5/weather?q={location}&appid={api_key}&units=metric')
    if response.status_code == 200:
        temperature = response.json()['main']['temp']
        weather_cache[(location, date)] = temperature
        return temperature
    else:
        return None

def assign_dates_and_temperatures(row):
    '''Assigns dates to events without a date and fetches temperature for all events.'''
    if pd.isna(row['Date']):
        row['Date'] = assign_dates_to_missing_events(row)
    
    # Fetch temperature for each event
    location = row['Location'].title()
    date = row['Date']
    row['Temperature'] = get_temperature(location, date)
    return row

# Ensure your DataFrame has a 'Temperature' column
results_df['Temperature'] = None

# Apply the function to fill missing dates and temperatures
results_df = results_df.apply(assign_dates_and_temperatures, axis=1)

# Save the updated DataFrame
results_df.to_csv('updated_results_with_temperatures.csv', index=False)

results_df

  return Timeout(connect=self._connect, read=self._read, total=self.total)


Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result,Date,Temperature
0,M,10000m men,rio,2016,G,Mohamed FARAH,USA,25:05.17,2016-08-12,14.86
1,M,10000m men,rio,2016,S,Paul Kipngetich TANUI,KEN,27:05.64,2016-08-12,14.86
2,M,10000m men,rio,2016,B,Tamirat TOLA,ETH,27:06.26,2016-08-12,14.86
3,M,10000m men,beijing,2008,G,Kenenisa BEKELE,ETH,27:01.17,2008-08-15,8.94
4,M,10000m men,beijing,2008,S,Sileshi SIHINE,ETH,27:02.77,2008-08-15,8.94
...,...,...,...,...,...,...,...,...,...,...
2389,W,triple jump women,athens,2004,S,Hrysopiyi DEVETZI,GRE,15.25,,18.20
2390,W,triple jump women,athens,2004,B,Tatyana LEBEDEVA,RUS,15.14,,18.20
2391,W,triple jump women,atlanta,1996,G,Inessa KRAVETS,UKR,15.33,1996-07-30,17.69
2392,W,triple jump women,atlanta,1996,S,Inna LASOVSKAYA,RUS,14.98,1996-07-30,17.69


In [91]:
df['Date'].value_counts()

Date
1896-04-08    4
1896-04-06    4
1904-07-08    4
1904-07-07    4
1912-07-06    4
             ..
2016-08-14    2
2016-08-15    2
2016-08-16    2
2016-08-17    2
2016-08-18    2
Name: count, Length: 329, dtype: int64

In [92]:
results_df[results_df['Date'].isna()]

Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result,Date,Temperature
12,M,10000m men,los angeles,1984,G,Alberto COVA,ITA,27:47.54,,24.38
13,M,10000m men,los angeles,1984,S,Michael MCLEOD,GBR,28:06.22,,24.38
14,M,10000m men,los angeles,1984,B,Michael MUSYOKI,KEN,28:06.46,,24.38
18,M,10000m men,mexico,1968,G,Naftali TEMU,KEN,29:27.4,,29.84
19,M,10000m men,mexico,1968,S,Mamo WOLDE,ETH,29:28.0,,29.84
...,...,...,...,...,...,...,...,...,...,...
2386,W,triple jump women,london,2012,S,Caterine IBARGUEN,COL,14.8,,12.89
2387,W,triple jump women,london,2012,B,Olga SALADUKHA,UKR,14.79,,12.89
2388,W,triple jump women,athens,2004,G,Francoise MBANGO ETONE,CMR,15.3,,18.20
2389,W,triple jump women,athens,2004,S,Hrysopiyi DEVETZI,GRE,15.25,,18.20


In [98]:
results_df[results_df['Temperature'].isna()].count().sum()

455

In [95]:
results_df[['Location','Temperature']].value_counts()

Location     Temperature
london       12.89          253
athens       18.20          173
los angeles  24.38          156
rio          14.86          141
sydney       19.57          140
atlanta      17.69          130
beijing      8.94           128
barcelona    12.15          125
moscow       13.81          111
paris        12.49          111
tokyo        14.57          102
rome         17.80           99
montreal     11.88           96
helsinki     5.68            93
munich       3.48            81
mexico       29.84           75
berlin       8.13            66
stockholm    6.46            63
amsterdam    12.74           57
st louis     12.21           51
antwerp      12.47           42
los angeles  24.04           25
seoul        13.51           18
Name: count, dtype: int64