In [24]:
import os
from dotenv import load_dotenv
import psycopg2
import requests
from typing import Dict, List
import logging 
# Read settings from the project's .env file; override=True is passed so the
# file's values are asked to take precedence over variables already set.
load_dotenv('/home/wjones/CC/Capstone/tbd2/Track/.env', override=True)

# Send INFO-and-above records to db_insert.log; filemode 'w' starts a fresh
# log file each time this cell runs.
logging.basicConfig(
    filename='db_insert.log',
    filemode='w',
    format='%(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Database credentials, one per environment variable (None when unset).
db_name, db_user, db_password, db_host, db_port = (
    os.getenv(env_key)
    for env_key in ('DB_NAME', 'DB_USER', 'DB_PASS', 'DB_HOST', 'DB_PORT')
)

# Keyword arguments later unpacked into psycopg2.connect() by connect_db().
db_params = dict(
    dbname=db_name,
    user=db_user,
    password=db_password,
    host=db_host,
    port=db_port,
)

In [25]:
def connect_db(db_params):
    """Open and return a new psycopg2 connection.

    Args:
        db_params: Mapping of keyword arguments that is unpacked into
            ``psycopg2.connect`` (the notebook builds it with the keys
            dbname, user, password, host and port).

    Returns:
        Whatever ``psycopg2.connect`` returns; the caller is responsible
        for closing it.
    """
    return psycopg2.connect(**db_params)

In [5]:
import pandas as pd

# Load csv/races.csv into a DataFrame.
df = pd.read_csv('csv/races.csv')

# Extracts the third-from-last '/'-separated segment of the URL
# (presumably the numeric meet id in the event URL -- verify against the data).
def extract_id(url):
    """Return the third-from-last path segment of ``url``.

    Args:
        url: The event URL. Anything that is not a non-empty string
            (None, NaN from an empty CSV cell, '') is treated as missing.

    Returns:
        The segment at index -3 after splitting on '/', or None when the
        value is missing or has fewer than three segments.
    """
    # pandas hands empty cells to .apply() as float NaN, which is truthy and
    # has no .split(); only real, non-empty strings are processed.
    if not isinstance(url, str) or not url:
        return None

    segments = url.split('/')
    # Too-short URLs used to raise IndexError; report them as missing instead.
    if len(segments) < 3:
        return None
    return segments[-3]

# Derive one meet id per row from 'event_url' and store it in 'tffrs_meet_id'
meet_ids = df['event_url'].apply(extract_id)
df['tffrs_meet_id'] = meet_ids

# Write the updated frame to a new CSV file (without the index column)
df.to_csv('csv/merged_with_section_id.csv', index=False)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [29]:
import csv

# Load updated_race_urls.csv into a dictionary keyed by meet URL.
# newline='' is what the csv module documents for files handed to a reader.
with open('csv/updated_race_urls.csv', 'r', newline='') as f:
    reader = csv.DictReader(f)
    meet_fieldnames = list(reader.fieldnames or [])
    dict_one = {row['meet_url']: row for row in reader}

# Load merged_with_section_id.csv into a list
with open('csv/merged_with_section_id.csv', 'r', newline='') as f:
    reader = csv.DictReader(f)
    race_fieldnames = list(reader.fieldnames or [])
    list_two = list(reader)

# Group the race rows by meet URL once, instead of rescanning the whole list
# for every meet; file order is preserved inside each group.
rows_by_meet_url = {}
for row_two in list_two:
    rows_by_meet_url.setdefault(row_two['meet_url'], []).append(row_two)

# Merge the data: one output row per (meet, race) pair sharing a meet URL.
# On duplicate column names the race row's value wins.
merged_data = [
    {**row_one, **row_two}
    for url, row_one in dict_one.items()
    for row_two in rows_by_meet_url.get(url, [])
]

# Header comes from the first merged row; when nothing matched, fall back to
# the combined input headers so an empty result no longer raises IndexError.
if merged_data:
    merged_fieldnames = list(merged_data[0].keys())
else:
    merged_fieldnames = list(dict.fromkeys(meet_fieldnames + race_fieldnames))

# Write the merged data to a new CSV file
with open('csv/merged_data.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=merged_fieldnames)
    writer.writeheader()
    writer.writerows(merged_data)

In [11]:
## Add a 'sex' column to each race, derived from the text of its event URL

import csv


with open('csv/merged_data.csv', 'r') as file:
    data = list(csv.DictReader(file))

# 'M' when the URL contains 'Men', else 'F' when it contains 'Women', else ''.
# 'Men' is tested first; the check is case-sensitive.
for row in data:
    event_url = row['event_url']
    row['sex'] = 'M' if 'Men' in event_url else ('F' if 'Women' in event_url else '')

# Column order of the output file
fieldnames = ['meet_url','event_name','event_url','tffrs_meet_id','meet_name','date','location', 'sex']

# Write the updated rows to the final CSV file
with open('csv/final_race.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

In [12]:
import datefinder
import re

# Parses a meet date string into 'YYYY-MM-DD'; for a date range the first day
# is returned. Ranges are handled by an explicit pattern because handing a
# whole range such as "September 29-30, 2023" to datefinder gave wrong dates.
def find_date_in_string(input_string):
    """Return the (start) date found in ``input_string`` as 'YYYY-MM-DD'.

    Recognised range forms, anywhere in the string:
        "September 29-30, 2023"
        "September 29-September 30, 2023"
        "Sep 30 - Oct 1, 2023"
    Anything else is passed to ``datefinder`` unchanged.

    Args:
        input_string: Text containing a date; None is treated as ''.

    Returns:
        The date formatted with '%Y-%m-%d', or None when no date is found
        (a message is printed in that case).
    """
    # Local import so this cell does not depend on a separate import cell.
    from datetime import datetime

    # Collapse runs of whitespace (including newlines) to single spaces.
    input_string = re.sub(r'\s+', ' ', input_string or '').strip()

    # Month + start day, a hyphen or en dash, an optional second month, the
    # end day, then the year. Only the start day is captured.
    match = re.search(
        r'([A-Za-z]+)\.? (\d{1,2}) ?[-\u2013] ?(?:[A-Za-z]+\.? )?\d{1,2},? ?(\d{4})',
        input_string,
    )

    if match:
        month, day, year = match.groups()
        start_date_str = f'{month} {day}, {year}'

        # Deterministic parse first: full month name, then abbreviation.
        for date_format in ('%B %d, %Y', '%b %d, %Y'):
            try:
                return datetime.strptime(start_date_str, date_format).strftime('%Y-%m-%d')
            except ValueError:
                continue

        # Unusual month spellings: let datefinder try the start date only.
        start_dates = list(datefinder.find_dates(start_date_str))
        if start_dates:
            return start_dates[0].strftime('%Y-%m-%d')
        print(f"No start date found in string: {input_string}")
        return None

    # Not a range: take the first date datefinder reports, if any.
    matches = list(datefinder.find_dates(input_string))
    if matches:
        return matches[0].strftime('%Y-%m-%d')
    print(f"No date found in string: {input_string}")
    return None

In [21]:
# CSV columns available on each row:
# meet_url,event_name,event_url,tffrs_meet_id,meet_name,date,location,sex
def insert_race(conn, race: Dict):
    """Insert one race into the Races table, updating it on conflict.

    The row is keyed on ``tfrrs_meet_id``: when a row with the same id
    already exists, its other columns are overwritten with the new values.
    NOTE(review): rows that share a tfrrs_meet_id therefore replace one
    another -- confirm that is intended if a meet has several sections.

    Args:
        conn: An open database connection providing ``cursor()``,
            ``commit()`` and ``rollback()``.
        race: Mapping with the keys meet_name, event_name, event_url, date,
            sex, location and tffrs_meet_id (note the CSV spelling "tffrs",
            which maps to the ``tfrrs_meet_id`` column).

    Raises:
        KeyError: If ``race`` lacks one of the keys above.
        Exception: Any database error is re-raised after the transaction
            has been rolled back.
    """
    # CSV 'event_name' feeds the 'section' column, 'event_url' feeds 'tfrrs_url'.
    params = (race['meet_name'], race['event_name'], race['event_url'], race['date'], race['sex'], race['location'], race['tffrs_meet_id'])

    try:
        with conn.cursor() as cur:
            cur.execute("""
            INSERT INTO Races (meet_name, section, tfrrs_url, date, sex, location, tfrrs_meet_id)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (tfrrs_meet_id) DO UPDATE SET
                meet_name = EXCLUDED.meet_name,
                section = EXCLUDED.section,
                tfrrs_url = EXCLUDED.tfrrs_url,
                date = EXCLUDED.date,
                sex = EXCLUDED.sex,
                location = EXCLUDED.location,
                tfrrs_meet_id = EXCLUDED.tfrrs_meet_id;
        """, params)
        conn.commit()
    except Exception:
        # A failed statement leaves the transaction aborted; without a
        # rollback every later statement on this connection would fail too.
        conn.rollback()
        logging.exception("Failed to insert race with tfrrs_meet_id=%r", params[-1])
        raise
        conn.commit()

In [22]:
import csv
def main():
    """Load csv/final_race.csv and upsert every row into the database.

    Opens a connection with ``connect_db(db_params)``, reads all rows of the
    CSV as dicts, and passes each one to ``insert_race``. The connection is
    closed even when reading the file or an insert raises.
    """
    conn = connect_db(db_params)
    try:
        with open('csv/final_race.csv', 'r', newline='') as file:
            data = list(csv.DictReader(file))

        for race in data:
            # Date normalisation through find_date_in_string() is left
            # disabled here, as before; the 'date' value is inserted as read.
            insert_race(conn, race)
    finally:
        # Always release the connection, including on failure.
        conn.close()

In [23]:
# Run the import: read csv/final_race.csv and insert every row via insert_race().
main()