In [None]:
import pandas as pd
import numpy as np
import datetime
import time
import matplotlib.pyplot as plt
from sklearn import linear_model
import pickle

In [None]:
# Reads the race results CSV into a DataFrame and, in the same step,
# discards every column this project does not use
df = pd.read_csv('marathon_results_2017.csv').drop(columns=[
    'Unnamed: 0', 'Bib', 'Name', 'City', 'State', 'Country',
    'Citizen', 'Unnamed: 9', 'Overall', 'Gender', 'Division', 'Proj Time',
    'Age', 'M/F',
])

In [None]:
# Gives the remaining columns names that are valid Python identifiers,
# so they can be reached with attribute access (e.g. df.five_k)
df = df.rename(columns={
    "5K": "five_k",
    "10K": "ten_k",
    "15K": "fifteen_k",
    "20K": "twenty_k",
    "Half": "half",
    "25K": "twentyfive_k",
    "30K": "thirty_k",
    "35K": "thirtyfive_k",
    "40K": "forty_k",
    "Pace": "pace",
    "Official Time": "official_time",
})

In [None]:
# Function to convert 'H:MM:SS' time strings in a DataFrame to int seconds
# '-' placeholders are left exactly as they are
def _time_cell_to_seconds(value):
    """Return the seconds for one 'H:MM:SS' cell; pass every other value through."""
    # Non-strings (e.g. ints from an earlier run, NaN) and '-' are not time strings
    if not isinstance(value, str) or value == '-':
        return value
    parts = value.split(':')
    # datetime.time validates the ranges (hour 0-23, minute/second 0-59)
    # and raises ValueError for anything outside them
    parsed = datetime.time(int(parts[0]), int(parts[1]), int(parts[2]))
    return parsed.hour*3600 + parsed.minute*60 + parsed.second


def string_to_seconds(df, columns=None):
    """Convert 'H:MM:SS' strings in ``df`` to integer seconds, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place; column labels are
        assumed to be unique.
    columns : iterable of column labels, optional
        Columns to convert. Defaults to every column of ``df``, so the
        function no longer depends on the frame having exactly 11 columns
        or a 0..n-1 index.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a string cell other than '-' is not a valid 'H:MM:SS' time.

    Notes
    -----
    Cells that are not strings are skipped, which makes the function safe
    to run more than once on the same frame.
    """
    target_columns = df.columns if columns is None else columns
    for column in target_columns:
        # astype(object) keeps plain Python ints next to any remaining '-'
        # strings instead of letting pandas pick a numeric dtype
        df[column] = df[column].map(_time_cell_to_seconds).astype(object)


In [None]:
# Converts the time strings held in df to seconds.
# string_to_seconds writes its results back into df and returns nothing,
# so the call is made purely for its side effect on df.
string_to_seconds(df)

In [None]:
# Function to remove rows containing '-' values from DataFrame
def clean_data(df):
    """Return a copy of ``df`` without the rows that contain a '-' placeholder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` in which no column equals '-', with their
        original index labels and column order. When every row is removed
        the result is empty but keeps the columns of ``df``.

    Notes
    -----
    All columns are inspected and any index is accepted. The row-by-row
    ``DataFrame.append`` used previously is not available in pandas 2.x and
    copied the accumulated frame on every iteration.
    """
    # isin compares safely across dtypes: numeric cells simply never match '-'
    has_placeholder = df.isin(['-']).any(axis=1)
    return df.loc[~has_placeholder].copy()

In [None]:
# Stores cleaned data in a new DataFrame
# clean_data only reads from df, so df itself stays as it was;
# cleaned_data is the frame that gets pickled at the end of the notebook
cleaned_data = clean_data(df)

In [None]:
# Trial regression: predicts the 40K split from the official finish time.
# Fitted on cleaned_data rather than df, because df can still hold '-'
# placeholders (string_to_seconds leaves them untouched) and those cannot be
# used as numeric regression values.
# matplotlib.pyplot and sklearn.linear_model are already imported in the first cell.
reg = linear_model.LinearRegression()
reg.fit(cleaned_data[['official_time']].astype(float), cleaned_data.forty_k.astype(float))

In [None]:
# Function to test time conversion to seconds and performing linear regressions
# Finalized function in Capstone_Functions.py file
def calc_splits(df, finish_hr=3, finish_min=0, finish_sec=0):
    """Predict and print the split times for a target finish time.

    One linear regression is fitted per split column, each with
    ``official_time`` as its only feature and the split column as target.
    Every model is then queried with the target finish time, and the
    predictions are printed as HH:MM:SS, one per line, in the order of
    ``split_columns`` below.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``official_time`` and the nine split columns named in
        ``split_columns``. NOTE(review): values are presumably numeric
        seconds without '-' placeholders; this is not checked here.
    finish_hr, finish_min, finish_sec : int, optional
        Target finish time. Defaults to 03:00:00, the value that used to be
        hard-coded for testing.

    Returns
    -------
    dict
        ``{'splits': {column: predicted seconds as float},
        'pace_per_mile': {column: rounded seconds per mile}}``.
        ``pace_per_mile`` has no ``'half'`` entry because the paces are
        taken over consecutive 5k segments.
    """
    split_columns = ('five_k', 'ten_k', 'fifteen_k', 'twenty_k', 'half',
                     'twentyfive_k', 'thirty_k', 'thirtyfive_k', 'forty_k')
    MILES_PER_FIVEK = 3.107

    # convert the target time to seconds
    # (finish_sec is a plain int; it has no `.second` attribute)
    input_in_seconds = finish_hr*3600 + finish_min*60 + finish_sec

    # linear regression for finish time against all splits
    finish_times = df[['official_time']]
    predicted_seconds = {}
    for column in split_columns:
        reg = linear_model.LinearRegression()
        reg.fit(finish_times, df[column])
        # predict returns an array with a single value for a single input row
        predicted_seconds[column] = float(reg.predict([[input_in_seconds]])[0])

    # convert seconds to HH:MM:SS format
    # exists in print statements to double-check output
    for column in split_columns:
        print(time.strftime('%H:%M:%S', time.gmtime(round(predicted_seconds[column]))))

    # Ability to track avg pace per mile per 5k split
    # Not included in final product
    pace_per_mile = {}
    previous_split = 0.0
    for column in split_columns:
        # ***Did not calculate for half-marathon as these are in 5k segments***
        if column == 'half':
            continue
        segment_seconds = predicted_seconds[column] - previous_split
        pace_per_mile[column] = round(segment_seconds/MILES_PER_FIVEK)
        previous_split = predicted_seconds[column]

    return {'splits': predicted_seconds, 'pace_per_mile': pace_per_mile}

In [None]:
# Stores the cleaned data used in final product
# This file is opened in the Capstone_Functions.py file
# NOTE: despite its name, model.pkl holds the cleaned DataFrame, not a fitted model
filename = 'model.pkl'

# The with-block closes the file handle even if pickle.dump raises,
# so the data is flushed to disk and the handle is not leaked
with open(filename, 'wb') as pickle_file:
    pickle.dump(cleaned_data, pickle_file)