<a href="https://colab.research.google.com/github/janilles/couch/blob/master/couch_runs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Boilerplate

In [0]:
# added -q for suppressing output
!pip install -q -U pymysql

import os

import altair as alt
import pandas as pd
import pymysql


# Database connection



In [0]:
def connect():
    """Open a new auto-committing PyMySQL connection to the database.

    Connection settings are read from environment variables so that no
    credentials have to be stored in the notebook itself:

    * ``COUCH_DB_HOST``     - server host name
    * ``COUCH_DB_PORT``     - server port (optional, defaults to 3306)
    * ``COUCH_DB_USER``     - user name
    * ``COUCH_DB_PASSWORD`` - password
    * ``COUCH_DB_NAME``     - schema to use

    Returns:
        The connection object returned by ``pymysql.connect``.

    Raises:
        KeyError: if any of the required variables is not set.
        ValueError: if ``COUCH_DB_PORT`` is set but is not an integer.
    """
    required = ('COUCH_DB_HOST', 'COUCH_DB_USER',
                'COUCH_DB_PASSWORD', 'COUCH_DB_NAME')
    missing = [name for name in required if name not in os.environ]
    if missing:
        # fail early with a readable message instead of a driver error
        raise KeyError(
            "Missing database settings: " + ", ".join(missing))

    return pymysql.connect(
        host=os.environ['COUCH_DB_HOST'],
        port=int(os.environ.get('COUCH_DB_PORT', 3306)),
        user=os.environ['COUCH_DB_USER'],
        passwd=os.environ['COUCH_DB_PASSWORD'],
        db=os.environ['COUCH_DB_NAME'],
        autocommit=True
        )

# Module-level connection opened once when this cell runs; sql_to_df() below
# passes it to pandas for every query.
connection = connect()

def sql_to_df(sql, params=None):
    """Run a SQL statement on the shared connection and return a DataFrame.

    Args:
        sql: SQL text to execute.
        params: optional values bound to the placeholders in ``sql`` by the
            database driver (PyMySQL accepts ``%s`` / ``%(name)s`` markers).
            Use this instead of string formatting whenever a query depends
            on variable input. Defaults to ``None`` (no parameters).

    Returns:
        pandas.DataFrame holding the result set, one column per selected
        field.
    """
    return pd.read_sql(sql, con=connection, params=params)


# Database tables

In [0]:
# list every table available in the connected schema
tables = sql_to_df(sql="SHOW TABLES")

# last expression of the cell, so the frame is rendered as output
tables


## Tables described

In [0]:
# column definitions of the c25k_user_goal table
sql_to_df(sql="DESCRIBE c25k_user_goal")

In [0]:
# column definitions of the c25k_device table
sql_to_df(sql="DESCRIBE c25k_device")

In [0]:
# column definitions of the c25k_run table
sql_to_df(sql="DESCRIBE c25k_run")

In [0]:
# column definitions of the c25k_trainer_log table
sql_to_df(sql="DESCRIBE c25k_trainer_log")

In [0]:
# column definitions of the error_log table
sql_to_df(sql="DESCRIBE error_log")

# Data sense-checking

## Users with too many runs started in one day

In [0]:
# One row per device and calendar day with the number of runs started on
# that day, sorted so the busiest device-days come first.
started_runs_by_day = sql_to_df(sql="""
    SELECT
        device_id,
        DATE(started_at) AS date,
        COUNT(id) AS runs_started
    FROM
        c25k_run
    GROUP BY
        device_id,
        date
    ORDER BY
        runs_started DESC""")

# show the ten busiest device-days
started_runs_by_day.head(n=10)


### In proportion with all users running

In [0]:
# Number of distinct devices that have at least one run recorded.
# The COUNT query returns a one-row frame; .sum() reduces it to a number.
all_users_running = sql_to_df("""
    SELECT
        COUNT(DISTINCT device_id) AS runners
    FROM
        c25k_run""")['runners'].sum()

# conditions for dataframe slicing (rows are device-days)
condition_3 = started_runs_by_day['runs_started'] > 3
condition_1 = started_runs_by_day['runs_started'] == 1

# distinct devices having at least one day that meets each condition
more_than_3 = started_runs_by_day.loc[condition_3, 'device_id'].nunique()
just_1_started = started_runs_by_day.loc[condition_1, 'device_id'].nunique()

# calculations rounded to 1 decimal
percent_3 = round(more_than_3 / all_users_running * 100, 1)
percent_1 = round(just_1_started / all_users_running * 100, 1)

# wording matches the `> 3` condition above (four or more runs in a day)
print(f"All users with runs in the database: {all_users_running}")
print(f"Users with more than three started runs in one day: {more_than_3}")
print(f"Percentage of more than three from all users: {percent_3}%")
print(f"Just one started run in a day: {just_1_started}")
print(f"Percentage of just one from all users: {percent_1}%")


## Users with too many runs finished in one day

In [0]:
# One row per device and calendar day (by start date) with the number of
# runs flagged as finished, largest counts first.
runs_finished_by_day = sql_to_df(sql="""
    SELECT
        device_id,
        DATE(started_at) AS date,
        COUNT(id) AS runs_finished
    FROM
        c25k_run
    WHERE
        has_finished = 1
    GROUP BY
        device_id,
        date
    ORDER BY
        runs_finished DESC""")

# show the ten device-days with the most finished runs
runs_finished_by_day.head(n=10)


### In proportion with all users finishing runs

In [0]:
# Distinct devices with at least one finished run.
# The COUNT query yields a single-cell frame; .sum() unwraps it to a number.
all_users_finishing = sql_to_df(sql="""
    SELECT
        COUNT(DISTINCT device_id) AS finishers
    FROM
        c25k_run
    WHERE
        has_finished = 1
    """)['finishers'].sum()

# row masks over the per-device, per-day finished counts
condition_2f = runs_finished_by_day['runs_finished'].gt(1)
condition_1f = runs_finished_by_day['runs_finished'].eq(1)

# distinct devices having at least one day that matches each mask
finished_2plus = runs_finished_by_day.loc[condition_2f, 'device_id'].nunique()
finished_1 = runs_finished_by_day.loc[condition_1f, 'device_id'].nunique()

# shares of all finishing devices, to one decimal place
percent_2f = round(finished_2plus / all_users_finishing * 100, 1)
percent_1f = round(finished_1 / all_users_finishing * 100, 1)

print("\n".join([
    f"All users with runs finished: {all_users_finishing}",
    f"Users with two or more runs finished in one day: {finished_2plus}",
    f"Percentage of two or more... from all users: {percent_2f}%",
    f"Just one finished run in a day: {finished_1}",
    f"Percentage of just one from all users: {percent_1f}%",
]))


## Users with many runs finished in a day by device OS

In [0]:
# Finished runs per device and calendar day, with the device's OS attached.
# device_os is part of GROUP BY because it is selected without an aggregate:
# MySQL rejects that under ONLY_FULL_GROUP_BY and otherwise picks an
# arbitrary value per group.
runs_finished_by_day_os = sql_to_df("""
    SELECT
        device_id,
        device_os,
        DATE(started_at) AS date,
        COUNT(id) AS runs_finished
    FROM
        c25k_run
    WHERE
        has_finished = 1
    GROUP BY
        device_id,
        device_os,
        date
    ORDER BY
        runs_finished DESC""")

runs_finished_by_day_os.head()


In [0]:
# just a sample of the worst cases: device-days with ten or more finished runs
ten_or_more = runs_finished_by_day_os['runs_finished'] > 9

# Count distinct devices per OS. A device with several such days appears in
# several rows, so a plain row count would overstate the number of users.
runs_finished_by_day_os.loc[ten_or_more] \
    .groupby('device_os')['device_id'] \
    .nunique() \
    .reset_index()


# Runs overview

In [0]:
# one row per run with the flags needed for the headline figures below
overview = sql_to_df("""
    SELECT
        id,
        device_id,
        device_os,
        has_reached_halfway,
        has_finished,
        lat
    FROM
        c25k_run
    """)

_runners = overview['device_id'].nunique()
print(f"Users running: {_runners}")

_started = overview['id'].nunique()
print(f"Runs started:  {_started}")

_condition_h = overview['has_reached_halfway'] == 1
_halfway = overview.loc[_condition_h, 'id'].nunique()
print(f"Runs halfway:  {_halfway}")

_condition_f = overview['has_finished'] == 1
_finished = overview.loc[_condition_f, 'id'].nunique()
print(f"Runs finished: {_finished}")

# A run has location data when its latitude is present and non-zero.
# Missing values compare as "!= 0", so they must be excluded explicitly or
# runs without a latitude would be counted as located.
_condition_lat = overview['lat'].notna() & (overview['lat'] != 0)
_runs_with_loc = overview.loc[_condition_lat, 'id'].nunique()
_calc_runs = round(_runs_with_loc / _started * 100, 1)
print(f"Runs with location data: {_calc_runs}%")

_runners_with_loc = overview.loc[_condition_lat, 'device_id'].nunique()
_calc_runners = round(_runners_with_loc / _runners * 100, 1)
print(f"Users with location data: {_calc_runners}%")


# Completion rates for each run 

In [0]:
# Per-run progress flags, restricted to iOS devices and a single trainer
# (see the WHERE clause); used for the completion rates computed next.
runs = sql_to_df(sql="""
    SELECT
        id,
        week_no,
        run_no,
        has_reached_halfway,
        has_finished
    FROM
        c25k_run
    WHERE
        device_os = 'ios'
        AND
        trainer_id = 'Sarah Millican'
    """)

# preview the first rows
runs.head(n=5)

In [0]:
# Programme layout covered by this table: 9 weeks with 3 runs per week.
N_WEEKS = 9
RUNS_PER_WEEK = 3

# per-run counters, in programme order (week 1 run 1, week 1 run 2, ...)
Starts = []
Halfway = []
Finished = []

# flag masks do not depend on the week/run, so compute them once
hway_slice = runs['has_reached_halfway'] == 1
fin_slice = runs['has_finished'] == 1

for week in range(1, N_WEEKS + 1):
    for run in range(1, RUNS_PER_WEEK + 1):
        # rows belonging to this week/run combination
        run_slice = (runs['week_no'] == week) & (runs['run_no'] == run)

        # counting run IDs where conditions are met
        Starts.append(runs.loc[run_slice, 'id'].count())
        Halfway.append(runs.loc[run_slice & hway_slice, 'id'].count())
        Finished.append(runs.loc[run_slice & fin_slice, 'id'].count())

# Run_no is derived from the number of collected rows so it always matches
# the loop bounds above (1..27 for the default 9 x 3 layout).
run_completions = pd.DataFrame(data={
    'Run_no': list(range(1, len(Starts) + 1)),
    'Starts': Starts,
    'Halfway': Halfway,
    'Finished': Finished})


def _percentage(numerator, denominator):
    """Return numerator / denominator as a percentage rounded to 1 decimal.

    Both arguments are column names of ``run_completions``. A zero
    denominator is not special-cased; pandas yields NaN or inf for such rows.
    """
    return round(
        run_completions[numerator] /
        run_completions[denominator]*100,
        1)


# create columns for each rate
run_completions['Halfway_from_start_%'] = _percentage('Halfway', 'Starts')
run_completions['Finished_from_start_%'] = _percentage('Finished', 'Starts')
run_completions['Finished_from_halfway_%'] = _percentage('Finished', 'Halfway')

# add conditional formatting to relevant columns
run_completions \
    .style \
    .background_gradient(
        cmap='RdYlGn',
        subset=['Halfway_from_start_%',
                'Finished_from_start_%',
                'Finished_from_halfway_%'])


### Export dataframe as CSV

In [0]:
# from google.colab import files

# run_completions.to_csv('df.csv')
# files.download('df.csv')

## Runs by day

In [0]:
# Runs started per calendar day inside the fixed window given in the query.
# DATE() arrives in pandas as a plain object column, so it is converted to a
# real datetime for Altair's temporal axis.
runs_by_date = sql_to_df(sql="""
    SELECT
        DATE(started_at) AS date,
        COUNT(id) AS runs_started
    FROM
        c25k_run
    WHERE
        DATE(started_at) BETWEEN '2019-04-28' AND '2019-05-22' -- data cleaning
    GROUP BY
        date
    """).assign(date=lambda frame: pd.to_datetime(frame['date']))

# daily bars (pattern taken from the Altair example gallery)
bar = alt.Chart(runs_by_date).mark_bar().encode(x='date:T', y='runs_started:Q')

# horizontal red line at the mean number of runs per day
rule = alt.Chart(runs_by_date).mark_rule(color='red').encode(y='mean(runs_started):Q')

alt.layer(bar, rule).properties(width=600)


## Runs by hour of day

In [0]:
# Average number of runs started per weekday and hour of day.
# The inner query counts runs per calendar date and hour; the outer query
# averages those daily counts for each weekday/hour pair. Without the date in
# the inner GROUP BY every weekday/hour group has a single row, and AVG()
# simply returns the overall total.
# Note: date/hour combinations with no runs produce no row, so the average is
# taken over the days that had at least one run in that hour.
runs_by_hour = sql_to_df("""
    SELECT
        s.weekday,
        s.hour,
        AVG(s.runs) AS avg_runs
    FROM
        (
        SELECT
            DATE(started_at) AS date,
            WEEKDAY(started_at) AS weekday,
            HOUR(started_at) AS hour,
            COUNT(id) AS runs
        FROM
            c25k_run
        GROUP BY
            date,
            weekday,
            hour
        ) s
    GROUP BY
        s.weekday,
        s.hour
    """)

runs_by_hour.head()


In [0]:
# hovering near a point selects the whole series of that weekday
highlight = alt.selection(type='single',
                          on='mouseover',
                          fields=['weekday'],
                          nearest=True)

# shared encoding: one coloured series per weekday across the hours of the day
base = alt.Chart(runs_by_hour,
                 title='Avg. number of runs by hour of day').encode(
    x='hour:N',
    y='avg_runs:Q',
    color='weekday:N'
)

# invisible points that only exist to capture the mouse-over selection
points = base.mark_circle().encode(
    opacity=alt.value(0)
).add_selection(
    highlight
).properties(
    width=600
)

# the selected weekday is drawn with a thicker line than the others
lines = base.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1), alt.value(3))
)

points + lines

## Runs started

In [0]:
# distinct devices that started each run of each programme week
run_starts = sql_to_df(sql="""
    SELECT
        COUNT(DISTINCT device_id) AS users,
        week_no,
        run_no
    FROM
        c25k_run        
    GROUP BY
        week_no,
        run_no
    """)

# one small bar chart per week (column facet), one bar per run number
alt.Chart(run_starts, title='Runs started').mark_bar().encode(
    x='run_no:O',
    y='users:Q',
    color='run_no:N',
    column='week_no:N',
)
