In [415]:
import pandas as pd
import fnmatch
import os
import glob
import csv
import codecs
import boto3
import io
from io import StringIO
import dropbox
from config import dbx_token

# In-memory text buffer. In the visible part of this notebook it is only
# referenced by the commented-out S3 upload lines further down.
csv_buffer = StringIO()

# Load Data from Local directory

In [416]:
#def get_dbx_token():
    #!ln -s /Users/matthiashugli/Virtualenvs/youth-base/youth-base/config.py config.py
    #dbx = dropbox.Dropbox(dbx_token)

In [417]:

files_trainings = '/Users/matthiashugli/Dropbox/bucket/trainings-yb/s1_core_trm_player_unit_skills_*.csv'
files_participants = '/Users/matthiashugli/Dropbox/bucket/trainings-yb/s1_training_participants.csv'

skills_df = pd.DataFrame()
for filename in glob.glob(files_trainings):
    file = pd.read_csv(filename, header=1, delimiter=',', low_memory=False)
    file.insert(1, 'filename', filename)
    skills_df = skills_df.append(file)

participants_df = pd.DataFrame()
for filename in glob.glob(files_participants):
    file = pd.read_csv(filename, header=1, delimiter=',', low_memory=False)
    file.insert(1, 'filename', filename)
    participants_df = participants_df.append(file)

# Clean and Transform Data

## Keeper Informationen

In [418]:
# Build the keeper/team lookup from both raw tables.
# Columns 0 and 2 are selected by position (position 1 holds the inserted 'filename'
# column); note that the two sources carry keeper and team in opposite order.
# NOTE(review): the first row (participants) / first two rows (skills) are skipped -
# presumably extra header rows; confirm against the raw exports.
keeper_df = participants_df.iloc[1:, [0, 2]].rename(columns={'Unnamed: 0': 'keeper', 'Unnamed: 1': 'team'})
keeper_df_skills = skills_df.iloc[2:, [0, 2]].rename(columns={'Unnamed: 0': 'team', 'Unnamed: 1': 'keeper'})

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
keeper_df = pd.concat([keeper_df, keeper_df_skills])

# Exact match only: a value equal to '1. Mannschaft' is relabelled 'U21'.
keeper_df['team'] = keeper_df.team.apply(lambda x: 'U21' if fnmatch.fnmatch(x, '1. Mannschaft') else x)

# Cut the team label at ' BSC' and then at ' YB'. `n` must be passed by keyword
# (positional use is rejected by pandas 2.0). The second split overwrites 'club',
# which is only a helper column and is dropped below.
keeper_df[['team', 'club']] = keeper_df.team.str.split(' BSC', n=1, expand=True)
keeper_df[['team', 'club']] = keeper_df.team.str.split(' YB', n=1, expand=True)

# Exclude these three keepers from the lookup.
keeper_df = keeper_df[~keeper_df.keeper.isin(['David von Ballmoos', 'Guillaume Faivre', 'Leandro Zbinden'])]

# One row per (keeper, team) pair: group, discard the count column, restore the keys as columns.
keeper_df = keeper_df.groupby(['keeper', 'team']).count().drop(columns=['club']).reset_index()
keeper_df

Unnamed: 0,keeper,team
0,Abdullah Laidani,1. Mannschaft
1,Ardian Bajrami,U18
2,Badu Jones,FE-13
3,Bastian Gasche,FE-14
4,Cedrik Strupler,U16
5,Denis Martinovic,U17
6,Elia Pietropaolo,U15
7,Elio Castro,U18
8,Evan Hernandez,FE-12
9,Jamie Gretener,FE-12


## Trainingsthemen / Training Skills

In [419]:
### Properly name columns and reshape the wide table to long format
# NOTE: skills_df is reassigned here, so re-running this cell drops one more column.
skills_df = skills_df.drop(skills_df.columns[[3]], axis=1)
skills_df = skills_df.rename(columns={'Unnamed: 0': 'team', 'Unnamed: 1': 'keeper'})
unpivot_df = pd.melt(skills_df, id_vars=['filename', 'team', 'keeper'], var_name='skills', value_name='value')

### Set numeric values and label each value column for the pivot
trainings = unpivot_df  # alias, not a copy: the edits below are visible through unpivot_df too
trainings['value'] = pd.to_numeric(trainings.value, errors='coerce')

# A column whose name ends in '.1' holds the number of units; every other column
# holds the duration. Only the trailing '.1' is stripped, so skill names that
# contain dots elsewhere stay intact.
is_units = trainings.skills.str.endswith('.1')
trainings['skills'] = trainings.skills.where(~is_units, trainings.skills.str[:-2])
trainings['entity'] = is_units.map({True: 'units', False: 'duration'})

### Pivot to one row per file/team/keeper/skill with a 'duration' and a 'units' column
trainings = trainings.pivot(index=['filename', 'team', 'keeper', 'skills'], columns='entity', values='value').reset_index()
# Keep only rows where both measures are present; copy so the assignments below
# operate on an independent frame.
trainings = trainings.dropna(subset=['duration', 'units']).copy()

### Date cleansing
# The last six characters of the file name without its extension are the YYYYMM period.
trainings['yearmonth'] = trainings.filename.map(lambda path: os.path.splitext(path)[0][-6:])
# Periods that do not parse as %Y%m become NaT.
trainings['date'] = pd.to_datetime(trainings.yearmonth, format='%Y%m', errors='coerce')
trainings['year'] = trainings.date.dt.year
trainings['month'] = trainings.date.dt.month_name()

# Final column selection and order; filename and team are dropped here.
trainings = trainings.reindex(columns=['keeper', 'date', 'year', 'yearmonth', 'month', 'skills', 'units', 'duration'])
# Remove the 'entity' axis label left over from the pivot.
trainings = trainings.rename_axis(None, axis=1)

### Store training units as int (safe: rows with missing units were dropped above)
trainings['units'] = trainings.units.astype(int)

## Trainingsbesuche / Training Participation

In [420]:
# Reshape the raw participation export to long format and attach the cleaned team label.
# Built as one chain without in-place mutation; the resulting columns are unchanged.
participants_df = (
    participants_df
    .rename(columns={'Unnamed: 0': 'keeper', 'Unnamed: 1': 'team', 'Absence Reason ': 'training_group', 'Unnamed: 3': 'anwesend'})
    .drop(columns=['filename'])
    # Skip the first row by position, like the keeper cell does; a label slice
    # (.loc[1:]) breaks as soon as the index is not unique.
    .iloc[1:]
    # Every remaining column becomes a 'grund' (reason) with its count in 'participating'.
    .melt(id_vars=['keeper', 'team', 'training_group'], var_name='grund', value_name='participating')
    .dropna(subset=['participating'])
    # Both frames have a 'team' column, so the merge yields team_x (raw) and team_y (cleaned).
    # NOTE(review): a keeper listed under several teams in keeper_df multiplies rows here.
    .merge(keeper_df, on='keeper')
    .drop(columns=['team_x'])
    # Non-numeric counts become NaN.
    .assign(participating=lambda frame: pd.to_numeric(frame['participating'], errors='coerce'))
)

In [421]:
# Preview the first rows of the participation table.
participants_df.head()

Unnamed: 0,keeper,training_group,grund,participating,team_y
0,Abdullah Laidani,,anwesend,17.0,1. Mannschaft
1,Abdullah Laidani,"U21 BSC Young Boys, U21",anwesend,49.0,1. Mannschaft
2,Abdullah Laidani,1. Mannschaft BSC Young Boys,anwesend,1.0,1. Mannschaft
3,Abdullah Laidani,"U21 BSC Young Boys, U21",Frei/Regeneration,3.0,1. Mannschaft
4,Abdullah Laidani,"U21 BSC Young Boys, U21",Krank,1.0,1. Mannschaft


In [386]:

# NOTE(review): second preview of the same table. Its saved output (execution count 386)
# still lists team_x and team columns, which the participation cell as written does not
# produce - the output is stale and this cell is a candidate for deletion.
participants_df.head()

Unnamed: 0,keeper,team_x,training_group,grund,participating,team_y,team
0,Abdullah Laidani,1. Mannschaft BSC Young Boys,,anwesend,17.0,1. Mannschaft,1. Mannschaft
1,Abdullah Laidani,1. Mannschaft BSC Young Boys,"U21 BSC Young Boys, U21",anwesend,49.0,1. Mannschaft,1. Mannschaft
2,Abdullah Laidani,1. Mannschaft BSC Young Boys,1. Mannschaft BSC Young Boys,anwesend,1.0,1. Mannschaft,1. Mannschaft
3,Abdullah Laidani,1. Mannschaft BSC Young Boys,"U21 BSC Young Boys, U21",Frei/Regeneration,3.0,1. Mannschaft,1. Mannschaft
4,Abdullah Laidani,1. Mannschaft BSC Young Boys,"U21 BSC Young Boys, U21",Krank,1.0,1. Mannschaft,1. Mannschaft


# Export Data to CSV


In [361]:
# Write each cleaned table to its CSV target, without the index column.
csv_exports = (
    (keeper_df, '/Users/matthiashugli/Dropbox/⚽️ YB Nachwuchs/yb_keepers.csv'),
    (trainings, '/Users/matthiashugli/Dropbox/⚽️ YB Nachwuchs/training_skills.csv'),
    (participants_df, '/Users/matthiashugli/Dropbox/⚽️ YB Nachwuchs/training_anwesenheiten.csv'),
)
for frame, target_path in csv_exports:
    frame.to_csv(target_path, index=False)

## Export Cleansed DataFrame to AWS S3

In [114]:
# trainings_export is an alias of trainings (same object, not a copy).
trainings_export = trainings
# Written to a relative path, without the index column.
trainings_export.to_csv('cleansing_trainings.csv', index=False)
# Disabled S3 upload, kept for reference; it needs `s3` and `bucket_name`,
# which are only defined in the S3 cell further down.
#trainings_export.to_csv(csv_buffer, index=False)
#s3.Object(bucket_name.name, 'cleansing_trainings.csv').put(Body=csv_buffer.getvalue())
#print(s3.Object(bucket_name.name, 'cleansing_trainings.csv').put(Body=csv_buffer.getvalue()))

In [138]:
### Outdated AWS S3 not in use anymore
!ln -s /Users/matthiashugli/Virtualenvs/youth-base/youth-base/config.py config.py
from config import s3

### Monthly data is stored to subfolder in S3 Bucket, read and store filenames
bucket_name = s3.Bucket('training-minutes')

bucket_list = []
for file in bucket_name.objects.filter(Prefix = 'rawdata/'):
    file_key = file.key
    if file_key.find('.csv') != -1:
        bucket_list.append(file.key)
print(len(bucket_list))

3


In [46]:
### Read all files in the bucket and return them as one DataFrame
# Requires `s3`, `bucket_name` and `bucket_list` from the S3 listing cell.
frames = []
for file_name in bucket_list:
    obj = s3.Object(bucket_name.name, file_name)
    data = obj.get()['Body'].read()
    # Same parsing as for the local files: the second line is the header row.
    file = pd.read_csv(io.BytesIO(data), header=1, delimiter=',', low_memory=False)
    file.insert(0, 'filename', file_name)
    frames.append(file)

# Concatenate once (DataFrame.append was removed in pandas 2.0);
# pd.concat rejects an empty list, so that case is handled explicitly.
df = pd.concat(frames) if frames else pd.DataFrame()

### Save DataFrame to S3 as CSV (disabled)
#df.to_csv(csv_buffer)
#s3.Object(bucket_name.name, 'staging_trainings.csv').put(Body=csv_buffer.getvalue())
#s3.upload_file

NameError: name 'bucket_list' is not defined