# Dataset Load
### Code used to load downloaded files from http://www.football-data.co.uk/ and merge them into one dataset

In [1]:
# Support both Python 2 and Python 3 environments
from __future__ import division, print_function, unicode_literals

# Commonly used modules
import numpy as np
import os

# Seed NumPy's global RNG so results are repeatable across runs
np.random.seed(42)

# Plot setup: render figures inline and enlarge axis/tick label fonts
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Location where figures will be saved: <PROJECT_ROOT_DIR>/pictures/<CHAPTER_ID>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "preparing_dataset"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "pictures", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    The file is written to ``IMAGES_PATH/<fig_id>.<fig_extension>``. The
    target directory is created first if it does not exist, so the function
    also works on a fresh checkout.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension).
    tight_layout : bool, default True
        When true, call ``plt.tight_layout()`` before saving.
    fig_extension : str, default "png"
        File extension; also passed to ``savefig`` as the output format.
    resolution : int, default 300
        Passed to ``savefig`` as ``dpi``.
    """
    # savefig does not create missing directories; without this the first
    # call fails with FileNotFoundError when the pictures folder is absent.
    # (isdir + makedirs keeps the Python 2 compatibility this notebook targets.)
    if not os.path.isdir(IMAGES_PATH):
        os.makedirs(IMAGES_PATH)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    # Progress message (Polish: "Saving figure"); runtime text, left as-is.
    print("Zapisywanie rysunku", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
# Per-country directories that hold the original downloaded CSV files
# (GE = germany, IT = italy, FR = france, EN = england, SP = spain).
# Each constant is bound together with a lowercase alias of the same value;
# the lowercase names are rebound to the merged-output directories further
# down in the notebook, while the uppercase constants stay unchanged.
FOOTBALL_PATH_GE = football_path_ge = os.path.join("datasets/germany", "original")
FOOTBALL_PATH_IT = football_path_it = os.path.join("datasets/italy", "original")
FOOTBALL_PATH_FR = football_path_fr = os.path.join("datasets/france", "original")
FOOTBALL_PATH_EN = football_path_en = os.path.join("datasets/england", "original")
FOOTBALL_PATH_SP = football_path_sp = os.path.join("datasets/spain", "original")

## Loading the files

In [4]:
import os
import pandas as pd

def load_football_data(football_path, file):
    """Read one CSV file into a DataFrame, skipping malformed rows.

    Parameters
    ----------
    football_path : str
        Directory that contains the file.
    file : str
        File name inside ``football_path``.

    Returns
    -------
    pandas.DataFrame
        Parsed contents of the file. Rows with too many fields are dropped
        (with a warning) instead of aborting the whole load.
    """
    csv_path = os.path.join(football_path, file)
    try:
        # pandas >= 1.3: ``error_bad_lines`` was deprecated and later removed
        # (2.0). "warn" matches the old error_bad_lines=False behaviour with
        # its default warn_bad_lines=True: skip the row, emit a warning.
        return pd.read_csv(csv_path, on_bad_lines="warn")
    except TypeError:
        # Older pandas does not know ``on_bad_lines``; use the legacy keyword.
        return pd.read_csv(csv_path, error_bad_lines=False)

In [6]:
# Load a single file from the German "original" directory into `football`
# (presumably the 2019/20 season, judging by the file name — verify).
football = load_football_data(FOOTBALL_PATH_GE, "D1 (19_20).csv")

In [7]:
# Preview the first rows of the freshly loaded frame
football.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,D1,16/08/2019,19:30,Bayern Munich,Hertha,2,2,D,1,2,...,3.4,-2.25,2.03,1.9,1.99,1.93,2.04,1.93,1.98,1.91
1,D1,17/08/2019,14:30,Dortmund,Augsburg,5,1,H,1,1,...,3.31,-2.25,1.92,2.01,1.92,2.0,1.98,2.04,1.91,1.97
2,D1,17/08/2019,14:30,Freiburg,Mainz,3,0,H,0,0,...,1.89,0.0,1.92,2.01,1.94,1.97,1.97,2.06,1.9,1.99
3,D1,17/08/2019,14:30,Leverkusen,Paderborn,3,2,H,2,2,...,3.58,-2.0,2.07,1.86,2.05,1.86,2.15,1.91,2.03,1.85
4,D1,17/08/2019,14:30,Werder Bremen,Fortuna Dusseldorf,1,3,A,0,1,...,2.26,-0.75,1.92,2.01,1.92,2.0,1.95,2.11,1.89,2.0


## Creating data sets

###### Dropping unnecessary or redundant columns

In [8]:
# Columns removed before merging. Labels keep the original order and are
# grouped by naming pattern only; see the data provider's notes for their
# meaning. errors='ignore' lets the same list be applied to files that lack
# some of these columns.
redundant_columns = [
    # general match columns
    'Div', 'Time', 'HHW', 'AHW', 'HC', 'AC', 'HFKC', 'AFKC', 'HO', 'AO',
    # <prefix>H / <prefix>D / <prefix>A triplets
    'BSH', 'BSD', 'BSA', 'GBH', 'GBD', 'GBA', 'IWH', 'IWD', 'IWA',
    'LBH', 'LBD', 'LBA', 'PSH', 'PSD', 'PSA', 'SOH', 'SOD', 'SOA',
    'SBH', 'SBD', 'SBA', 'SJH', 'SJD', 'SJA', 'SYH', 'SYD', 'SYA',
    'VCH', 'VCD', 'VCA', 'WHH', 'WHD', 'WHA',
    # Bb* / Max* / Avg* H-D-A summaries
    'Bb1X2', 'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD', 'BbMxA', 'BbAvA',
    'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA',
    # ">2.5" / "<2.5" columns
    'BbOU', 'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5',
    'GB>2.5', 'GB<2.5', 'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5',
    'Max>2.5', 'Max<2.5', 'Avg>2.5', 'Avg<2.5',
    # "AH" columns
    'BbAH', 'BbAHh', 'AHh', 'GBAHH', 'GBAHA', 'GBAH',
    'LBAHH', 'LBAHA', 'LBAH', 'B365AHH', 'B365AHA', 'B365AH',
    'PAHH', 'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA',
    # "C"-infixed H-D-A triplets
    # NOTE(review): 'B365CH' is listed twice in the original; the repeat is
    # harmless for drop() but may be a typo for another label — confirm.
    'B365CH', 'B365CH', 'B365CD', 'B365CA', 'BWCH', 'BWCD', 'BWCA',
    'IWCH', 'IWCD', 'IWCA', 'PSCH', 'PSCD', 'PSCA',
    'WHCH', 'WHCD', 'WHCA', 'VCCH', 'VCCD', 'VCCA',
    'MaxCH', 'MaxCD', 'MaxCA', 'AvgCH', 'AvgCD', 'AvgCA',
    # "C"-infixed ">2.5" / "<2.5" columns
    'B365C>2.5', 'B365C<2.5', 'PC>2.5', 'PC<2.5',
    'MaxC>2.5', 'MaxC<2.5', 'AvgC>2.5', 'AvgC<2.5',
    # "C"-infixed "AH" columns
    'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA',
    'MaxCAHH', 'MaxCAHA', 'AvgCAHH', 'AvgCAHA',
]
football = football.drop(columns=redundant_columns, errors='ignore')

###### Appending to the merged file

In [9]:
# Rebind the lowercase path variables from the ".../original" folders to the
# per-country parent directories, where the merged CSV files are written.
# (os.path.join with a single argument returns it unchanged, so plain
# literals are equivalent.)
football_path_ge = "datasets/germany"
football_path_it = "datasets/italy"
football_path_fr = "datasets/france"
football_path_en = "datasets/england"
football_path_sp = "datasets/spain"

In [12]:
# Append the rows currently held in `football` to the merged German file.
csv_path = os.path.join(football_path_ge, "germany.csv")

# Write the header row only when the target is missing or empty, so a freshly
# created merged file gets column names and later appends do not repeat them.
write_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0

# NOTE: mode='a' means re-running this cell appends the same rows again, and
# the DataFrame index is written as the first column (to_csv default).
football.to_csv(csv_path, mode='a', header=write_header)