# World Cup Final Project – Data Collection Notebook

In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Data Parsing & Cleaning

In [90]:
# Read data
# The CSV stores its former index as an 'Unnamed: 0' column; discard it on load.
df_fifa = pd.read_csv('nb0a_match_data.csv').drop(columns=['Unnamed: 0'])
df_fifa.head()

Unnamed: 0,date,Home,Home Confed,Away,Away Confed,FT H,FT G,AET H,AET G,AP H,...,PrevDiffHome6,PrevDiffAway6,PrevDiffHome7,PrevDiffAway7,PrevDiffHome8,PrevDiffAway8,PrevDiffHome9,PrevDiffAway9,PrevDiffHome10,PrevDiffAway10
0,03.02.1900,Scotland,UEFA,Wales,UEFA,5,2,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
1,24.02.1900,Wales,UEFA,Ireland,UEFA,2,0,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
2,03.03.1900,Ireland,UEFA,Scotland,UEFA,0,3,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
3,17.03.1900,Ireland,UEFA,England,UEFA,0,2,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
4,26.03.1900,Wales,UEFA,England,UEFA,1,1,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [27]:
# Data Structure
# One row per game, one column per recorded attribute.
games = df_fifa.shape[0]
columns = df_fifa.shape[1]
print("We have {} games in the dataset and {} columns of information.".format(games, columns))

print("\nThe Columns names are:\n", df_fifa.columns.values)

We have 39890 games in the dataset and 42 columns of information.

The Columns names are:
 ['date' 'Home' 'Home Confed' 'Away' 'Away Confed' 'FT H' 'FT G' 'AET H'
 'AET G' 'AP H' 'AP G' 'ET' 'PEN' 'Type' 'Town' 'Land' 'neutral venue'
 'HomeAdv' 'Win' 'day' 'month' 'year' 'PrevDiffHome1' 'PrevDiffAway1'
 'PrevDiffHome2' 'PrevDiffAway2' 'PrevDiffHome3' 'PrevDiffAway3'
 'PrevDiffHome4' 'PrevDiffAway4' 'PrevDiffHome5' 'PrevDiffAway5'
 'PrevDiffHome6' 'PrevDiffAway6' 'PrevDiffHome7' 'PrevDiffAway7'
 'PrevDiffHome8' 'PrevDiffAway8' 'PrevDiffHome9' 'PrevDiffAway9'
 'PrevDiffHome10' 'PrevDiffAway10']


In [28]:
# Clean Dates
# 'date' holds 'day.month.year' strings. Split the whole column once instead of
# splitting every row three times in a Python loop; the result stays aligned
# with df_fifa's index, so it no longer depends on a default 0..n-1 index.
date_parts = df_fifa['date'].str.split('.', expand=True)

df_fifa['day'] = pd.to_numeric(date_parts[0])
df_fifa['month'] = pd.to_numeric(date_parts[1])
df_fifa['year'] = pd.to_numeric(date_parts[2])

In [29]:
# Look at Data Types
# Displays the dtype pandas holds for each column of df_fifa.
df_fifa.dtypes

date               object
Home               object
Home Confed        object
Away               object
Away Confed        object
FT H                int64
FT G                int64
AET H             float64
AET G             float64
AP H              float64
AP G              float64
ET                   bool
PEN                  bool
Type               object
Town               object
Land               object
neutral venue      object
HomeAdv           float64
Win               float64
day                 int64
month               int64
year                int64
PrevDiffHome1     float64
PrevDiffAway1     float64
PrevDiffHome2     float64
PrevDiffAway2     float64
PrevDiffHome3     float64
PrevDiffAway3     float64
PrevDiffHome4     float64
PrevDiffAway4     float64
PrevDiffHome5     float64
PrevDiffAway5     float64
PrevDiffHome6     float64
PrevDiffAway6     float64
PrevDiffHome7     float64
PrevDiffAway7     float64
PrevDiffHome8     float64
PrevDiffAway8     float64
PrevDiffHome

## Restricting Data to 2006

Justification: Only one player from the 2002 World Cup played in the 2018 World Cup, so the squads that competed in the 2002 World Cup are completely different from those that competed in 2018.
Additionally, FIFA Ranking information is only available from 2006 onward, and since that is one of the inputs we plan on testing for the model, it makes sense for our game data to line up with our FIFA Ranking data.

However, to compute each team's previous results and locations for the games from 2006 onward, we keep games going back to 2003 (the filter below is `year > 2002`).

In [30]:
# Restrict Dataset: keep every game played after 2002 (i.e. 2003 onward)
df_fifa = df_fifa.loc[df_fifa['year'] > 2002]

games = len(df_fifa)
print("Now we have {} games since {} in the dataset.".format(games, df_fifa['year'].min()))

Now we have 14654 games since 2003 in the dataset.


In [31]:
# Re-Index
# Renumber rows 0..n-1 and discard the old labels in a single step.
df_fifa = df_fifa.reset_index(drop=True)

In [32]:
# Fixes

# Game Type
# Assign the result back to the column: calling replace(..., inplace=True) on
# `df_fifa.Type` is a chained in-place update, which does not reach the frame
# when pandas Copy-on-Write is active.
df_fifa['Type'] = df_fifa['Type'].replace({'ANC': 'Cont', 'AC-Q': 'ContQ'})

# Location
# Misspellings, encoding damage and naming variants -> canonical name.
# Each pair is applied to every column of the frame, one pair at a time and in
# this order, exactly as the individual replace calls did.
location_fixes = [
    ('Lithania', 'Lithuania'),
    ('Côte d Ivoire', 'Ivory Coast'),
    ('Burma', 'Myanmar'),
    ('São Tomé and Príncipe', 'Sao Tome and Principe'),
    ('Bassaterre', 'Basseterre'),
    ('Viry Chatillon', 'Paris'),
    ('Viry Chantillon', 'Paris'),
    ('Saint-Leu-La-Fort', 'Paris'),
    ('Netherlands Antilles', 'Curacao'),
    ('Ajacco', 'Ajaccio'),
    ('Ar Rifa', 'Riffa'),
    ('Barbados (nn)', 'Bridgetown'),
    ('Foxboro / Boston', 'Boston'),
    ('La CoruIa', 'La Coruna'),
    ('Lauderhil', 'Lauderhill'),
    ('Leon (ESP)', 'Leon'),
    ('Valencia (esp)', 'Valencia'),
    ('San Jose (crc)', 'San Jose'),
    ('Ljubliana', 'Ljubljana'),
    ('FYR of Macedonia', 'Macedonia'),
    ('NeuchIatel', 'Neuchatel'),
    ('Newcastle (Aus)', 'Newcastle'),
    ('Newcastle (Eng)', 'Newcastle'),
    ('Salvador da Bahia', 'Salvador'),
    ('San Luis Potos', 'San Luis Potosi'),
    ('Santiago de Querutaro', 'Queretaro'),
    ('Tuxtla Gutiurrez', 'Tuxtla Gutierrez'),
    ('WYrzburg', 'Wurzburg'),
    ('DÃ¼sseldorf', 'Dusseldorf'),
    ('TaQali', 'Valletta'),
    ("Ta'Qali", 'Valletta'),
    ('Konu', 'Kone'),
    ('"\tAl Rayyan"', 'Al Rayyan'),
    ('Santa Ana (gua)', 'Santa Ana'),
]

for wrong_name, fixed_name in location_fixes:
    df_fifa = df_fifa.replace(wrong_name, fixed_name)

# Location by Index
# Rows are selected with boolean masks and written through .loc.
# .at addresses exactly one scalar cell; handing it an Index of row labels is
# rejected by current pandas, so .loc is the accessor for multi-row writes.

# Saint-Denis (France) is recorded as Paris
df_fifa.loc[(df_fifa.Town == 'Saint-Denis') & (df_fifa.Land == 'France'), 'Town'] = 'Paris'

# Serbia and Montenegro is recorded as Serbia, on either side of the fixture
df_fifa.loc[df_fifa.Home == 'Serbia and Montenegro', 'Home'] = 'Serbia'
df_fifa.loc[df_fifa.Away == 'Serbia and Montenegro', 'Away'] = 'Serbia'

# Basseterre rows whose country is Guadeloupe are renamed to Basse-Terre
df_fifa.loc[(df_fifa.Town == 'Basseterre') & (df_fifa.Land == 'Guadeloupe'), 'Town'] = 'Basse-Terre'

# Harrison (USA) is recorded as New York City
df_fifa.loc[(df_fifa.Town == 'Harrison') & (df_fifa.Land == 'USA'), 'Town'] = 'New York City'

# Country overrides, keyed by town name
land_by_town = {
    'Kariavattom': 'India',
    'Belize City': 'Belize',
    'Estepona': 'Spain',
    'Masvingo': 'Zimbabwe',
    'Fuzhou': 'China',
    'MayagYez': 'Puerto Rico',
    # NOTE(review): Ebebiyin is usually listed under Equatorial Guinea -- confirm 'Gabon' is intended
    'Ebebiyn': 'Gabon',
    'Felcsút': 'Hungary',
    'Gurgaon': 'India',
    'Kartepe': 'Turkey',
    'Kone': 'New Caledonia',
    'Lobatse': 'Botswana',
    'Sharjah': 'United Arab Emirates',
}

for town_name, land_name in land_by_town.items():
    df_fifa.loc[df_fifa.Town == town_name, 'Land'] = land_name

# The MayagYez rows additionally get their town replaced by 'Puerto Rico'
# (done last, because the country override above matches on the old town name)
df_fifa.loc[df_fifa.Town == 'MayagYez', 'Town'] = 'Puerto Rico'

In [33]:
# Preview the first ten rows of the filtered, re-indexed and corrected frame
df_fifa.head(n=10)

Unnamed: 0,date,Home,Home Confed,Away,Away Confed,FT H,FT G,AET H,AET G,AP H,...,PrevDiffHome6,PrevDiffAway6,PrevDiffHome7,PrevDiffAway7,PrevDiffHome8,PrevDiffAway8,PrevDiffHome9,PrevDiffAway9,PrevDiffHome10,PrevDiffAway10
0,10.01.2003,Sri Lanka,AFC,Afghanistan,AFC,1,0,,,,...,1.0,-1.0,1.0,0.0,0.0,-3.0,0.0,-2.0,2.0,0.0
1,10.01.2003,India,AFC,Pakistan,AFC,0,1,,,,...,0.0,-6.0,-1.0,-7.0,0.0,0.0,1.0,-3.0,-3.0,-6.0
2,11.01.2003,Maldives,AFC,Bhutan,AFC,6,0,,,,...,-9.0,-1.0,0.0,-3.0,-5.0,-7.0,6.0,-4.0,-2.0,-3.0
3,11.01.2003,Bangladesh,AFC,Nepal,AFC,1,0,,,,...,0.0,-6.0,-2.0,-5.0,-1.0,-5.0,-1.0,3.0,0.0,-3.0
4,12.01.2003,Barbados,CONCACAF,Jamaica,CONCACAF,1,0,,,,...,0.0,0.0,-1.0,0.0,-2.0,3.0,-5.0,2.0,3.0,1.0
5,12.01.2003,Afghanistan,AFC,India,AFC,0,4,,,,...,-3.0,1.0,-1.0,0.0,0.0,-1.0,-3.0,0.0,-2.0,1.0
6,12.01.2003,Pakistan,AFC,Sri Lanka,AFC,2,1,,,,...,-2.0,1.0,-6.0,1.0,-7.0,1.0,0.0,0.0,-3.0,0.0
7,13.01.2003,Nepal,AFC,Bhutan,AFC,2,0,,,,...,-8.0,-3.0,-6.0,-1.0,-5.0,-3.0,-5.0,-7.0,3.0,-4.0
8,13.01.2003,Bangladesh,AFC,Maldives,AFC,1,0,,,,...,-3.0,-1.0,0.0,-9.0,-2.0,0.0,-1.0,-5.0,-1.0,6.0
9,14.01.2003,Afghanistan,AFC,Pakistan,AFC,0,1,,,,...,-8.0,0.0,-3.0,-2.0,-1.0,-6.0,0.0,-7.0,-3.0,0.0


# Remove Non-FIFA Members

In [91]:
# Read data
# Rankings table: discard the stored index column and give the rest short names.
df_wiggo = pd.read_csv('nb0b_wiggo_rankings.csv').drop(columns=['Unnamed: 0'])
df_wiggo.columns = ['Date', 'Rank', 'Rating', 'Team']

# Spelling fixes, applied to every column of the rankings table
for wrong_name, fixed_name in (('Lithania', 'Lithuania'), ('FYR of Macedonia', 'Macedonia')):
    df_wiggo = df_wiggo.replace(wrong_name, fixed_name)

In [35]:
# Identify FIFA Members
# Every team that appears in the rankings table counts as a member.
fifa_teams = df_wiggo.Team.unique()

# Identify Non-FIFA Members
# Any team seen in a match (home or away) that is absent from the rankings.
# pd.concat stacks the two columns; Series.append was removed in pandas 2.0.
match_teams = pd.concat([df_fifa.Home, df_fifa.Away]).unique()
non_fifa = [team for team in match_teams if team not in fifa_teams]

print(non_fifa)

['Guadeloupe', 'Martinique', 'Reunion', 'Sint Maarten', 'Sansibar', 'French Guiana', 'Northern Mariana Islands', 'Tuvalu', 'Saint-Martin', 'Mayotte', 'Kiribati', 'St. Pierre and Miquelon', 'Bonaire']


In [36]:
# Flag games with at least one non-FIFA member.
# isin() checks the whole column at once instead of looking rows up one by one.
involves_non_fifa = df_fifa.Home.isin(non_fifa) | df_fifa.Away.isin(non_fifa)

# Row positions of the flagged games (as a plain list of ints)
non_fifa_index = np.flatnonzero(involves_non_fifa.values).tolist()

print("We removed {} games with at least one non-FIFA member.".format(len(non_fifa_index)))

We removed 342 games with at least one non-FIFA member.


In [37]:
# Remove Games with at least one non-FIFA member
# non_fifa_index holds row positions; translate them to labels, drop, renumber.
labels_to_drop = df_fifa.index[non_fifa_index]
df_fifa = df_fifa.drop(labels_to_drop).reset_index(drop=True)
games = len(df_fifa)

In [38]:
# Check to see there are no non-FIFA members
# `value in series` tests the Series' index labels, not its values, so the
# membership test has to run against the values themselves.
remaining_home = set(df_fifa.Home)
remaining_away = set(df_fifa.Away)

for team in non_fifa:
    print(team in remaining_home, team in remaining_away)

False False
False False
False False
False False
False False
False False
False False
False False
False False
False False
False False
False False
False False


# Get Geographic Coordinates

In [39]:
import time

from geopy.geocoders import Nominatim
from geopy.distance import geodesic, great_circle

# Nominatim must be given an application-specific user agent; geopy 2.x raises
# a ConfigurationError when it is constructed with the library default.
geolocator = Nominatim(user_agent='world-cup-final-project-data-collection')

In [40]:
# Placeholder columns for the game location.
# 0.0 and the `tuple` type itself serve as "not filled in yet" markers.
for axis_col in ('latitude', 'longitude'):
    df_fifa[axis_col] = [0.] * games
df_fifa['coordinates'] = [tuple for _ in range(games)]
df_fifa.head()

Unnamed: 0,date,Home,Home Confed,Away,Away Confed,FT H,FT G,AET H,AET G,AP H,...,PrevDiffAway7,PrevDiffHome8,PrevDiffAway8,PrevDiffHome9,PrevDiffAway9,PrevDiffHome10,PrevDiffAway10,latitude,longitude,coordinates
0,10.01.2003,Sri Lanka,AFC,Afghanistan,AFC,1,0,,,,...,0.0,0.0,-3.0,0.0,-2.0,2.0,0.0,0.0,0.0,<class 'tuple'>
1,10.01.2003,India,AFC,Pakistan,AFC,0,1,,,,...,-7.0,0.0,0.0,1.0,-3.0,-3.0,-6.0,0.0,0.0,<class 'tuple'>
2,11.01.2003,Maldives,AFC,Bhutan,AFC,6,0,,,,...,-3.0,-5.0,-7.0,6.0,-4.0,-2.0,-3.0,0.0,0.0,<class 'tuple'>
3,11.01.2003,Bangladesh,AFC,Nepal,AFC,1,0,,,,...,-5.0,-1.0,-5.0,-1.0,3.0,0.0,-3.0,0.0,0.0,<class 'tuple'>
4,12.01.2003,Barbados,CONCACAF,Jamaica,CONCACAF,1,0,,,,...,0.0,-2.0,3.0,-5.0,2.0,3.0,1.0,0.0,0.0,<class 'tuple'>


In [41]:
# Create Dataframe of locations for Unique places
# One row per (Town, Land) pair, with the number of games played there.
combos = df_fifa.groupby(['Town', 'Land']).size().reset_index(name='count')

# Coordinate placeholders: 0.0 / the `tuple` type mean "not looked up yet"
n_places = len(combos)
combos['lat'] = [0.0] * n_places
combos['lon'] = [0.0] * n_places
combos['coordinates'] = [tuple] * n_places

# Places whose town could not be geocoded get collected here
na_towns = []
combos.head()

Unnamed: 0,Town,Land,count,lat,lon,coordinates
0,6th October City,Egypt,3,0.0,0.0,<class 'tuple'>
1,Aachen,Germany,1,0.0,0.0,<class 'tuple'>
2,Aalborg,Denmark,3,0.0,0.0,<class 'tuple'>
3,Aarhus,Denmark,7,0.0,0.0,<class 'tuple'>
4,Abeokuta,Nigeria,1,0.0,0.0,<class 'tuple'>


In [44]:
## Find coordinates of every place
## Need to run this cell a few times because the servers time out

# For every unique place without coordinates
# (lat == lon == 0.0 marks "not looked up yet", so a re-run resumes where it stopped)
for place in combos.index[(combos.lat == 0.0) & (combos.lon == 0.0)]:

    town, land = combos.Town[place], combos.Land[place]

    # Find coordinates
    coord = geolocator.geocode(town + ', ' + land)

    # If city not found
    if not coord:

        # Use the default coordinates of the country instead
        coord = geolocator.geocode(land)

        # Add place to NA list (once, even if this cell is re-run)
        if [town, land] not in na_towns:
            na_towns.append([town, land])

    # Record Coordinates
    if coord:
        combos.at[place, 'lat'] = coord.latitude
        combos.at[place, 'lon'] = coord.longitude

    # geocode() returns None when nothing matches; leave the row unresolved
    # rather than failing on `None.latitude`
    else:
        print("No coordinates found for", town + ',', land)

    # Space https requests
    # NOTE(review): Nominatim's public usage policy asks for at most one
    # request per second -- confirm whether this pause should be longer.
    time.sleep(0.2)

    # Progress
    if place % 50 == 0:
        print(place, "out of", len(combos))

print("Done!")

Done!


In [45]:
# Turn lat & lon into tuple series
# Only rows still holding the `tuple` placeholder are filled in.
pending_rows = combos.index[combos.coordinates == tuple]
for row_label in pending_rows:
    lat_lon = (combos.lat[row_label], combos.lon[row_label])
    combos.at[row_label, 'coordinates'] = lat_lon

combos.head()

Unnamed: 0,Town,Land,count,lat,lon,coordinates
0,6th October City,Egypt,3,29.972346,30.940921,"(29.9723458, 30.9409205)"
1,Aachen,Germany,1,50.776351,6.083862,"(50.776351, 6.083862)"
2,Aalborg,Denmark,3,57.048221,9.919394,"(57.0482206, 9.9193939)"
3,Aarhus,Denmark,7,56.149628,10.213405,"(56.1496278, 10.2134046)"
4,Abeokuta,Nigeria,1,7.161,3.348,"(7.161, 3.348)"


In [46]:
# Check imputed coordinates so we can fix any that are wrongly matched
# na_towns lists the [Town, Land] pairs whose town lookup returned nothing and
# which were given the coordinates of a country-only lookup instead.
na_towns

[['Bellevue', 'Mauritius'],
 ['Blairmont', 'Dominica'],
 ['Dokha', 'Yemen'],
 ['Ebebiyn', 'Gabon'],
 ['Gabs', 'Tunisia'],
 ['Gyiar', 'Hungary'],
 ['Limbu', 'Cameroon'],
 ['Longjumeau', 'New Caledonia'],
 ['Mahu', 'Seychelles'],
 ['Marignane', 'Mali'],
 ['Moquegua', 'Peru'],
 ['NorrkIping', 'Sweden'],
 ['North Sound', 'Antigua and Barbuda'],
 ['Obidos', 'Cape Verde'],
 ['Ploesti', 'Romania'],
 ['Sabanalarga', 'Dominican Republic'],
 ['Saint-Andru', 'Reunion'],
 ['Salu', 'Morocco'],
 ['Samut Sakhon', 'Cambodia'],
 ['Senlis', 'Burkina Faso'],
 ['Victoria (sey)', 'Seychelles'],
 ['Visu', 'Belgium'],
 ['Voru', 'Latvia'],
 ['Yacuba', 'Bolivia'],
 ['Zawyan', 'Libya'],
 ['Zouurate', 'Mauritania']]

We manually checked that all these cities were indeed in their respective countries. 

In [47]:
# Check for duplicates
# Rows whose town name already appeared earlier in the table.
repeated_town = combos['Town'].duplicated()
combos.loc[repeated_town]
# This means we can't look up places just by city name

Unnamed: 0,Town,Land,count,lat,lon,coordinates
803,Newcastle,England,2,54.977092,-1.614206,"(54.9770924, -1.6142059)"
989,Saida,Lebanon,2,33.561423,35.37661,"(33.5614232, 35.3766095)"
1015,San Cristobal,Venezuela,26,7.775666,-72.221415,"(7.7756663, -72.2214154)"
1025,San Juan,Puerto Rico,4,18.384239,-66.05344,"(18.38423905, -66.0534399736473)"
1174,Trujillo,Peru,1,-8.111763,-79.02867,"(-8.1117632, -79.0286702)"


In [48]:
# Save to CSV
# Writes the combos table (one row per Town/Land pair with its coordinates) to disk.
combos.to_csv('GameCoordinates_.csv')

# Define "Home Field"

In [49]:
# Group by where each country has played at home
# Count games per (home team, country, town) ...
venue_counts = df_fifa.groupby(['Home', 'Land', 'Town']).size().reset_index(name='count')

# ... and keep only venues located in the home team's own country
played_in_own_country = venue_counts.Home == venue_counts.Land
home_field = venue_counts[played_in_own_country].drop(columns=['Land'])
home_field.head()

Unnamed: 0,Home,Town,count
0,Afghanistan,Kabul,2
18,Albania,Durres,2
19,Albania,Elbasan,9
20,Albania,Korce,1
21,Albania,Shkoder,7


In [53]:
# Find each country's most common home venue
# For every team keep the row(s) whose game count equals that team's maximum;
# a tie keeps several rows for the team.
# A grouped transform replaces growing the frame with DataFrame.append inside a
# loop (removed in pandas 2.0, and quadratic in the number of teams).
team_max_count = home_field.groupby('Home')['count'].transform('max')
home_hq = home_field[home_field['count'] == team_max_count]

# Re-Index
home_hq = home_hq.reset_index(drop=True)

home_hq.head(n=15)

Unnamed: 0,Home,Town,count
0,Afghanistan,Kabul,2
1,Albania,Tirana,48
2,Algeria,Blida,33
3,Andorra,Andorra la Vella,44
4,Angola,Luanda,44
5,Anguilla,The Valley,5
6,Antigua and Barbuda,Saint Johns,13
7,Argentina,Buenos Aires,34
8,Armenia,Yerevan,53
9,Aruba,Oranjestad,11


In [54]:
# Find Teams who haven't played at home
# Print each ranked team that has no row in home_hq.
teams_with_home = home_hq.Home.unique()
for team in fifa_teams:
    if not (team in teams_with_home):
        print(team)

Somalia
Cook Islands
American Samoa
nan


In [55]:
# Add those teams' capitals manually
# count is 0 because no home game was recorded for them.
hl = pd.DataFrame([['American Samoa', 'Pago Pago', 0], ['Cook Islands', 'Avarua', 0], ['Somalia', 'Mogadishu', 0]],
                      columns=['Home', 'Town', 'count'])

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
home_hq = pd.concat([home_hq, hl], ignore_index=True)

In [56]:
# Eliminate Duplicates
# Keep only the first row listed for each team; later repeats are discarded.
already_listed = home_hq.Home.duplicated()
home_hq = home_hq[~already_listed]
len(home_hq)

211

In [57]:
# Add coordinates column
# 0.0 and the `tuple` type itself mark entries that have not been geocoded yet.
n_teams = len(home_hq)
for axis_col in ('lat', 'lon'):
    home_hq[axis_col] = [0.] * n_teams
home_hq['coordinates'] = [tuple for _ in range(n_teams)]
home_hq.head()

Unnamed: 0,Home,Town,count,lat,lon,coordinates
0,Afghanistan,Kabul,2,0.0,0.0,<class 'tuple'>
1,Albania,Tirana,48,0.0,0.0,<class 'tuple'>
2,Algeria,Blida,33,0.0,0.0,<class 'tuple'>
3,Andorra,Andorra la Vella,44,0.0,0.0,<class 'tuple'>
4,Angola,Luanda,44,0.0,0.0,<class 'tuple'>


In [61]:
## Find coordinates of home fields
## Need to run this cell a few times because the servers time out

# For every team whose coordinates still hold the `tuple` placeholder
for team in home_hq.index[home_hq.coordinates == tuple]:

    town, country = home_hq.Town[team], home_hq.Home[team]

    # Find coordinates
    c = geolocator.geocode(town + ', ' + country)

    if c:
        found = (c.latitude, c.longitude)

    else:
        # Fall back to the game-location table. Match on town AND country,
        # because town names alone are not unique in combos, and read scalar
        # values out of the matching row instead of assigning whole Series.
        venue = combos[(combos.Town == town) & (combos.Land == country)]
        if venue.empty:
            found = None
        else:
            found = (venue.lat.values[0], venue.lon.values[0])

    # Save coordinates (latitude and longitude each go to their own column)
    if found is not None:
        home_hq.at[team, 'lat'] = found[0]
        home_hq.at[team, 'lon'] = found[1]
        home_hq.at[team, 'coordinates'] = found

    # Nothing matched anywhere: keep the placeholder so a re-run retries it
    else:
        print("No coordinates found for", town + ',', country)

    # Space https requests
    time.sleep(.5)

    # Print Progress
    if team % 10 == 0:
        print(team, "out of", len(home_hq))

print("Done!")

80 out of 211
90 out of 211
100 out of 211
110 out of 211
120 out of 211
130 out of 211
140 out of 211
150 out of 211
160 out of 211
170 out of 211
180 out of 211
190 out of 211
200 out of 211
210 out of 211
Done!


In [62]:
# Save to CSV
# Writes the per-team home venue table, including its coordinates, to disk.
home_hq.to_csv('HomeFieldCoordinates_.csv')
home_hq.head()

Unnamed: 0,Home,Town,count,lat,lon,coordinates
0,Afghanistan,Kabul,2,34.526013,69.177648,"(34.5260131, 69.1776476)"
1,Albania,Tirana,48,41.327946,19.818532,"(41.3279457, 19.8185323)"
2,Algeria,Blida,33,36.470165,2.828798,"(36.4701645, 2.8287985)"
3,Andorra,Andorra la Vella,44,42.506939,1.521247,"(42.5069391, 1.5212467)"
4,Angola,Luanda,44,-8.82727,13.243951,"(-8.8272699, 13.2439512)"


## Calculate Distance from Home

In [63]:
# Distance columns start at 0.0 and are filled in by the distance loop
for dist_col in ('dist_home', 'dist_away'):
    df_fifa[dist_col] = [0.] * games

In [64]:
# Preview the frame with the newly added (still zero-filled) distance columns
df_fifa.head()

Unnamed: 0,date,Home,Home Confed,Away,Away Confed,FT H,FT G,AET H,AET G,AP H,...,PrevDiffAway8,PrevDiffHome9,PrevDiffAway9,PrevDiffHome10,PrevDiffAway10,latitude,longitude,coordinates,dist_home,dist_away
0,10.01.2003,Sri Lanka,AFC,Afghanistan,AFC,1,0,,,,...,-3.0,0.0,-2.0,2.0,0.0,0.0,0.0,<class 'tuple'>,0.0,0.0
1,10.01.2003,India,AFC,Pakistan,AFC,0,1,,,,...,0.0,1.0,-3.0,-3.0,-6.0,0.0,0.0,<class 'tuple'>,0.0,0.0
2,11.01.2003,Maldives,AFC,Bhutan,AFC,6,0,,,,...,-7.0,6.0,-4.0,-2.0,-3.0,0.0,0.0,<class 'tuple'>,0.0,0.0
3,11.01.2003,Bangladesh,AFC,Nepal,AFC,1,0,,,,...,-5.0,-1.0,3.0,0.0,-3.0,0.0,0.0,<class 'tuple'>,0.0,0.0
4,12.01.2003,Barbados,CONCACAF,Jamaica,CONCACAF,1,0,,,,...,3.0,-5.0,2.0,3.0,1.0,0.0,0.0,<class 'tuple'>,0.0,0.0


In [65]:
## Calculate distances from home

# Build both lookups once instead of filtering home_hq and combos for every game.
# team -> (lat, lon) of its home venue
home_base = dict(zip(home_hq.Home, home_hq.coordinates))
# (country, town) -> (lat, lon) of the game venue
venue_coords = {
    (land, town): (lat, lon)
    for land, town, lat, lon in zip(combos.Land, combos.Town, combos.lat, combos.lon)
}

# For every game whose coordinates still hold the `tuple` placeholder
for game in df_fifa.index[df_fifa.coordinates == tuple]:

    # Find Home Field Coordinates (plain tuples, not one-element Series)
    home_cord = home_base[df_fifa.Home[game]]
    away_cord = home_base[df_fifa.Away[game]]

    # Find Game Location Coordinates
    game_cord = venue_coords[(df_fifa.Land[game], df_fifa.Town[game])]

    # Save to FIFA DataFrame
    df_fifa.at[game, 'coordinates'] = game_cord
    df_fifa.at[game, 'latitude'] = game_cord[0]
    df_fifa.at[game, 'longitude'] = game_cord[1]

    # Calculate and Save Distances (km between each team's home venue and the game)
    df_fifa.at[game, 'dist_home'] = geodesic(home_cord, game_cord).km
    df_fifa.at[game, 'dist_away'] = geodesic(away_cord, game_cord).km

    # Print Progress
    if game % 1000 == 0:
        print(game, "out of", len(df_fifa))

print("Done!")

0 out of 14312
1000 out of 14312
2000 out of 14312
3000 out of 14312
4000 out of 14312
5000 out of 14312
6000 out of 14312
7000 out of 14312
8000 out of 14312
9000 out of 14312
10000 out of 14312
11000 out of 14312
12000 out of 14312
13000 out of 14312
14000 out of 14312
Done!


In [66]:
# Check all coordinates were recorded
# Counts the games still holding the `tuple` placeholder; 0 means none are left.
still_placeholder = df_fifa.coordinates == tuple
np.sum(still_placeholder)

0

# Get Time Zone Changes

In [67]:
# NOTE: the module imported here is immediately shadowed by the class import
# on the next line, so `datetime` refers to the class from then on.
import datetime
from datetime import datetime, timedelta
import pytz
from tzwhere import tzwhere
from dateutil import tz
from timezonefinder import TimezoneFinder

# Timezone lookup object used to map coordinates to a timezone name
tf = TimezoneFinder()
# Rebinds the name `tzwhere` from the imported module to an instance of its
# tzwhere class; the module itself is no longer reachable under this name.
tzwhere = tzwhere.tzwhere()

In [68]:
# Add timezone column
# 'UTC' starts at 0.0; 'datetime' starts with the datetime class as a placeholder.
df_fifa['UTC'] = [0.0 for _ in range(games)]
df_fifa['datetime'] = [datetime for _ in range(games)]
df_fifa.head()

Unnamed: 0,date,Home,Home Confed,Away,Away Confed,FT H,FT G,AET H,AET G,AP H,...,PrevDiffAway9,PrevDiffHome10,PrevDiffAway10,latitude,longitude,coordinates,dist_home,dist_away,UTC,datetime
0,10.01.2003,Sri Lanka,AFC,Afghanistan,AFC,1,0,,,,...,-2.0,2.0,0.0,23.759357,90.378814,"(23.7593572, 90.3788136)",2175.89627,2375.728229,0.0,<class 'datetime.datetime'>
1,10.01.2003,India,AFC,Pakistan,AFC,0,1,,,,...,-3.0,-3.0,-6.0,23.759357,90.378814,"(23.7593572, 90.3788136)",1421.55043,1799.185252,0.0,<class 'datetime.datetime'>
2,11.01.2003,Maldives,AFC,Bhutan,AFC,6,0,,,,...,-4.0,-2.0,-3.0,23.759357,90.378814,"(23.7593572, 90.3788136)",2824.24651,418.202817,0.0,<class 'datetime.datetime'>
3,11.01.2003,Bangladesh,AFC,Nepal,AFC,1,0,,,,...,3.0,0.0,-3.0,23.759357,90.378814,"(23.7593572, 90.3788136)",0.0,669.983196,0.0,<class 'datetime.datetime'>
4,12.01.2003,Barbados,CONCACAF,Jamaica,CONCACAF,1,0,,,,...,2.0,3.0,1.0,13.097783,-59.618418,"(13.0977832, -59.6184184)",0.0,1918.609665,0.0,<class 'datetime.datetime'>


In [69]:
## Get UTC offset for each game

# Many games share a venue, so each coordinate pair is resolved to a tz object once.
tz_by_location = {}
unresolved_games = 0

for game in df_fifa.index:

    lat, lng = df_fifa.latitude[game], df_fifa.longitude[game]

    if (lat, lng) not in tz_by_location:

        # Find timezone string
        timezone_str = tf.timezone_at(lng=lng, lat=lat)

        # Translate to tz object.
        # A missing name must not reach tz.gettz(): it interprets None as the
        # machine's local time zone and would give a silently wrong offset.
        tz_by_location[(lat, lng)] = tz.gettz(timezone_str) if timezone_str else None

    info = tz_by_location[(lat, lng)]

    if info is None:
        # No usable timezone for this location: record the offset as missing
        df_fifa.at[game, 'UTC'] = np.nan
        unresolved_games += 1

    else:
        # Get exact datetime at noon on gameday
        dt = datetime(df_fifa.year[game], df_fifa.month[game],
                      df_fifa.day[game], 12, 0, tzinfo=info)

        # Get UTC offset, in hours
        df_fifa.at[game, 'UTC'] = dt.utcoffset() / timedelta(hours=1)

    # Print Progress
    if game % 1000 == 0:
        print(game, "out of", len(df_fifa))

if unresolved_games:
    print("No timezone found for", unresolved_games, "games")

print("Done!")

0 out of 14312
1000 out of 14312
2000 out of 14312
3000 out of 14312
4000 out of 14312
5000 out of 14312
6000 out of 14312
7000 out of 14312
8000 out of 14312
9000 out of 14312
10000 out of 14312
11000 out of 14312
12000 out of 14312
13000 out of 14312
14000 out of 14312
Done!


In [71]:
# Create datetime object column
# Parse each 'day.month.year' string and store the result in the same row.
for game, date_text in df_fifa.date.items():
    parsed_date = datetime.strptime(date_text, '%d.%m.%Y')
    df_fifa.at[game, 'datetime'] = parsed_date

# Add Previous Game Info

In [72]:
# Placeholder columns for previous-game information, one Home/Away pair each.
# The `tuple` and `datetime` classes act as "no value yet" markers; the
# numeric columns start at 0.
previous_game_defaults = [
    ('PrevCord', tuple),
    ('PrevDate', datetime),
    ('PrevTZ', 0.),
    ('TimeDelta', 0.),
    ('Travel', 0.),
    ('TZDelta', 0.),
]

for column_stem, placeholder in previous_game_defaults:
    for side in ('Home', 'Away'):
        df_fifa[column_stem + side] = [placeholder] * games

In [73]:
# Fill in Previous Coordinates Information
# (every game in the frame is processed, not only 2006 and on)
#
# Walk the games from first to last while remembering the most recent row each
# team appeared in, home or away. That row is the team's previous game, found
# without filtering the whole frame again for every single row.
# Like the label lookups below, this relies on the index being 0..n-1.
last_game_row = {}

for row in range(len(df_fifa)):

    # Identify Teams
    home = df_fifa.Home[row]
    away = df_fifa.Away[row]

    for team, side in ((home, 'Home'), (away, 'Away')):

        prev_row = last_game_row.get(team)

        # If previous game found
        if prev_row is not None:

            # Set previous values
            df_fifa.at[row, 'PrevCord' + side] = df_fifa.latitude[prev_row], df_fifa.longitude[prev_row]
            df_fifa.at[row, 'PrevDate' + side] = df_fifa.datetime[prev_row]
            df_fifa.at[row, 'PrevTZ' + side] = df_fifa.UTC[prev_row]

    # This game becomes the previous game for both teams from here on
    last_game_row[home] = row
    last_game_row[away] = row

    # Progress tracker
    if (row + 1) % 1000 == 0:
        print(row + 1, "out of", len(df_fifa))

print("Done!")

1000 out of 14312
2000 out of 14312
3000 out of 14312
4000 out of 14312
5000 out of 14312
6000 out of 14312
7000 out of 14312
8000 out of 14312
9000 out of 14312
10000 out of 14312
11000 out of 14312
12000 out of 14312
13000 out of 14312
14000 out of 14312
Done!


In [74]:
# Calculate Days Between Games
# Only games after 2004 are processed.
one_day = timedelta(days=1)

for row in df_fifa[df_fifa.year > 2004].index.values:

    # Change in Days, for each side of the fixture
    for side in ('Home', 'Away'):
        previous_date = df_fifa['PrevDate' + side][row]

        # The datetime class itself is the "no previous game" placeholder
        if previous_date != datetime:
            df_fifa.at[row, 'TimeDelta' + side] = (df_fifa.datetime[row] - previous_date) / one_day

    if row % 1000 == 0:
        print(row/len(df_fifa))

print("Done!")

0.13974287311347122
0.20961430967020683
0.27948574622694244
0.34935718278367806
0.41922861934041367
0.4891000558971492
0.5589714924538849
0.6288429290106204
0.6987143655673561
0.7685858021240917
0.8384572386808273
0.9083286752375629
0.9782001117942984
Done!


In [75]:
# Add Fatigue Metrics for 2006 on

def _fatigue_per_day(prev_cord, game_cord, tz_shift, days_since_last):
    """Return (km travelled per day, hours of time-zone shift per day) since a team's previous game.

    prev_cord / game_cord: (latitude, longitude) of the previous and current game.
    tz_shift: current UTC offset minus the previous game's UTC offset, in hours.
    days_since_last: days between the two games.

    Both values are 0 when the previous game was more than two weeks ago, or
    when the gap is 0 days (the value the TimeDelta columns start with).
    """
    if days_since_last > 14 or days_since_last == 0:
        return 0, 0

    # A shift of more than 12 hours is shorter the other way round the globe.
    # Wrap it and keep the direction, so both branches yield a signed value.
    if abs(tz_shift) > 12:
        tz_shift = tz_shift - 24 if tz_shift > 0 else tz_shift + 24

    travel_km = geodesic(prev_cord, game_cord).km
    return travel_km / days_since_last, tz_shift / days_since_last


for row in df_fifa[df_fifa.year > 2005].index.values:

    # Find Coordinates
    game_cord = df_fifa.latitude[row], df_fifa.longitude[row]

    # Each team is handled on its own, so a missing previous game for one
    # team does not prevent the other team's metrics from being computed.
    for side in ('Home', 'Away'):

        prev_cord = df_fifa['PrevCord' + side][row]

        # The `tuple` class is the "no previous game" placeholder: keep the zeros
        if prev_cord is tuple:
            continue

        # Find TimeZone Change
        tz_shift = df_fifa.UTC[row] - df_fifa['PrevTZ' + side][row]

        # Save fatigue ratings
        travel, tz_rate = _fatigue_per_day(prev_cord, game_cord, tz_shift,
                                           df_fifa['TimeDelta' + side][row])
        df_fifa.at[row, 'Travel' + side] = travel
        df_fifa.at[row, 'TZDelta' + side] = tz_rate

    if row % 1000 == 0:
        print(row/len(df_fifa))

print("Done!")

0.20961430967020683
0.27948574622694244
0.34935718278367806
0.41922861934041367
0.4891000558971492
0.5589714924538849
0.6288429290106204
0.6987143655673561
0.7685858021240917
0.8384572386808273
0.9083286752375629
0.9782001117942984
Done!


In [76]:
# Sanity Check on random row
# Print every column name next to its value for the row at position 14020.
sample_row_values = df_fifa[14020:].values[0]
for column_name, cell_value in zip(df_fifa.columns.values, sample_row_values):
    print(column_name, ":", cell_value)

date : 09.04.2018
Home : Tunisia
Home Confed : CAF
Away : Iran
Away Confed : AFC
FT H : 1
FT G : 0
AET H : nan
AET G : nan
AP H : nan
AP G : nan
ET : False
PEN : False
Type : FSS
Town : Tunis
Land : Tunisia
neutral venue : No
HomeAdv : nan
Win : nan
day : 9
month : 4
year : 2018
PrevDiffHome1 : nan
PrevDiffAway1 : nan
PrevDiffHome2 : nan
PrevDiffAway2 : nan
PrevDiffHome3 : nan
PrevDiffAway3 : nan
PrevDiffHome4 : nan
PrevDiffAway4 : nan
PrevDiffHome5 : nan
PrevDiffAway5 : nan
PrevDiffHome6 : nan
PrevDiffAway6 : nan
PrevDiffHome7 : nan
PrevDiffAway7 : nan
PrevDiffHome8 : nan
PrevDiffAway8 : nan
PrevDiffHome9 : nan
PrevDiffAway9 : nan
PrevDiffHome10 : nan
PrevDiffAway10 : nan
latitude : 33.8439408
longitude : 9.400138
coordinates : (33.8439408, 9.400138)
dist_home : 334.9912975585401
dist_away : 3820.960795884104
UTC : 1.0
datetime : 2018-04-09 00:00:00
PrevCordHome : (36.77595335, 10.280169744974)
PrevCordAway : (35.7006177, 51.4013785)
PrevDateHome : 2017-11-11 00:00:00
PrevDateAway : 2

# Add Match History (for 2018)

In [77]:
def _goal_diff_for_team(df: pd.DataFrame, row: int, team):
    """Return the goal difference of the game in `row` as seen by `team`, or None if `team` did not play in it."""
    if df.at[row, 'Home'] == team:
        return df.at[row, 'FT H'] - df.at[row, 'FT G']
    if df.at[row, 'Away'] == team:
        return df.at[row, 'FT G'] - df.at[row, 'FT H']
    return None


def add_history_2018(df: pd.DataFrame, n: int, year: int = 2018):
    """Fill the PrevDiffHome1..n / PrevDiffAway1..n columns for the trailing rows of `year`.

    Starting at the last row and moving upwards for as long as the row's
    'year' equals `year`, each game gets the goal differences of the up-to-`n`
    most recent earlier rows in which its home team (and, separately, its away
    team) took part. Differences are taken from the point of view of the team
    in question, whichever side it was listed on in the earlier game:
    'PrevDiffHome1' is the nearest earlier game of the home team,
    'PrevDiffHome2' the one before that, and so on.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'year', 'Home', 'Away', 'FT H' and 'FT G'. Rows are
        addressed by the labels 0..len(df)-1, so the frame is expected to carry
        the default integer index, with earlier games above later ones.
    n : int
        Maximum number of previous games recorded per team.
    year : int, default 2018
        Year whose trailing block of rows is processed.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in; it is modified in place.
    """
    # Row Index i, Start From Bottom
    i = len(df) - 1

    # `i >= 0` ends the walk at the top of the frame. Without it an empty
    # frame, or one consisting only of `year` rows, ran on to label -1 and
    # raised KeyError.
    while i >= 0 and df.at[i, 'year'] == year:

        # Get Team Names
        home_team = df.at[i, 'Home']
        away_team = df.at[i, 'Away']

        # Running Count of Previous Games Found
        home_count = 0
        away_count = 0

        # Row Index j: scan upwards until both teams have n games or rows run out
        j = i - 1
        while j >= 0 and (home_count < n or away_count < n):

            if home_count < n:
                diff = _goal_diff_for_team(df, j, home_team)
                if diff is not None:
                    df.at[i, 'PrevDiffHome{}'.format(home_count + 1)] = diff
                    home_count += 1

            if away_count < n:
                diff = _goal_diff_for_team(df, j, away_team)
                if diff is not None:
                    df.at[i, 'PrevDiffAway{}'.format(away_count + 1)] = diff
                    away_count += 1

            j -= 1

        # Next Row
        i -= 1

    # Return DataFrame with New Columns
    return df

In [78]:
# Add History: fill the PrevDiffHome1..10 / PrevDiffAway1..10 columns for the
# 2018 matches at the bottom of the frame (n=10 previous games per team).
# add_history_2018 writes into the frame it is given and returns that same
# object, so the reassignment only keeps the name bound to it.
df_fifa = add_history_2018(df_fifa, 10)

In [79]:
# Check: display the last five rows of the frame, which is where
# add_history_2018 starts its bottom-up walk over the 2018 matches.
df_fifa.tail()

Unnamed: 0,date,Home,Home Confed,Away,Away Confed,FT H,FT G,AET H,AET G,AP H,...,PrevDateHome,PrevDateAway,PrevTZHome,PrevTZAway,TimeDeltaHome,TimeDeltaAway,TravelHome,TravelAway,TZDeltaHome,TZDeltaAway
14307,7.07.2018,Sweden,,England,,0,2,,,,...,2018-07-03 00:00:00,2018-07-03 00:00:00,3.0,3.0,4.0,4.0,355.382526,214.257834,0.25,0.25
14308,10.07.2018,France,,Belgium,,1,0,,,,...,2018-07-06 00:00:00,2018-07-06 00:00:00,3.0,3.0,4.0,4.0,224.689922,300.845097,0.0,0.0
14309,11.07.2018,Croatia,,England,,1,1,2.0,1.0,,...,2018-07-07 00:00:00,2018-07-07 00:00:00,3.0,4.0,4.0,4.0,340.346463,214.257834,0.0,-0.25
14310,14.07.2018,Belgium,,England,,2,0,,,,...,2018-07-10 00:00:00,2018-07-11 00:00:00,3.0,3.0,4.0,3.0,0.0,212.050996,0.0,0.0
14311,15.07.2018,France,,Croatia,,4,2,,,,...,2018-07-10 00:00:00,2018-07-11 00:00:00,3.0,3.0,5.0,4.0,127.230597,0.0,0.0,0.0


In [80]:
# Check: first five matches played in 2006, for comparison with the 2018 rows
played_in_2006 = df_fifa['year'] == 2006
df_fifa.loc[played_in_2006].head(n=5)

Unnamed: 0,date,Home,Home Confed,Away,Away Confed,FT H,FT G,AET H,AET G,AP H,...,PrevDateHome,PrevDateAway,PrevTZHome,PrevTZAway,TimeDeltaHome,TimeDeltaAway,TravelHome,TravelAway,TZDeltaHome,TZDeltaAway
2744,02.01.2006,Qatar,AFC,Libya,CAF,2,0,,,,...,2005-12-05 00:00:00,2005-12-02 00:00:00,3.0,4.0,28.0,31.0,0.0,0.0,0.0,0.0
2745,05.01.2006,Egypt,CAF,Zimbabwe,CAF,2,0,,,,...,2005-12-29 00:00:00,2005-12-31 00:00:00,2.0,2.0,7.0,5.0,25.948812,1085.359653,0.0,0.0
2746,07.01.2006,Togo,CAF,Guinea,CAF,0,1,,,,...,2005-11-13 00:00:00,2005-11-27 00:00:00,3.5,0.0,55.0,41.0,0.0,0.0,0.0,0.0
2747,09.01.2006,Morocco,CAF,DR Congo,CAF,3,0,,,,...,2005-11-15 00:00:00,2005-12-14 00:00:00,1.0,2.0,55.0,26.0,0.0,0.0,0.0,0.0
2748,11.01.2006,Togo,CAF,Ghana,CAF,1,0,,,,...,2006-01-07 00:00:00,2005-11-14 00:00:00,1.0,3.0,4.0,58.0,403.133862,0.0,0.0,0.0


In [89]:
# Sanity check on the last row: print every column name next to its value.
# The row is taken by position from the end instead of the hard-coded index
# 14311, so the check still targets the final match if the row count changes.
# It is also extracted once rather than re-sliced for every printed column.
last_row = df_fifa.iloc[-1:].values[0]
for col_name, cell_value in zip(df_fifa.columns.values, last_row):
    print(col_name, ":", cell_value)

date : 15.07.2018
Home : France
Home Confed : UEFA
Away : Croatia
Away Confed : UEFA
FT H : 4
FT G : 2
AET H : nan
AET G : nan
AP H : nan
AP G : nan
ET : False
PEN : False
Type : WM
Town : Moscow
Land : Russia
neutral venue : Yes
HomeAdv : 0.0
Win : 2.0
day : 15
month : 7
year : 2018
PrevDiffHome1 : 1.0
PrevDiffAway1 : 0.0
PrevDiffHome2 : 2.0
PrevDiffAway2 : 0.0
PrevDiffHome3 : 1.0
PrevDiffAway3 : 0.0
PrevDiffHome4 : 0.0
PrevDiffAway4 : 1.0
PrevDiffHome5 : 1.0
PrevDiffAway5 : 3.0
PrevDiffHome6 : 1.0
PrevDiffAway6 : 2.0
PrevDiffHome7 : 0.0
PrevDiffAway7 : 1.0
PrevDiffHome8 : 2.0
PrevDiffAway8 : -2.0
PrevDiffHome9 : 2.0
PrevDiffAway9 : 1.0
PrevDiffHome10 : 2.0
PrevDiffAway10 : -2.0
latitude : 55.7507178
longitude : 37.6176606
coordinates : (55.7507178, 37.6176606)
dist_home : 2493.691069065942
dist_away : 1871.536479172245
UTC : 3.0
datetime : 2018-07-15 00:00:00
PrevCordHome : (59.938732, 30.316229)
PrevCordAway : (55.7507178, 37.6176606)
PrevDateHome : 2018-07-10 00:00:00
PrevDateAway 

In [83]:
# Add Home Advantage variable for the 2018 matches in one vectorized pass:
#    1.0 -> the listed home team's name equals the host country ('Land')
#   -1.0 -> the listed away team's name equals the host country
#    0.0 -> neither team's name equals the host country
# Only rows with year == 2018 are written. This replaces a per-row Python loop
# that used chained attribute indexing (df_fifa.Home[game]); the values
# assigned are the same.
is_2018 = df_fifa['year'] == 2018
games_2018 = df_fifa.loc[is_2018, ['Home', 'Away', 'Land']]

df_fifa.loc[is_2018, 'HomeAdv'] = np.select(
    [
        (games_2018['Home'] == games_2018['Land']).to_numpy(),
        (games_2018['Away'] == games_2018['Land']).to_numpy(),
    ],
    [1.0, -1.0],  # the first condition that holds wins, like the original if/elif
    default=0.0,
)

In [84]:
# Write Data Set with Match History to File
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an unnamed first column; a later read_csv will presumably surface it as
# 'Unnamed: 0' (the load cell at the top of this notebook drops such a column
# from its own input) — confirm that readers of this file expect it.
df_fifa.to_csv('WorldCupData_.csv')

In [85]:
# Look at descriptions of data (summary statistics of the numeric columns)
# NOTE(review): in the output below TimeDeltaHome/TimeDeltaAway and
# TravelHome/TravelAway have negative minima — presumably rows whose "previous"
# game is dated after the match itself; worth verifying before modelling.
df_fifa.describe()

Unnamed: 0,FT H,FT G,AET H,AET G,AP H,AP G,HomeAdv,Win,day,month,...,dist_away,UTC,PrevTZHome,PrevTZAway,TimeDeltaHome,TimeDeltaAway,TravelHome,TravelAway,TZDeltaHome,TZDeltaAway
count,14312.0,14312.0,86.0,86.0,249.0,249.0,14312.0,13927.0,14312.0,14312.0,...,14312.0,14312.0,14312.0,14312.0,14312.0,14312.0,14312.0,14312.0,14312.0,14312.0
mean,1.573155,1.066308,1.639535,1.430233,4.0,3.939759,0.712968,0.599555,14.529486,6.921045,...,3237.382454,1.647062,1.676845,1.584754,34.442216,37.122694,104.886451,152.643413,0.002506,0.008994
std,1.559287,1.255609,1.2641,1.057641,1.729721,1.83832,0.473966,0.423505,8.575867,3.267421,...,3076.309987,3.885165,3.85279,3.891316,79.681101,84.411542,304.659081,411.376341,0.188174,0.260052
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,1.0,1.0,...,0.0,-11.0,-11.0,-11.0,-80.0,-74.0,-3120.909964,-1845.139554,-2.5,-3.0
25%,0.0,0.0,1.0,1.0,3.0,3.0,0.0,0.0,7.0,4.0,...,1176.444832,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,1.0,1.0,4.0,4.0,1.0,0.5,13.0,7.0,...,2202.904078,2.0,2.0,2.0,7.0,7.0,0.0,0.0,0.0,0.0
75%,2.0,2.0,2.0,2.0,5.0,5.0,1.0,1.0,22.0,10.0,...,4242.801789,3.0,3.0,3.0,42.0,42.0,0.0,99.582435,0.0,0.0
max,17.0,21.0,7.0,4.0,11.0,12.0,1.0,2.0,31.0,12.0,...,19604.328611,13.0,13.0,13.0,2917.0,1444.0,7444.818493,15112.446277,4.5,10.0
