In [74]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sqlite3

from collections import Counter, defaultdict
from bs4 import BeautifulSoup

import numpy as np
from numpy import random

In [166]:
# Load every table used below from the SQLite database.
# `with sqlite3.connect(...)` only scopes a transaction and leaves the
# connection open afterwards, so the connection is closed explicitly once all
# tables have been read (or if any read fails).
con = sqlite3.connect('../data/raw/database.sqlite')
try:
    countries = pd.read_sql_query("SELECT * from Country", con)
    matches = pd.read_sql_query("SELECT * from Match", con, parse_dates=['date'])
    leagues = pd.read_sql_query("SELECT * from League", con)
    teams = pd.read_sql_query("SELECT * from Team", con)
    players = pd.read_sql_query("SELECT * from Player", con)
    players_stats = pd.read_sql_query("SELECT * from Player_Stats", con)
finally:
    con.close()

In [167]:
# Restrict the data to the selected countries, their leagues, and the matches
# played in those leagues.
selected_countries = ['England']
countries = countries[countries.name.isin(selected_countries)]
# NOTE(review): this joins Country.id to League.id (both columns are named
# `id`); presumably the two ids coincide in this database -- verify.
leagues = countries.merge(leagues, on='id', suffixes=('', '_y'))
# Filter and drop incomplete rows in one chained expression so that dropna
# never mutates a filtered slice in place (which can trigger
# SettingWithCopyWarning). dropna() discards a match when ANY column is null.
matches = matches[matches.league_id.isin(leagues.id)].dropna()
matches.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
3215,3216,1729,1729,2012/2013,1,2012-08-20,1228230,8668,10260,1,...,1.91,4.3,3.7,1.87,4.6,3.5,1.83,4.0,3.4,1.91
3216,3217,1729,1729,2012/2013,1,2012-08-18,1228231,9879,9850,5,...,4.5,1.83,3.75,4.75,1.8,3.5,4.4,1.83,3.5,4.33
3217,3218,1729,1729,2012/2013,1,2012-08-19,1228232,8456,8466,3,...,19.0,1.2,7.0,19.0,1.18,6.25,16.0,1.17,6.5,17.0
3218,3219,1729,1729,2012/2013,1,2012-08-18,1228233,10261,8586,2,...,2.8,2.62,3.4,2.75,2.6,3.3,2.75,2.5,3.4,2.7
3219,3220,1729,1729,2012/2013,1,2012-08-18,1228234,10172,10003,0,...,3.6,2.0,3.5,4.0,2.0,3.4,3.6,2.0,3.4,3.6


In [168]:
def get_team_long_name(team_id):
    """Look up a team's long name in the module-level ``teams`` frame.

    Returns the ``team_long_name`` of the first row whose ``team_api_id``
    equals ``team_id``; raises ``IndexError`` when no row matches.
    """
    is_requested_team = teams['team_api_id'] == team_id
    matching_names = teams.loc[is_requested_team, 'team_long_name']
    return matching_names.values[0]

def get_team_short_name(team_id):
    """Look up a team's short name in the module-level ``teams`` frame.

    Returns the ``team_short_name`` of the first row whose ``team_api_id``
    equals ``team_id``; raises ``IndexError`` when no row matches.
    """
    is_requested_team = teams['team_api_id'] == team_id
    matching_names = teams.loc[is_requested_team, 'team_short_name']
    return matching_names.values[0]

def get_formation(match_api_id, team_type):
    """Build a formation string such as ``'4-4-2'`` for one side of a match.

    Reads the eleven ``<team_type>_player_Y1`` .. ``<team_type>_player_Y11``
    values from the first row of the module-level ``matches`` frame whose
    ``match_api_id`` equals ``match_api_id`` and counts how many players share
    each Y value. The counts are joined with ``-`` in ascending Y order,
    leaving out the lowest Y value (presumably the goalkeeper's row -- verify
    against the data).

    Parameters
    ----------
    match_api_id
        Value compared against ``matches.match_api_id``.
    team_type : str
        Column prefix; this notebook passes ``'home'`` or ``'away'``.

    Returns
    -------
    str
        Dash-separated player counts per Y value.

    Raises
    ------
    IndexError
        If no row of ``matches`` has the given ``match_api_id``.
    """
    # Filter the frame once rather than once per player column.
    match_rows = matches.loc[matches.match_api_id == match_api_id]
    players_per_row = Counter(
        match_rows['%s_player_Y%d' % (team_type, i)].values[0]
        for i in range(1, 12)
    )
    sorted_rows = sorted(players_per_row)

    # Skip the lowest Y value. When all players share one Y value there is
    # nothing left to skip to, so that single count is reported as-is.
    reported_rows = sorted_rows[1:] or sorted_rows
    return '-'.join('%d' % players_per_row[row] for row in reported_rows)

# Attach readable team names for both sides of every match:
# (new column, source id column, lookup function).
for target_column, id_column, lookup in (
    ('home_team_long_name', 'home_team_api_id', get_team_long_name),
    ('home_team_short_name', 'home_team_api_id', get_team_short_name),
    ('away_team_long_name', 'away_team_api_id', get_team_long_name),
    ('away_team_short_name', 'away_team_api_id', get_team_short_name),
):
    matches[target_column] = matches[id_column].map(lookup)

# Derive a formation string per side from the match id.
for target_column, side in (
    ('home_team_formation', 'home'),
    ('away_team_formation', 'away'),
):
    matches[target_column] = matches['match_api_id'].apply(get_formation, args=(side,))

In [169]:
# Preview the goals and short team names of the first few matches.
matches.loc[
    :,
    ['home_team_goal', 'away_team_goal', 'home_team_short_name', 'away_team_short_name'],
].head()

Unnamed: 0,home_team_goal,away_team_goal,home_team_short_name,away_team_short_name
3215,1,0,EVE,MUN
3216,5,0,FUL,NOR
3217,3,2,MCI,SOU
3218,2,1,NEW,TOT
3219,0,5,QPR,SWA


In [170]:
def calculate_result(match_df):
    """Classify one match as ``'home'``, ``'away'`` or ``'draw'``.

    ``match_df`` is indexed with ``'home_team_goal'`` and ``'away_team_goal'``
    (a single row of the matches frame when used with ``apply(axis=1)``).
    Whichever side scored strictly more goals wins; anything else is a draw.
    """
    home_goals, away_goals = match_df['home_team_goal'], match_df['away_team_goal']

    if home_goals > away_goals:
        return 'home'
    if away_goals > home_goals:
        return 'away'
    return 'draw'

# Label every match by comparing the two goal columns as whole arrays rather
# than calling calculate_result once per row. Rows where neither strict
# comparison holds fall through to 'draw', the same rule calculate_result uses.
matches['result'] = np.select(
    [
        matches['home_team_goal'] > matches['away_team_goal'],
        matches['away_team_goal'] > matches['home_team_goal'],
    ],
    ['home', 'away'],
    default='draw',
)

In [171]:
def calculate_crosses_count(crosses):
    """Count cross events per team in one match's ``cross`` XML string.

    Parses ``crosses`` with BeautifulSoup's ``'xml'`` parser and walks every
    ``<value>`` element. An element is counted when it has both a ``crosses``
    and a ``team`` descendant; the count is keyed by the team element's text.

    Returns a ``defaultdict(int)``, so teams without any counted event read
    as 0.
    """
    soup = BeautifulSoup(crosses, 'xml')
    crosses_per_team = defaultdict(int)

    for event in soup.find_all('value'):
        # Skip events that lack either the crosses marker or a team.
        if not (event.crosses and event.team):
            continue
        crosses_per_team[event.team.text] += 1

    return crosses_per_team

# One per-team cross tally for every match, aligned with the matches index.
crosses_info = matches['cross'].map(calculate_crosses_count)

In [172]:
def get_crosses_list(crosses_info, matches):
    """Split per-match cross tallies into home-team and away-team counts.

    ``crosses_info`` holds one mapping per match, keyed by the team id as a
    string; it is read positionally, in the same row order as ``matches``.

    Returns a tuple ``(home_team_crosses, away_team_crosses)`` of lists with
    one count per row of ``matches``.
    """
    def _counts_for(team_ids):
        # Row position i of `matches` pairs with the i-th tally.
        return [
            crosses_info.iloc[position][str(team_id)]
            for position, team_id in enumerate(team_ids)
        ]

    home_team_crosses = _counts_for(matches.home_team_api_id.values)
    away_team_crosses = _counts_for(matches.away_team_api_id.values)
    return (home_team_crosses, away_team_crosses)

In [173]:
# get_crosses_list returns (home counts, away counts), one entry per match row.
home_cross_counts, away_cross_counts = get_crosses_list(crosses_info, matches)
matches['num_home_team_crosses'] = home_cross_counts
matches['num_away_team_crosses'] = away_cross_counts

In [178]:
# Columns of interest for modelling; 'result' is the label column.
features = [
    'date',
    'home_team_formation',
    'home_team_short_name',
    'away_team_formation',
    'away_team_short_name',
    'stage',
    'num_home_team_crosses',
    'num_away_team_crosses',
    'B365H',
    'B365D',
    'B365A',
    'result',
]

In [179]:
from sklearn.preprocessing import LabelEncoder

In [180]:
# Integer-encode the categorical columns in place. Every fitted encoder is
# kept in `label_encoders` (keyed by column name) so that codes can later be
# mapped back to the original labels with `inverse_transform`.
label_encoders = {}
for feature in ['home_team_formation', 'home_team_short_name', 'away_team_formation', 'away_team_short_name', 'result']:
    lbl = LabelEncoder()
    matches[feature] = lbl.fit_transform(matches[feature])
    label_encoders[feature] = lbl

In [181]:
# Calendar parts of the match date: day of week (Monday=0) and day of month.
matches['weekday'] = matches['date'].dt.dayofweek
matches['day'] = matches['date'].dt.day

In [182]:
# Design matrix (encoded teams/formations, stage, cross counts, B365 columns)
# and the encoded match result as the target.
X = matches[[
    'home_team_formation',
    'home_team_short_name',
    'away_team_formation',
    'away_team_short_name',
    'stage',
    'num_home_team_crosses',
    'num_away_team_crosses',
    'B365H',
    'B365D',
    'B365A',
]]
y = matches['result']

In [183]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection` since 0.18. Fall back to the old module
# only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the matches for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

In [201]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

# Fit a default-configured logistic regression on the training split.
# fit() returns the estimator itself, so ending the cell with `est` displays
# the fitted model.
est = LogisticRegression().fit(X_train, y_train)
est

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [202]:
from sklearn.metrics import accuracy_score

# Fraction of held-out matches whose result the model predicts correctly.
test_accuracy = accuracy_score(y_test, est.predict(X_test))
print('Accuracy score on test set %f ' % test_accuracy)

Accuracy score on test set 0.578947 


In [160]:
# Peek at the raw `goal` XML of the first match to see its structure.
matches['goal'].values[0]

'<goal><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>406</event_incident_typefk><elapsed>57</elapsed><player2>24156</player2><subtype>header</subtype><player1>39618</player1><sortorder>4</sortorder><team>8668</team><id>2299281</id><n>327</n><type>goal</type><goal_type>n</goal_type></value></goal>'