In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
raw_data = pd.read_csv('Ronaldo_Messi.csv')

In [4]:
# Find størrelsen på datasættet
raw_data.shape

(1413, 14)

In [5]:
# Find kolonnernes datatyper
raw_data.dtypes

Player              object
Season              object
Competition         object
Matchday            object
Date                object
Venue               object
Club                object
Opponent            object
Result              object
Playing_Position    object
Minute              object
At_score            object
Type                object
Goal_assist         object
dtype: object

In [6]:
# Vis de første rækker i datasættet
raw_data.head()

Unnamed: 0,Player,Season,Competition,Matchday,Date,Venue,Club,Opponent,Result,Playing_Position,Minute,At_score,Type,Goal_assist
0,Cristiano Ronaldo,02/03,Liga Portugal,6,2002-10-07,H,Sporting CP,Moreirense FC,3:00,LW,34,2:00,Solo run,
1,Cristiano Ronaldo,02/03,Liga Portugal,6,2002-10-07,H,Sporting CP,Moreirense FC,3:00,LW,90+5,3:00,Header,Rui Jorge
2,Cristiano Ronaldo,02/03,Liga Portugal,8,2002-10-26,A,Sporting CP,Boavista FC,1:02,,88,1:02,Right-footed shot,Carlos Martins
3,Cristiano Ronaldo,02/03,Taca de Portugal Placard,Fourth Round,2002-11-24,H,Sporting CP,CD Estarreja,4:01,,67,3:00,Left-footed shot,Cesar Prates
4,Cristiano Ronaldo,02/03,Taca de Portugal Placard,Fifth Round,2002-12-18,H,Sporting CP,FC Oliveira do Hospital,8:01,,13,3:00,,


In [7]:
# Undersøg, om der mangler data i nogle af kolonnerne
raw_data.isna().sum()

Player                0
Season                0
Competition           0
Matchday              0
Date                  0
Venue                 0
Club                  0
Opponent              0
Result                0
Playing_Position     58
Minute                0
At_score              0
Type                 16
Goal_assist         460
dtype: int64

In [8]:
# Der mangler spillerens position og måltypen i nogle af kampene.
# Der mangler også assist i nogle af kampene, men det kan have en naturlig forklaring, idet der ikke behøver at være en assisterende spiller.
# Vi lader alle rækkerne stå.

In [9]:
# Der er noget ved datatyperne, som vi kan se på.
# F.eks. er datoen for kampen ikke angivet som en dato, men en streng.
type(raw_data['Date'][0])

str

In [10]:
# Vi kan lave datoerne om til typen datetime64, så vi evt kan regne med dem.
# De er i det rigtige format 'yyyy-mm-dd'
data = raw_data.copy()
data['Date'] = pd.to_datetime(data['Date'])

In [11]:
data.dtypes

Player                      object
Season                      object
Competition                 object
Matchday                    object
Date                datetime64[ns]
Venue                       object
Club                        object
Opponent                    object
Result                      object
Playing_Position            object
Minute                      object
At_score                    object
Type                        object
Goal_assist                 object
dtype: object

In [12]:
# Resultaterne er også givet i et lidt særligt format: 'h:u' for resultatet h - u
# Hjemmeholdets målscore er før ':' og udeholdets målscore er efter ':'
# Vi vil erstatte strengen 'h:u' med tuplen af int (h,u)
data['Result'] = data['Result'].map(lambda s: (int(s.split(" ")[0].split(':')[0]), int(s.split(" ")[0].split(':')[1])))
data['At_score'] = data['At_score'].map(lambda s: (int(s.split(" ")[0].split(':')[0]), int(s.split(" ")[0].split(':')[1])))


In [13]:
data.head()

Unnamed: 0,Player,Season,Competition,Matchday,Date,Venue,Club,Opponent,Result,Playing_Position,Minute,At_score,Type,Goal_assist
0,Cristiano Ronaldo,02/03,Liga Portugal,6,2002-10-07,H,Sporting CP,Moreirense FC,"(3, 0)",LW,34,"(2, 0)",Solo run,
1,Cristiano Ronaldo,02/03,Liga Portugal,6,2002-10-07,H,Sporting CP,Moreirense FC,"(3, 0)",LW,90+5,"(3, 0)",Header,Rui Jorge
2,Cristiano Ronaldo,02/03,Liga Portugal,8,2002-10-26,A,Sporting CP,Boavista FC,"(1, 2)",,88,"(1, 2)",Right-footed shot,Carlos Martins
3,Cristiano Ronaldo,02/03,Taca de Portugal Placard,Fourth Round,2002-11-24,H,Sporting CP,CD Estarreja,"(4, 1)",,67,"(3, 0)",Left-footed shot,Cesar Prates
4,Cristiano Ronaldo,02/03,Taca de Portugal Placard,Fifth Round,2002-12-18,H,Sporting CP,FC Oliveira do Hospital,"(8, 1)",,13,"(3, 0)",,


In [39]:
# Vi kan lave en ny kolonne, som viser, om spilleren vandt, tabte eller spillede uafgjort.
conditions = [(data['Result'].str[0] > data['Result'].str[1] & (data['Venue'] == 'H')) | ((data['Result'].str[0] < data['Result'].str[1]) & (data['Venue'] == 'A')), ((data['Result'].str[0] > data['Result'].str[1]) & (data['Venue'] == 'A')) | ((data['Result'].str[0] < data['Result'].str[1]) & (data['Venue'] == 'H')), data['Result'].str[0] == data['Result'].str[1]]
outcomes = ['W', 'L', 'D']
data['Player_result'] = np.select(conditions, outcomes)

In [41]:
data[['Venue', 'Result', 'Player_result']].query("Player_result == 'W'").tail(15)

Unnamed: 0,Venue,Result,Player_result
1398,H,"(4, 3)",W
1399,A,"(1, 3)",W
1400,A,"(2, 2)",W
1401,H,"(2, 1)",W
1402,A,"(0, 4)",W
1403,A,"(0, 4)",W
1404,A,"(0, 4)",W
1405,A,"(0, 4)",W
1406,H,"(4, 3)",W
1407,A,"(0, 3)",W
