# 1.Import library and read data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("whole data.csv")
data = data.drop(columns = ['Unnamed: 0'])

In [3]:
data.head()

Unnamed: 0,Player,MP,Min,Goals,Assists,CrdY,CrdR,count,Comp,Shots,...,age,best_position,club,name,overall,position,potential,preferred_foot,value,wage
0,Yunis Abdelhamid,34,2983,0.06,0.0,0.15,0.03,253,Ligue 1,0.54,...,33,CB,Stade de Reims,Yunis Abdelhamid,76,LCB,76,Left,€3.6M,€23K
1,Salis Abdul Samed,31,2462,0.04,0.0,0.44,0.11,1065,Ligue 1,0.66,...,21,CDM,Clermont Foot 63,Salis Abdul Samed,70,SUB,75,Right,€2.2M,€7K
2,Laurent Abergel,34,2956,0.0,0.06,0.27,0.0,25,Ligue 1,0.91,...,28,CDM,FC Lorient,Laurent Abergel,75,CDM,75,Right,€4.9M,€18K
3,Dickson Abiama,24,726,0.0,0.12,0.37,0.0,1937,Bundesliga,2.22,...,22,ST,SpVgg Greuther Fürth,Dickson Abiama,68,SUB,76,Right,€2.7M,€9K
4,Francesco Acerbi,30,2536,0.14,0.0,0.07,0.04,941,Serie A,0.57,...,33,CB,Lazio,Francesco Acerbi,83,LCB,83,Left,€17.5M,€75K


In [4]:
# check missing values
data.isnull().sum()[data.isnull().sum() != 0]

Series([], dtype: int64)

### => we don't have any missing values

In [5]:
data.dtypes

Player                       object
MP                            int64
Min                           int64
Goals                       float64
Assists                     float64
CrdY                        float64
CrdR                        float64
count                         int64
Comp                         object
Shots                       float64
Acceleration                  int64
Aggression                    int64
Agility                       int64
Balance                       int64
Ball_Control                  int64
Composure                     int64
Crossing                      int64
Curve                         int64
Dribbling                     int64
FK_Accuracy                   int64
Finishing                     int64
GK_Diving                     int64
GK_Handling                   int64
GK_Kicking                    int64
GK_Positioning                int64
GK_Reflexes                   int64
Heading_Accuracy              int64
Interceptions               

# 2. Processing with position and best_position

In [6]:
# see position of players
data['position'].unique()

array(['LCB', 'SUB', 'CDM', 'LB', 'RES', 'LM', 'LDM', 'RB', 'RCB', 'LS',
       'LCM', 'RCM', 'RM', 'LWB', 'RDM', 'ST', 'CM', 'RW', 'RS', 'GK',
       'CAM', 'RWB', 'RF', 'LF', 'CB', 'LAM', 'CF', 'LW', 'RAM'],
      dtype=object)

In [7]:
# we replace 'SUB' and 'RES' position by best_positon
for i in range(len(data['position'])):
    if data['position'][i] == 'SUB' or data['position'][i] == 'RES':
        data['position'][i] = data['best_position'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['position'][i] = data['best_position'][i]


In [8]:
# we make 1 column with only 4 positions in common : goalkeeper, defender, midfielder, striker
position_common = []
for i in data['position']:
    if i in ['GK']:
        position_common.append("goalkeeper")
    elif i in ['LCB', 'LB', 'RB', 'RCB', 'LWB', 'CB','RWB']:
        position_common.append("defender")
    elif i in ['CDM', 'CAM', 'LM', 'CM', 'LDM', 'LCM', 'RM', 'RCM', 'RDM', 'LAM', 'RAM']:
        position_common.append("midfielder")
    elif i in ['ST', 'LS','RW', 'RS', 'RF', 'LW', 'LF', 'CF' ]:
        position_common.append("striker")
    else:
        continue

# add position_column into data tables
data['position common'] = position_common

In [9]:
data.head()

Unnamed: 0,Player,MP,Min,Goals,Assists,CrdY,CrdR,count,Comp,Shots,...,best_position,club,name,overall,position,potential,preferred_foot,value,wage,position common
0,Yunis Abdelhamid,34,2983,0.06,0.0,0.15,0.03,253,Ligue 1,0.54,...,CB,Stade de Reims,Yunis Abdelhamid,76,LCB,76,Left,€3.6M,€23K,defender
1,Salis Abdul Samed,31,2462,0.04,0.0,0.44,0.11,1065,Ligue 1,0.66,...,CDM,Clermont Foot 63,Salis Abdul Samed,70,CDM,75,Right,€2.2M,€7K,midfielder
2,Laurent Abergel,34,2956,0.0,0.06,0.27,0.0,25,Ligue 1,0.91,...,CDM,FC Lorient,Laurent Abergel,75,CDM,75,Right,€4.9M,€18K,midfielder
3,Dickson Abiama,24,726,0.0,0.12,0.37,0.0,1937,Bundesliga,2.22,...,ST,SpVgg Greuther Fürth,Dickson Abiama,68,ST,76,Right,€2.7M,€9K,striker
4,Francesco Acerbi,30,2536,0.14,0.0,0.07,0.04,941,Serie A,0.57,...,CB,Lazio,Francesco Acerbi,83,LCB,83,Left,€17.5M,€75K,defender


# 3 .Processing with value and wage:

In [10]:
market_value = []
for i in range(len(data['value'])):
    if data['value'][i][-1] == 'K':
        market_value.append( float(data['value'][i][1 : -1]) * 1_000)
    else:
        market_value.append( float(data['value'][i][1 : -1]) * 1_000_000)
data['market value'] = market_value
data = data.drop(columns = ['value'])

In [11]:
wage_value = []
for i in range(len(data['wage'])):
    if data['wage'][i][-1] == 'K':
        wage_value.append( float(data['wage'][i][1 : -1]) * 1_000)
    else:
        wage_value.append(float(data['wage'][i][1:]))
data['wage value'] = wage_value
data = data.drop(columns = ['wage'])

In [12]:
data.head()

Unnamed: 0,Player,MP,Min,Goals,Assists,CrdY,CrdR,count,Comp,Shots,...,best_position,club,name,overall,position,potential,preferred_foot,position common,market value,wage value
0,Yunis Abdelhamid,34,2983,0.06,0.0,0.15,0.03,253,Ligue 1,0.54,...,CB,Stade de Reims,Yunis Abdelhamid,76,LCB,76,Left,defender,3600000.0,23000.0
1,Salis Abdul Samed,31,2462,0.04,0.0,0.44,0.11,1065,Ligue 1,0.66,...,CDM,Clermont Foot 63,Salis Abdul Samed,70,CDM,75,Right,midfielder,2200000.0,7000.0
2,Laurent Abergel,34,2956,0.0,0.06,0.27,0.0,25,Ligue 1,0.91,...,CDM,FC Lorient,Laurent Abergel,75,CDM,75,Right,midfielder,4900000.0,18000.0
3,Dickson Abiama,24,726,0.0,0.12,0.37,0.0,1937,Bundesliga,2.22,...,ST,SpVgg Greuther Fürth,Dickson Abiama,68,ST,76,Right,striker,2700000.0,9000.0
4,Francesco Acerbi,30,2536,0.14,0.0,0.07,0.04,941,Serie A,0.57,...,CB,Lazio,Francesco Acerbi,83,LCB,83,Left,defender,17500000.0,75000.0


# 4. Remove redundant columns

In [13]:
data_processed = data.drop(columns = ['best_position', 'Player','club','position','count' ])

In [14]:
data_processed.to_csv("processed data.csv")