In [31]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

import sqlite3

from IPython.display import display, HTML

#REferences: https://towardsdatascience.com/data-handling-using-pandas-cleaning-and-processing-3aa657dc9418
#Dataset: https://www.kaggle.com/hugomathien/soccer?

In [32]:
with sqlite3.connect('datasets/fifa_database.sqlite') as con:
    countries = pd.read_sql_query("SELECT * from Country", con)
    matches = pd.read_sql_query("SELECT * from Match", con)
    leagues = pd.read_sql_query("SELECT * from League", con)
    teams = pd.read_sql_query("SELECT * from Team", con)
    player = pd.read_sql_query("SELECT * from Player",con)
    player_attributes = pd.read_sql_query("SELECT * from Player_Attributes",con)
    sequence = pd.read_sql_query("SELECT * from sqlite_sequence",con)
    team_attributes = pd.read_sql_query("SELECT * from Team_Attributes",con)

In [7]:
df = player_attributes
df.head(5)
print(len(df))

183978


In [12]:
player[player.player_fifa_api_id==158023]

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
6169,6176,30981,Lionel Messi,158023,1987-06-24 00:00:00,170.18,159


In [13]:
player.to_csv('datasets/fifa_player_raw.csv', index=False)

## Remove unecessary columns

In [33]:
df = pd.read_csv('datasets/shawn_processed.csv')

## Data overview

In [34]:
#missing Values
print(df.isna().sum())

df_rows_with_na = df[df.isna().any(axis=1)]
print("percentage of rows with missing data is ", len(df_rows_with_na)*100//len(df), "%" )

display(df[df["volleys"].isna()])

id                     0
player_fifa_api_id     0
player_api_id          0
date                   0
overall_rating         0
potential              0
preferred_foot         0
attacking_work_rate    0
defensive_work_rate    0
crossing               0
finishing              0
heading_accuracy       0
short_passing          0
volleys                0
dribbling              0
curve                  0
free_kick_accuracy     0
long_passing           0
ball_control           0
acceleration           0
sprint_speed           0
agility                0
reactions              0
balance                0
shot_power             0
jumping                0
stamina                0
strength               0
long_shots             0
aggression             0
interceptions          0
positioning            0
vision                 0
penalties              0
marking                0
standing_tackle        0
sliding_tackle         0
gk_diving              0
gk_handling            0
gk_kicking             0


Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,player_positions,player_name,height,weight


## Wrong Data Values

### Categorial Variables 

In [35]:
# Print all unique values
categorial_columns = ['preferred_foot', 'attacking_work_rate', 'defensive_work_rate']
for c in categorial_columns:
    print(c, df[c].unique())
    
# Come up with a set of possible values
pos_values_preferred_foot = ['right', 'left']
pos_values_attacking_work =['medium', 'high', 'low']
pos_values_defensive_work = ['medium', 'high', 'low']


preferred_foot ['right' 'left']
attacking_work_rate ['medium' 'high' 'low' 'None']
defensive_work_rate ['medium' 'high' 'low' '1' '0' '4' '5' '6' '9' '7' '2' '3']


In [36]:
# Note: there are None and "None" values
def analysePossValues(df, cat_name, pos_values):
    
    valid_rows = df.loc[df[cat_name].isin(pos_values)]
    invalid_rows = df.loc[df[cat_name].isin(pos_values) == False]
    print("percentage of invalid rows is", len(invalid_rows)*100//len(df), "%")
    
    #check distribution
    valid_rows[cat_name].value_counts().plot(kind='bar')

def subWithMode(df, cat_name, pos_values):
    print('values before change\n', df[cat_name].value_counts())
    
    valid_rows = df.loc[df[cat_name].isin(pos_values)]
    value_counts = valid_rows[cat_name].value_counts()
    mode_label = value_counts.idxmax()
    print('mode label is', mode_label)
    
    df.loc[df[cat_name].isin(pos_values) == False, cat_name] = mode_label
    print('values after change\n', df[cat_name].value_counts())
    return df
    
    
    


In [37]:
df = subWithMode(df,'preferred_foot' , pos_values_preferred_foot)
df = subWithMode(df,'attacking_work_rate' , pos_values_attacking_work)
df = subWithMode(df,'defensive_work_rate' , pos_values_defensive_work)
print(df['defensive_work_rate'].value_counts())

values before change
 right    4805
left     1553
Name: preferred_foot, dtype: int64
mode label is right
values after change
 right    4805
left     1553
Name: preferred_foot, dtype: int64
values before change
 medium    4159
high      1797
low        314
None        88
Name: attacking_work_rate, dtype: int64
mode label is medium
values after change
 medium    4247
high      1797
low        314
Name: attacking_work_rate, dtype: int64
values before change
 medium    4520
high      1134
low        616
1           23
2           15
7           10
0           10
5            9
6            7
3            6
9            5
4            3
Name: defensive_work_rate, dtype: int64
mode label is medium
values after change
 medium    4608
high      1134
low        616
Name: defensive_work_rate, dtype: int64
medium    4608
high      1134
low        616
Name: defensive_work_rate, dtype: int64


### Numerical Variables

In [38]:
# Check if all are float
id_column = "player_fifa_api_id"
numerical_columns = list(set(df.columns) - set(categorial_columns) - set([id_column]))
print(df[numerical_columns].dtypes)
display(df[numerical_columns].head(3))

curve                 float64
volleys               float64
ball_control          float64
interceptions         float64
player_positions       object
short_passing         float64
gk_positioning        float64
strength              float64
gk_diving             float64
gk_reflexes           float64
sliding_tackle        float64
finishing             float64
weight                  int64
date                   object
long_passing          float64
sprint_speed          float64
aggression            float64
gk_kicking            float64
id                      int64
gk_handling           float64
overall_rating        float64
player_api_id           int64
reactions             float64
stamina               float64
potential             float64
balance               float64
agility               float64
shot_power            float64
crossing              float64
jumping               float64
player_name            object
marking               float64
long_shots            float64
heading_ac

Unnamed: 0,curve,volleys,ball_control,interceptions,player_positions,short_passing,gk_positioning,strength,gk_diving,gk_reflexes,...,long_shots,heading_accuracy,standing_tackle,height,free_kick_accuracy,positioning,dribbling,penalties,acceleration,vision
0,60.0,36.0,62.0,78.0,CB,71.0,14.0,85.0,7.0,15.0,...,66.0,77.0,80.0,177.8,73.0,46.0,53.0,57.0,67.0,57.0
1,54.0,43.0,67.0,56.0,"CDM, CM, CAM",71.0,10.0,91.0,6.0,13.0,...,52.0,63.0,65.0,175.26,47.0,61.0,56.0,64.0,72.0,68.0
2,14.0,14.0,18.0,13.0,GK,23.0,52.0,75.0,58.0,62.0,...,12.0,14.0,15.0,190.5,14.0,15.0,15.0,23.0,39.0,18.0


In [39]:
column_check = df[numerical_columns].isna().sum()
print(column_check)

curve                 0
volleys               0
ball_control          0
interceptions         0
player_positions      0
short_passing         0
gk_positioning        0
strength              0
gk_diving             0
gk_reflexes           0
sliding_tackle        0
finishing             0
weight                0
date                  0
long_passing          0
sprint_speed          0
aggression            0
gk_kicking            0
id                    0
gk_handling           0
overall_rating        0
player_api_id         0
reactions             0
stamina               0
potential             0
balance               0
agility               0
shot_power            0
crossing              0
jumping               0
player_name           0
marking               0
long_shots            0
heading_accuracy      0
standing_tackle       0
height                0
free_kick_accuracy    0
positioning           0
dribbling             0
penalties             0
acceleration          0
vision          

In [40]:
# Confirm that the 836/2713 missing values belong to the same record

column_check = df[numerical_columns].isna().sum()
missing_columns_set1 = list(column_check[column_check == 836].index)
missing_columns_set2 = list(column_check[column_check == 2713].index)

df_set1 = df[missing_columns_set1]
df_set2 = df[missing_columns_set2]

df1_rows_na = df_set1[df_set1.isna().any(axis=1)]
df2_rows_na = df_set2[df_set2.isna().any(axis=1)]

print(len(df1_rows_na))
print(len(df2_rows_na))


0
0


In [41]:
#Full outer join
rows_to_drop = df1_rows_na.merge(df2_rows_na, left_index = True, right_index=True, how="outer")
print("number of rows to drop is", len(rows_to_drop))

number of rows to drop is 0


In [42]:
print("initial df length is ", len(df))
# df = df.drop(df1_rows_na.index)
# df = df.drop(df2_rows_na.index)
df = df.drop(rows_to_drop.index)
print("new df length is ", len(df))

#double check
column_check = df[numerical_columns].isna().sum()
print(column_check)

initial df length is  6358
new df length is  6358
curve                 0
volleys               0
ball_control          0
interceptions         0
player_positions      0
short_passing         0
gk_positioning        0
strength              0
gk_diving             0
gk_reflexes           0
sliding_tackle        0
finishing             0
weight                0
date                  0
long_passing          0
sprint_speed          0
aggression            0
gk_kicking            0
id                    0
gk_handling           0
overall_rating        0
player_api_id         0
reactions             0
stamina               0
potential             0
balance               0
agility               0
shot_power            0
crossing              0
jumping               0
player_name           0
marking               0
long_shots            0
heading_accuracy      0
standing_tackle       0
height                0
free_kick_accuracy    0
positioning           0
dribbling             0
penalties     

In [43]:
print(df['preferred_foot'].value_counts())

right    4805
left     1553
Name: preferred_foot, dtype: int64


## One-hot encode

In [44]:
#one hot encode everything!
def one_hot_encode(df, column_name):
    temp = df.copy()
    df = df.drop(column_name,axis = 1)
    df = df.join(pd.get_dummies(temp[column_name], prefix=column_name))
    return df
# df = one_hot_encode(df, col_name)

In [45]:
for col_name in categorial_columns:
    df = one_hot_encode(df, col_name)

## Final checks and saving

In [46]:
df.isna().sum()

id                            0
player_fifa_api_id            0
player_api_id                 0
date                          0
overall_rating                0
potential                     0
crossing                      0
finishing                     0
heading_accuracy              0
short_passing                 0
volleys                       0
dribbling                     0
curve                         0
free_kick_accuracy            0
long_passing                  0
ball_control                  0
acceleration                  0
sprint_speed                  0
agility                       0
reactions                     0
balance                       0
shot_power                    0
jumping                       0
stamina                       0
strength                      0
long_shots                    0
aggression                    0
interceptions                 0
positioning                   0
vision                        0
penalties                     0
marking 

In [47]:
df.dtypes

id                              int64
player_fifa_api_id              int64
player_api_id                   int64
date                           object
overall_rating                float64
potential                     float64
crossing                      float64
finishing                     float64
heading_accuracy              float64
short_passing                 float64
volleys                       float64
dribbling                     float64
curve                         float64
free_kick_accuracy            float64
long_passing                  float64
ball_control                  float64
acceleration                  float64
sprint_speed                  float64
agility                       float64
reactions                     float64
balance                       float64
shot_power                    float64
jumping                       float64
stamina                       float64
strength                      float64
long_shots                    float64
aggression  

In [48]:
csv_path = "datasets/cleaned_soccer_data_2016_v2.csv"
df.to_csv(csv_path, index=False)

## Testing

In [49]:
# #Find KNN for replacement
# def replaceNA(df, cat_name):

#     numerical_df = df.drop(columns=['player_fifa_api_id', 'preferred_foot', 'attacking_work_rate', 'defensive_work_rate'])
#     print(numerical_df.isna().sum())
# #         knn = NearestNeighbors(n_neighbors=5)
# #     knn.fit(df.drop(columns=['player_fifa_api_id', 'preferred_foot', 'attacking_work_rate', 'defensive_work_rate']))
# #     knn.kneighbors(X[0], return_distance=False)

# replaceNA(df, 'attacking_work_rate')

In [50]:
df_test = pd.DataFrame([[1, True, 'dsfasd', 51.314],
                   [51, False, '56345', 56.1234]],
                  columns=['col1', 'col2', 'col3', 'col4'])

res = df_test.dtypes

print(res)

col1      int64
col2       bool
col3     object
col4    float64
dtype: object
