In [10]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

import sqlite3

from IPython.display import display, HTML
pd.set_option('display.max_rows', 500)
#REferences: https://towardsdatascience.com/data-handling-using-pandas-cleaning-and-processing-3aa657dc9418
#Dataset: https://www.kaggle.com/hugomathien/soccer?

In [131]:
with sqlite3.connect('database.sqlite') as con:
    countries = pd.read_sql_query("SELECT * from Country", con)
    matches = pd.read_sql_query("SELECT * from Match", con)
    leagues = pd.read_sql_query("SELECT * from League", con)
    teams = pd.read_sql_query("SELECT * from Team", con)
    player = pd.read_sql_query("SELECT * from Player",con)
    player_attributes = pd.read_sql_query("SELECT * from Player_Attributes",con)
    sequence = pd.read_sql_query("SELECT * from sqlite_sequence",con)
    team_attributes = pd.read_sql_query("SELECT * from Team_Attributes",con)

In [177]:
df = player_attributes
df.head(5)
print(len(df))

183978


## Remove unecessary columns

In [178]:
#df = df.drop(columns = ['id', 'player_api_id', 'date'])

In [4]:
df = pd.read_csv('datasets/shawn_processed.csv')

## Data overview

In [5]:
#missing Values
print(df.isna().sum())

df_rows_with_na = df[df.isna().any(axis=1)]
print("percentage of rows with missing data is ", len(df_rows_with_na)*100//len(df), "%" )

display(df[df["volleys"].isna()])

id                    0
player_fifa_api_id    0
player_api_id         0
date                  0
overall_rating        0
                     ..
rcb                   0
rb                    0
player_name           0
height                0
weight                0
Length: 72, dtype: int64
percentage of rows with missing data is  0 %


Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,rdm,rwb,lb,lcb,cb,rcb,rb,player_name,height,weight


## Wrong Data Values

### Categorial Variables 

In [6]:
# Print all unique values
categorial_columns = ['preferred_foot', 'attacking_work_rate', 'defensive_work_rate']
for c in categorial_columns:
    print(c, df[c].unique())
    
# Come up with a set of possible values
pos_values_preferred_foot = ['right', 'left']
pos_values_attacking_work =['medium', 'high', 'low']
pos_values_defensive_work = ['medium', 'high', 'low']


preferred_foot ['right' 'left']
attacking_work_rate ['medium' 'high' 'low' 'None']
defensive_work_rate ['medium' 'high' '5' 'low' '1' '0' '2' '6' '3' '9' '4' '7']


In [7]:
# Note: there are None and "None" values
def analysePossValues(df, cat_name, pos_values):
    
    valid_rows = df.loc[df[cat_name].isin(pos_values)]
    invalid_rows = df.loc[df[cat_name].isin(pos_values) == False]
    print("percentage of invalid rows is", len(invalid_rows)*100//len(df), "%")
    
    #check distribution
    valid_rows[cat_name].value_counts().plot(kind='bar')

def subWithMode(df, cat_name, pos_values):
    print('values before change\n', df[cat_name].value_counts())
    
    valid_rows = df.loc[df[cat_name].isin(pos_values)]
    value_counts = valid_rows[cat_name].value_counts()
    mode_label = value_counts.idxmax()
    print('mode label is', mode_label)
    
    df.loc[df[cat_name].isin(pos_values) == False, cat_name] = mode_label
    print('values after change\n', df[cat_name].value_counts())
    return df
    
    
    


In [8]:
df = subWithMode(df,'preferred_foot' , pos_values_preferred_foot)
df = subWithMode(df,'attacking_work_rate' , pos_values_attacking_work)
df = subWithMode(df,'defensive_work_rate' , pos_values_defensive_work)
print(df['defensive_work_rate'].value_counts())

values before change
 right    4805
left     1553
Name: preferred_foot, dtype: int64
mode label is right
values after change
 right    4805
left     1553
Name: preferred_foot, dtype: int64
values before change
 medium    4159
high      1797
low        314
None        88
Name: attacking_work_rate, dtype: int64
mode label is medium
values after change
 medium    4247
high      1797
low        314
Name: attacking_work_rate, dtype: int64
values before change
 medium    4520
high      1134
low        616
1           23
2           15
7           10
0           10
5            9
6            7
3            6
9            5
4            3
Name: defensive_work_rate, dtype: int64
mode label is medium
values after change
 medium    4608
high      1134
low        616
Name: defensive_work_rate, dtype: int64
medium    4608
high      1134
low        616
Name: defensive_work_rate, dtype: int64


### Numerical Variables

In [11]:
# Check if all are float
id_column = "player_fifa_api_id"
numerical_columns = list(set(df.columns) - set(categorial_columns) - set([id_column]))
print(df[numerical_columns].dtypes)
display(df[numerical_columns].head(3))

heading_accuracy      float64
player_api_id           int64
long_passing          float64
rm                      int64
penalties             float64
rw                      int64
player_name            object
reactions             float64
strength              float64
marking               float64
gk_kicking            float64
cf                      int64
cam                     int64
lw                      int64
stamina               float64
finishing             float64
agility               float64
rwb                     int64
gk_positioning        float64
free_kick_accuracy    float64
cm                      int64
rcb                     int64
long_shots            float64
date                   object
rdm                     int64
curve                 float64
overall_rating        float64
lcm                     int64
sliding_tackle        float64
player_positions       object
lwb                     int64
rs                      int64
st                      int64
accelerati

Unnamed: 0,heading_accuracy,player_api_id,long_passing,rm,penalties,rw,player_name,reactions,strength,marking,...,gk_handling,height,rf,cdm,positioning,weight,volleys,rcm,id,ball_control
0,71.0,505942,64.0,53,48.0,51,Aaron Appindangoye,47.0,76.0,65.0,...,11.0,182.88,51,58,45.0,187,44.0,53,1,49.0
1,58.0,155782,68.0,69,59.0,67,Aaron Cresswell,67.0,56.0,76.0,...,7.0,170.18,65,70,60.0,146,40.0,67,6,71.0
2,68.0,30572,61.0,50,37.0,48,Aaron Galindo,57.0,90.0,72.0,...,12.0,182.88,47,65,26.0,198,48.0,56,65,62.0


In [12]:
column_check = df[numerical_columns].isna().sum()
print(column_check)

heading_accuracy      0
player_api_id         0
long_passing          0
rm                    0
penalties             0
rw                    0
player_name           0
reactions             0
strength              0
marking               0
gk_kicking            0
cf                    0
cam                   0
lw                    0
stamina               0
finishing             0
agility               0
rwb                   0
gk_positioning        0
free_kick_accuracy    0
cm                    0
rcb                   0
long_shots            0
date                  0
rdm                   0
curve                 0
overall_rating        0
lcm                   0
sliding_tackle        0
player_positions      0
lwb                   0
rs                    0
st                    0
acceleration          0
cb                    0
dribbling             0
crossing              0
lf                    0
shot_power            0
lm                    0
aggression            0
short_passing   

In [13]:
# Confirm that the 836/2713 missing values belong to the same record

column_check = df[numerical_columns].isna().sum()
missing_columns_set1 = list(column_check[column_check == 836].index)
missing_columns_set2 = list(column_check[column_check == 2713].index)

df_set1 = df[missing_columns_set1]
df_set2 = df[missing_columns_set2]

df1_rows_na = df_set1[df_set1.isna().any(axis=1)]
df2_rows_na = df_set2[df_set2.isna().any(axis=1)]

print(len(df1_rows_na))
print(len(df2_rows_na))


0
0


In [14]:
#Full outer join
rows_to_drop = df1_rows_na.merge(df2_rows_na, left_index = True, right_index=True, how="outer")
print("number of rows to drop is", len(rows_to_drop))

number of rows to drop is 0


In [15]:
print("initial df length is ", len(df))
# df = df.drop(df1_rows_na.index)
# df = df.drop(df2_rows_na.index)
df = df.drop(rows_to_drop.index)
print("new df length is ", len(df))

#double check
column_check = df[numerical_columns].isna().sum()
print(column_check)

initial df length is  6358
new df length is  6358
heading_accuracy      0
player_api_id         0
long_passing          0
rm                    0
penalties             0
rw                    0
player_name           0
reactions             0
strength              0
marking               0
gk_kicking            0
cf                    0
cam                   0
lw                    0
stamina               0
finishing             0
agility               0
rwb                   0
gk_positioning        0
free_kick_accuracy    0
cm                    0
rcb                   0
long_shots            0
date                  0
rdm                   0
curve                 0
overall_rating        0
lcm                   0
sliding_tackle        0
player_positions      0
lwb                   0
rs                    0
st                    0
acceleration          0
cb                    0
dribbling             0
crossing              0
lf                    0
shot_power            0
lm            

In [16]:
print(df['preferred_foot'].value_counts())

right    4805
left     1553
Name: preferred_foot, dtype: int64


## One-hot encode

In [17]:
#one hot encode everything!
def one_hot_encode(df, column_name):
    temp = df.copy()
    df = df.drop(column_name,axis = 1)
    df = df.join(pd.get_dummies(temp[column_name], prefix=column_name))
    return df
# df = one_hot_encode(df, col_name)

In [18]:
for col_name in categorial_columns:
    df = one_hot_encode(df, col_name)

## Final checks and saving

In [19]:
df.isna().sum()

id                            0
player_fifa_api_id            0
player_api_id                 0
date                          0
overall_rating                0
potential                     0
crossing                      0
finishing                     0
heading_accuracy              0
short_passing                 0
volleys                       0
dribbling                     0
curve                         0
free_kick_accuracy            0
long_passing                  0
ball_control                  0
acceleration                  0
sprint_speed                  0
agility                       0
reactions                     0
balance                       0
shot_power                    0
jumping                       0
stamina                       0
strength                      0
long_shots                    0
aggression                    0
interceptions                 0
positioning                   0
vision                        0
penalties                     0
marking 

In [25]:
df.columns

Index(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
       'potential', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes', 'player_positions', 'ls', 'st', 'rs', 'lw', 'lf', 'cf',
       'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb',
       'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb',
       'player_name', 'height', 'weight', 'preferred_foot_left',
       'preferred_foot_right', 'attacking_work_rate_high',
       'attacking_work_rate_low', 'attacking_work_rate_me

In [24]:
df.head(1).values

array([[1, 218353, 505942, '2016-02-18', 67.0, 71.0, 49.0, 44.0, 71.0,
        61.0, 44.0, 51.0, 45.0, 39.0, 64.0, 49.0, 60.0, 64.0, 59.0, 47.0,
        65.0, 55.0, 58.0, 54.0, 76.0, 35.0, 71.0, 70.0, 45.0, 54.0, 48.0,
        65.0, 69.0, 69.0, 6.0, 11.0, 10.0, 8.0, 8.0, 'CB', 53, 53, 53,
        51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 53, 53, 53, 56, 58, 58,
        58, 56, 58, 62, 62, 62, 58, 'Aaron Appindangoye', 182.88, 187, 0,
        1, 0, 0, 1, 0, 0, 1]], dtype=object)

In [21]:
csv_path = "datasets/cleaned_soccer_data_2016_v2.csv"
df.to_csv(csv_path)

## Testing

In [None]:
# #Find KNN for replacement
# def replaceNA(df, cat_name):

#     numerical_df = df.drop(columns=['player_fifa_api_id', 'preferred_foot', 'attacking_work_rate', 'defensive_work_rate'])
#     print(numerical_df.isna().sum())
# #         knn = NearestNeighbors(n_neighbors=5)
# #     knn.fit(df.drop(columns=['player_fifa_api_id', 'preferred_foot', 'attacking_work_rate', 'defensive_work_rate']))
# #     knn.kneighbors(X[0], return_distance=False)

# replaceNA(df, 'attacking_work_rate')

In [71]:
df_test = pd.DataFrame([[1, True, 'dsfasd', 51.314],
                   [51, False, '56345', 56.1234]],
                  columns=['col1', 'col2', 'col3', 'col4'])

res = df_test.dtypes

print(res)

col1      int64
col2       bool
col3     object
col4    float64
dtype: object
