# About

In this lab, we will reinforce the foundations of adding new columns to our dataset and sorting data.

# Step 1: Set up Python for data analysis

In [1]:
import numpy as np
import pandas as pd

# Step 2:  Bring in the dataset


In [2]:
# Read the FIFA 18 player CSV from its public URL into a DataFrame named fifa
url = "https://public.tableau.com/s/sites/default/files/media/fifa18_clean.csv"
fifa = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Preview the first five rows to get a feel for the columns
fifa.head(n=5)

Unnamed: 0,Wage (€),Value (€),Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
0,565000,95500000,Cristiano Ronaldo,32,https://cdn.sofifa.org/48/18/players/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Real Madrid CF,...,61.0,53.0,82.0,62.0,91.0,89.0,92.0,91.0,66.0,92.0
1,565000,105000000,L. Messi,30,https://cdn.sofifa.org/48/18/players/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,93,93,FC Barcelona,...,57.0,45.0,84.0,59.0,92.0,90.0,88.0,91.0,62.0,88.0
2,280000,123000000,Neymar,25,https://cdn.sofifa.org/48/18/players/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,94,Paris Saint-Germain,...,59.0,46.0,79.0,59.0,88.0,87.0,84.0,89.0,64.0,84.0
3,510000,97000000,L. Suárez,30,https://cdn.sofifa.org/48/18/players/176580.png,Uruguay,https://cdn.sofifa.org/flags/60.png,92,92,FC Barcelona,...,64.0,58.0,80.0,65.0,88.0,85.0,88.0,87.0,68.0,88.0
4,230000,61000000,M. Neuer,31,https://cdn.sofifa.org/48/18/players/167495.png,Germany,https://cdn.sofifa.org/flags/21.png,92,92,FC Bayern Munich,...,,,,,,,,,,


# Step 3: Remove rows

In [11]:
## Drop every row that has at least one missing value,
## rebinding the cleaned result to the same name, fifa
fifa = fifa.dropna(axis="index", how="any")

In [13]:
# Sanity check: after the dropna step above, no missing values should remain in fifa
assert not fifa.isna().any().any(), "fifa still contains missing values"

# Step 4: Let's keep a few columns

> Save the new dataset as `fifa_final`

In [48]:
## Carry forward only Name, Age, Wage (€), Overall, Agility, Crossing, Curve and Finishing.
## .copy() makes fifa_final an independent frame before new columns are added to it.
fifa_final = fifa.loc[
    :, ["Name", "Age", "Wage (€)", "Overall", "Agility", "Crossing", "Curve", "Finishing"]
].copy()
                  
                  
                  

In [49]:
# Confirm that only the selected columns remain
fifa_final.head(n=5)

Unnamed: 0,Name,Age,Wage (€),Overall,Agility,Crossing,Curve,Finishing
0,Cristiano Ronaldo,32,565000,94,89.0,85.0,81.0,94.0
1,L. Messi,30,565000,93,90.0,77.0,89.0,95.0
2,Neymar,25,280000,92,96.0,75.0,81.0,89.0
3,L. Suárez,30,510000,92,86.0,77.0,86.0,94.0
5,R. Lewandowski,28,355000,91,78.0,62.0,77.0,91.0


# Step 5:  Calculate 2 columns

In [51]:
## New column `diff`: each player's Overall rating minus their Finishing rating
fifa_final['diff'] = fifa_final['Overall'].sub(fifa_final['Finishing'])

In [53]:
# New column `age_diff`: each player's Age minus the mean Age across fifa_final
fifa_final['age_diff'] = fifa_final['Age'].sub(fifa_final['Age'].mean())

In [55]:
# Check the two newly calculated columns, diff and age_diff
fifa_final.head(n=5)

Unnamed: 0,Name,Age,Wage (€),Overall,Agility,Crossing,Curve,Finishing,diff,age_diff
0,Cristiano Ronaldo,32,565000,94,89.0,85.0,81.0,94.0,0.0,6.996699
1,L. Messi,30,565000,93,90.0,77.0,89.0,95.0,-2.0,4.996699
2,Neymar,25,280000,92,96.0,75.0,81.0,89.0,3.0,-0.003301
3,L. Suárez,30,510000,92,86.0,77.0,86.0,94.0,-2.0,4.996699
5,R. Lewandowski,28,355000,91,78.0,62.0,77.0,91.0,0.0,2.996699


# Step 6:  Sort the dataset

In [60]:
# Order the players from oldest to youngest, then show the ten rows at the top
fifa_final = fifa_final.sort_values(by='Age', ascending=False)
fifa_final.head(n=10)

Unnamed: 0,Name,Age,Wage (€),Overall,Agility,Crossing,Curve,Finishing,diff,age_diff
7727,K. Wæhler,41,2000,67,33.0,38.0,23.0,25.0,42.0,15.996699
3309,M. Candelo,40,2000,72,70.0,76.0,76.0,60.0,12.0,14.996699
3148,T. Simons,40,14000,73,31.0,53.0,46.0,54.0,19.0,14.996699
13863,M. Brown,40,2000,61,37.0,51.0,45.0,55.0,6.0,14.996699
2224,B. Nivet,40,16000,74,60.0,69.0,73.0,73.0,1.0,14.996699
16749,D. Mulcahy,39,1000,55,38.0,52.0,48.0,35.0,20.0,13.996699
9201,F. Kippe,39,1000,66,46.0,33.0,53.0,46.0,20.0,13.996699
9184,W. Díaz,39,1000,66,66.0,56.0,27.0,19.0,47.0,13.996699
853,Hilton,39,18000,78,60.0,60.0,42.0,45.0,33.0,13.996699
7968,Y. Nakazawa,39,4000,67,37.0,60.0,51.0,40.0,27.0,13.996699


# Step 7: Sort the dataset 2

In [61]:
# Order by Crossing, breaking ties with Curve, both lowest -> highest;
# the sorted result is stored back in fifa_final
fifa_final = fifa_final.sort_values(by=['Crossing', 'Curve'], ascending=True)

In [62]:
# Show the first 7 rows of the re-sorted dataset
fifa_final.head(n=7)

Unnamed: 0,Name,Age,Wage (€),Overall,Agility,Crossing,Curve,Finishing,diff,age_diff
6085,M. Pellizzer,28,2000,69,71.0,11.0,11.0,13.0,56.0,2.996699
10010,E. Lancini,23,1000,65,56.0,11.0,16.0,14.0,51.0,-2.003301
11205,A. Camigliano,22,1000,64,34.0,11.0,16.0,13.0,51.0,-3.003301
1464,S. De Maio,30,50000,76,49.0,11.0,23.0,20.0,56.0,4.996699
16710,B. Clayton,19,4000,55,60.0,11.0,32.0,57.0,-2.0,-6.003301
5770,F. Poli,28,3000,69,73.0,12.0,11.0,13.0,56.0,2.996699
8206,M. Bani,23,8000,67,48.0,12.0,13.0,12.0,55.0,-2.003301
