# Search and Filter DataFrames

In [50]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [10]:
# Get the already-running session if there is one, otherwise create one
# registered under the application name "SearchFilter".
spark = (
    SparkSession.builder
    .appName("SearchFilter")
    .getOrCreate()
)
spark  # last expression, so the session is rendered as the cell output

## About this dataframe

The **fifa19.csv** dataset includes a list of all the FIFA 2019 players and their attributes listed below: 

 - **General:** Age, Nationality, Overall, Potential, Club
 - **Metrics:** Value, Wage
 - **Player Descriptive:** Preferred Foot, International Reputation, Weak Foot, Skill Moves, Work Rate, Position, Jersey Number, Joined, Loaned From, Contract Valid Until, Height, Weight
 - **Position:** LS, ST, RS, LW, LF, CF, RF, RW, LAM, CAM, RAM, LM, LCM, CM, RCM, RM, LWB, LDM, CDM, RDM, RWB, LB, LCB, CB, RCB, RB
 - **Other:** Crossing, Finishing, HeadingAccuracy, ShortPassing, Volleys, Dribbling, Curve, FKAccuracy, LongPassing, BallControl, Acceleration, SprintSpeed, Agility, Reactions, Balance, ShotPower, Jumping, Stamina, Strength, LongShots, Aggression, Interceptions, Positioning, Vision, Penalties, Composure, Marking, StandingTackle, SlidingTackle, GKDiving, GKHandling, GKKicking, GKPositioning, GKReflexes, and Release Clause.

**Source:** https://www.kaggle.com/karangadiya/fifa19

In [4]:
# Directory that holds the datasets; the CSV file name is appended below.
path = '/user/harishmohan/Datasets/'

# header=True takes column names from the first row of the file;
# inferSchema=True lets Spark derive column types from the data.
fifa = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(path + 'fifa19.csv')
)

In [6]:
# Convert only the first five rows to pandas so the preview renders as a table.
(
    fifa
    .limit(5)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88,68,58,51,15,13,5,10,13,€196.4M


In [9]:
# Print the DataFrame's columns, their types and nullability as a tree.
fifa.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Photo: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Flag: string (nullable = true)
 |-- Overall: integer (nullable = true)
 |-- Potential: integer (nullable = true)
 |-- Club: string (nullable = true)
 |-- Club Logo: string (nullable = true)
 |-- Value: string (nullable = true)
 |-- Wage: string (nullable = true)
 |-- Special: integer (nullable = true)
 |-- Preferred Foot: string (nullable = true)
 |-- International Reputation: integer (nullable = true)
 |-- Weak Foot: integer (nullable = true)
 |-- Skill Moves: integer (nullable = true)
 |-- Work Rate: string (nullable = true)
 |-- Body Type: string (nullable = true)
 |-- Real Face: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Jersey Number: integer (nullable = true)
 |-- Joined: string (nullable = true)
 |-- Loaned From: string (nu

In [37]:
# Project four columns and print five rows; truncate=False keeps the long
# Photo URLs from being cut off.
(
    fifa
    .select("Nationality", "Name", "Age", "Photo")
    .show(5, truncate=False)
)

+-----------+-----------------+---+----------------------------------------------+
|Nationality|Name             |Age|Photo                                         |
+-----------+-----------------+---+----------------------------------------------+
|Argentina  |L. Messi         |31 |https://cdn.sofifa.org/players/4/19/158023.png|
|Portugal   |Cristiano Ronaldo|33 |https://cdn.sofifa.org/players/4/19/20801.png |
|Brazil     |Neymar Jr        |26 |https://cdn.sofifa.org/players/4/19/190871.png|
|Spain      |De Gea           |27 |https://cdn.sofifa.org/players/4/19/193080.png|
|Belgium    |K. De Bruyne     |27 |https://cdn.sofifa.org/players/4/19/192985.png|
+-----------+-----------------+---+----------------------------------------------+
only showing top 5 rows



In [36]:
# Sort by Age in the default (ascending) order and show the first five rows.
(
    fifa
    .select("Name", "Age")
    .orderBy("Age")
    .show(5)
)

+------------+---+
|        Name|Age|
+------------+---+
|   B. Nygren| 16|
|H. Andersson| 16|
|    A. Doğan| 16|
|  C. Bassett| 16|
|    B. Mumba| 16|
+------------+---+
only showing top 5 rows



In [17]:
# Same projection as the ascending sort, but with Age in descending order.
(
    fifa
    .select("Name", "Age")
    .orderBy("Age", ascending=False)
    .show(5)
)

+-------------+---+
|         Name|Age|
+-------------+---+
|     O. Pérez| 45|
|K. Pilkington| 44|
|    T. Warner| 44|
|  S. Narazaki| 42|
|     C. Muñoz| 41|
+-------------+---+
only showing top 5 rows



In [22]:
# SQL LIKE match: "%" on both sides means "Barcelona" may appear anywhere
# in the club name.
(
    fifa
    .select("Name", "Club")
    .where(fifa["Club"].like("%Barcelona%"))
    .show(5, truncate=False)
)

+---------------+------------+
|Name           |Club        |
+---------------+------------+
|L. Messi       |FC Barcelona|
|L. Suárez      |FC Barcelona|
|M. ter Stegen  |FC Barcelona|
|Sergio Busquets|FC Barcelona|
|Coutinho       |FC Barcelona|
+---------------+------------+
only showing top 5 rows



In [33]:
# A negative start position counts from the end of the string, so
# substr(-4, 4) yields the last four characters of each Photo URL.
(
    fifa
    .select("Photo", fifa["Photo"].substr(-4, 4))
    .show(5, truncate=False)
)

+----------------------------------------------+-----------------------+
|Photo                                         |substring(Photo, -4, 4)|
+----------------------------------------------+-----------------------+
|https://cdn.sofifa.org/players/4/19/158023.png|.png                   |
|https://cdn.sofifa.org/players/4/19/20801.png |.png                   |
|https://cdn.sofifa.org/players/4/19/190871.png|.png                   |
|https://cdn.sofifa.org/players/4/19/193080.png|.png                   |
|https://cdn.sofifa.org/players/4/19/192985.png|.png                   |
+----------------------------------------------+-----------------------+
only showing top 5 rows



In [34]:
# Keep rows whose Club is one of the listed values, then preview four of them.
(
    fifa
    .filter(fifa["Club"].isin("FC Barcelona", "Juventus"))
    .limit(4)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,7,176580,L. Suárez,31,https://cdn.sofifa.org/players/4/19/176580.png,Uruguay,https://cdn.sofifa.org/flags/60.png,91,91,FC Barcelona,...,85,62,45,38,27,25,31,33,37,€164M
3,15,211110,P. Dybala,24,https://cdn.sofifa.org/players/4/19/211110.png,Argentina,https://cdn.sofifa.org/flags/52.png,89,94,Juventus,...,84,23,20,20,5,4,4,5,8,€153.5M


In [35]:
# A name qualifies only if it both starts with "L" and ends with "i".
name_starts_with_l = fifa["Name"].startswith("L")
name_ends_with_i = fifa["Name"].endswith("i")

(
    fifa
    .select("Name", "Club")
    .where(name_starts_with_l & name_ends_with_i)
    .limit(4)
    .toPandas()
)

Unnamed: 0,Name,Club
0,L. Messi,FC Barcelona
1,L. Bonucci,Juventus
2,L. Fabiański,West Ham United
3,L. Pellegrini,Roma


In [38]:
# Number of rows in the full DataFrame.
fifa.count()

18207

In [39]:
# Cap the DataFrame at 100 rows, then count the rows of the smaller frame.
df = fifa.limit(100)
df.count()

100

In [40]:
# Take the first five column names and build a DataFrame with only those columns.
col_list = fifa.columns[:5]
df3 = fifa.select(*col_list)

# show() with no arguments prints up to 20 rows and truncates long strings.
df3.show()

+---+------+-----------------+---+--------------------+
|_c0|    ID|             Name|Age|               Photo|
+---+------+-----------------+---+--------------------+
|  0|158023|         L. Messi| 31|https://cdn.sofif...|
|  1| 20801|Cristiano Ronaldo| 33|https://cdn.sofif...|
|  2|190871|        Neymar Jr| 26|https://cdn.sofif...|
|  3|193080|           De Gea| 27|https://cdn.sofif...|
|  4|192985|     K. De Bruyne| 27|https://cdn.sofif...|
|  5|183277|        E. Hazard| 27|https://cdn.sofif...|
|  6|177003|        L. Modrić| 32|https://cdn.sofif...|
|  7|176580|        L. Suárez| 31|https://cdn.sofif...|
|  8|155862|     Sergio Ramos| 32|https://cdn.sofif...|
|  9|200389|         J. Oblak| 25|https://cdn.sofif...|
| 10|188545|   R. Lewandowski| 29|https://cdn.sofif...|
| 11|182521|         T. Kroos| 28|https://cdn.sofif...|
| 12|182493|         D. Godín| 32|https://cdn.sofif...|
| 13|168542|      David Silva| 32|https://cdn.sofif...|
| 14|215914|         N. Kanté| 27|https://cdn.so

In [41]:
# Number of columns in df3.
len(df3.columns)

5

In [53]:
# The filter condition is written as a SQL expression string.
(
    fifa
    .filter("Overall>50")
    .limit(5)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88,68,58,51,15,13,5,10,13,€196.4M


In [61]:
# Players rated below 50, reduced to name and age.
(
    fifa
    .filter("Overall<50")
    .select("Name", "Age")
    .limit(5)
    .toPandas()
)

Unnamed: 0,Name,Age
0,D. Collins,17
1,J. Egan,19
2,Xie Xiaofan,20
3,B. Buckley,17
4,G. Figliuzzi,17


In [62]:
# Chained filters are ANDed together: Overall must be above 50 and below 70
# (both bounds exclusive).
(
    fifa
    .filter("Overall>50")
    .filter("Overall<70")
    .select("Name", "Age", "Overall")
    .limit(5)
    .toPandas()
)

Unnamed: 0,Name,Age,Overall
0,R. Abdullah,24,69
1,M. Ordóñez,28,69
2,L. Robertone,21,69
3,I. Mathew,21,69
4,P. Velthuizen,31,69


In [64]:
# Collect every player rated above 50 into a local list of Rows,
# ordered from highest to lowest Overall.
result = (
    fifa
    .filter("Overall>50")
    .select("Nationality", "Name", "Age", "Overall")
    .orderBy(fifa["Overall"].desc())
    .collect()
)

In [67]:
# Inspect the type of one element of the collected list.
type(result[0])

pyspark.sql.types.Row

In [69]:
# result is sorted by Overall descending, so the first Row is the top-rated
# player and the last Row the lowest-rated one. "Name" is the second field
# of each collected Row, read here by name instead of by position.
print("Best Player Over 50: ", result[0]["Name"])
print("Worst Player Over 50: ", result[-1]["Name"])

Best Player Over 50:  L. Messi
Worst Player Over 50:  C. Addai


In [73]:
# Show name, position and release clause without truncating the values.
(
    fifa
    .select("Name", "Position", "Release Clause")
    .show(5, truncate=False)
)

+-----------------+--------+--------------+
|Name             |Position|Release Clause|
+-----------------+--------+--------------+
|L. Messi         |RF      |€226.5M       |
|Cristiano Ronaldo|ST      |€127.1M       |
|Neymar Jr        |LW      |€228.1M       |
|De Gea           |GK      |€138.6M       |
|K. De Bruyne     |RCM     |€196.4M       |
+-----------------+--------+--------------+
only showing top 5 rows



In [74]:
# First five players in ascending order of Name.
(
    fifa
    .select("Name", "Position")
    .orderBy('Name')
    .show(5)
)

+-------------+--------+
|         Name|Position|
+-------------+--------+
|     A. Abang|      ST|
|A. Abdellaoui|      LB|
| A. Abdennour|      CB|
|      A. Abdi|      CM|
|A. Abdu Jaber|      ST|
+-------------+--------+
only showing top 5 rows



In [77]:
# LIKE pattern with a trailing "%" only: the club name must begin with "FC".
(
    fifa
    .select("Name", "Club")
    .where(fifa["Club"].like("FC%"))
    .show(5, truncate=False)
)

+---------------+-----------------+
|Name           |Club             |
+---------------+-----------------+
|L. Messi       |FC Barcelona     |
|L. Suárez      |FC Barcelona     |
|R. Lewandowski |FC Bayern München|
|M. ter Stegen  |FC Barcelona     |
|Sergio Busquets|FC Barcelona     |
+---------------+-----------------+
only showing top 5 rows



In [81]:
# Prefix test on Club expressed with startswith instead of a LIKE pattern.
(
    fifa
    .select("Name", "Club")
    .where(fifa["Club"].startswith("FC"))
    .limit(4)
    .toPandas()
)

Unnamed: 0,Name,Club
0,L. Messi,FC Barcelona
1,L. Suárez,FC Barcelona
2,R. Lewandowski,FC Bayern München
3,M. ter Stegen,FC Barcelona


In [85]:
# Sort by Age descending and print only the first row.
(
    fifa
    .select("Name", "Age")
    .orderBy(fifa["Age"].desc())
    .show(1)
)

+--------+---+
|    Name|Age|
+--------+---+
|O. Pérez| 45|
+--------+---+
only showing top 1 row



In [86]:
# Keep rows whose Name exactly matches one of the listed players.
(
    fifa
    .filter(fifa["Name"].isin("L. Messi", "Cristiano Ronaldo"))
    .limit(4)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M


In [88]:
# substr(1, 1) takes one character starting at position 1, i.e. the first
# character of each Release Clause value.
(
    fifa
    .select("Release Clause", fifa["Release Clause"].substr(1, 1))
    .show(5, truncate=False)
)

+--------------+-------------------------------+
|Release Clause|substring(Release Clause, 1, 1)|
+--------------+-------------------------------+
|€226.5M       |€                              |
|€127.1M       |€                              |
|€228.1M       |€                              |
|€138.6M       |€                              |
|€196.4M       |€                              |
+--------------+-------------------------------+
only showing top 5 rows



In [89]:
# Preview up to four players older than 40.
(
    fifa
    .filter("Age>40")
    .limit(4)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,1120,156092,J. Villar,41,https://cdn.sofifa.org/players/4/19/156092.png,Paraguay,https://cdn.sofifa.org/flags/58.png,77,77,,...,55,13,13,14,75,75,74,78,77,
1,4228,3665,B. Nivet,41,https://cdn.sofifa.org/players/4/19/3665.png,France,https://cdn.sofifa.org/flags/18.png,71,71,ESTAC Troyes,...,82,58,56,43,11,7,8,14,7,
2,4741,140029,O. Pérez,45,https://cdn.sofifa.org/players/4/19/140029.png,Mexico,https://cdn.sofifa.org/flags/83.png,71,71,Pachuca,...,62,23,12,11,70,64,65,73,74,€272K
3,7225,142998,C. Muñoz,41,https://cdn.sofifa.org/players/4/19/142998.png,Argentina,https://cdn.sofifa.org/flags/52.png,68,68,CD Universidad de Concepción,...,62,18,14,19,67,65,68,71,68,€84K
