# Ex2 - Filtering and Sorting Data

This time we are going to pull data directly from the internet.

### Step 1. Import the necessary libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

spark = SparkSession.builder.appName("Euro12").getOrCreate()

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/02_Filtering_%26_Sorting/Euro12/Euro_2012_stats_TEAM.csv). 

### Step 3. Assign it to a variable called euro12.

In [3]:
url = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/02_Filtering_%26_Sorting/Euro12/Euro_2012_stats_TEAM.csv'
from pyspark import SparkFiles
spark.sparkContext.addFile(url)

euro12 = spark.read.csv(SparkFiles.get("Euro_2012_stats_TEAM.csv"), sep=',', header=True, inferSchema= True)

In [52]:
euro12 = euro12.withColumnRenamed('Total shots (inc. Blocked)', 'Total shots')

In [53]:
euro12.printSchema()

root
 |-- Team: string (nullable = true)
 |-- Goals: integer (nullable = true)
 |-- Shots on target: integer (nullable = true)
 |-- Shots off target: integer (nullable = true)
 |-- Shooting Accuracy: string (nullable = true)
 |-- % Goals-to-shots: string (nullable = true)
 |-- Total shots: integer (nullable = true)
 |-- Hit Woodwork: integer (nullable = true)
 |-- Penalty goals: integer (nullable = true)
 |-- Penalties not scored: integer (nullable = true)
 |-- Headed goals: integer (nullable = true)
 |-- Passes: integer (nullable = true)
 |-- Passes completed: integer (nullable = true)
 |-- Passing Accuracy: string (nullable = true)
 |-- Touches: integer (nullable = true)
 |-- Crosses: integer (nullable = true)
 |-- Dribbles: integer (nullable = true)
 |-- Corners Taken: integer (nullable = true)
 |-- Tackles: integer (nullable = true)
 |-- Clearances: integer (nullable = true)
 |-- Interceptions: integer (nullable = true)
 |-- Clearances off line: integer (nullable = true)
 |-- Clean

### Step 4. Select only the Goal column.

In [5]:
euro12.select(F.col('Goals')).take(10)

[Row(Goals=4),
 Row(Goals=4),
 Row(Goals=4),
 Row(Goals=5),
 Row(Goals=3),
 Row(Goals=10),
 Row(Goals=5),
 Row(Goals=6),
 Row(Goals=2),
 Row(Goals=2)]

### Step 5. How many team participated in the Euro2012?

In [7]:
euro12.count()

16

### Step 6. What is the number of columns in the dataset?

In [10]:
len(euro12.columns)

35

### Step 7. View only the columns Team, Yellow Cards and Red Cards and assign them to a dataframe called discipline

In [11]:
discipline = euro12.select(F.col('Team'), F.col('Yellow Cards'), F.col('Red Cards'))

In [12]:
discipline.take(10)

[Row(Team='Croatia', Yellow Cards=9, Red Cards=0),
 Row(Team='Czech Republic', Yellow Cards=7, Red Cards=0),
 Row(Team='Denmark', Yellow Cards=4, Red Cards=0),
 Row(Team='England', Yellow Cards=5, Red Cards=0),
 Row(Team='France', Yellow Cards=6, Red Cards=0),
 Row(Team='Germany', Yellow Cards=4, Red Cards=0),
 Row(Team='Greece', Yellow Cards=9, Red Cards=1),
 Row(Team='Italy', Yellow Cards=16, Red Cards=0),
 Row(Team='Netherlands', Yellow Cards=5, Red Cards=0),
 Row(Team='Poland', Yellow Cards=7, Red Cards=1)]

### Step 8. Sort the teams by Red Cards, then to Yellow Cards

In [14]:
discipline.orderBy(['Red Cards', 'Yellow Cards'], ascending=False).take(10)

[Row(Team='Greece', Yellow Cards=9, Red Cards=1),
 Row(Team='Poland', Yellow Cards=7, Red Cards=1),
 Row(Team='Republic of Ireland', Yellow Cards=6, Red Cards=1),
 Row(Team='Italy', Yellow Cards=16, Red Cards=0),
 Row(Team='Portugal', Yellow Cards=12, Red Cards=0),
 Row(Team='Spain', Yellow Cards=11, Red Cards=0),
 Row(Team='Croatia', Yellow Cards=9, Red Cards=0),
 Row(Team='Czech Republic', Yellow Cards=7, Red Cards=0),
 Row(Team='Sweden', Yellow Cards=7, Red Cards=0),
 Row(Team='France', Yellow Cards=6, Red Cards=0)]

### Step 9. Calculate the mean Yellow Cards given per Team

In [19]:
discipline.agg({'Yellow Cards': 'mean'}).show()

+-----------------+
|avg(Yellow Cards)|
+-----------------+
|           7.4375|
+-----------------+



### Step 10. Filter teams that scored more than 6 goals

In [21]:
euro12.select('Team').where(F.col('Goals') > 6).take(10)

[Row(Team='Germany'), Row(Team='Spain')]

### Step 11. Select the teams that start with G

In [23]:
euro12.select('Team').where(F.col('Team').startswith('G')).take(10)

[Row(Team='Germany'), Row(Team='Greece')]

### Step 12. Select the first 7 columns

In [24]:
euro12.head(7)

[Row(Team='Croatia', Goals=4, Shots on target=13, Shots off target=12, Shooting Accuracy='51.9%', % Goals-to-shots='16.0%', Total shots (inc. Blocked)=32, Hit Woodwork=0, Penalty goals=0, Penalties not scored=0, Headed goals=2, Passes=1076, Passes completed=828, Passing Accuracy='76.9%', Touches=1706, Crosses=60, Dribbles=42, Corners Taken=14, Tackles=49, Clearances=83, Interceptions=56, Clearances off line=None, Clean Sheets=0, Blocks=10, Goals conceded=3, Saves made=13, Saves-to-shots ratio='81.3%', Fouls Won=41, Fouls Conceded=62, Offsides=2, Yellow Cards=9, Red Cards=0, Subs on=9, Subs off=9, Players Used=16),
 Row(Team='Czech Republic', Goals=4, Shots on target=13, Shots off target=18, Shooting Accuracy='41.9%', % Goals-to-shots='12.9%', Total shots (inc. Blocked)=39, Hit Woodwork=0, Penalty goals=0, Penalties not scored=0, Headed goals=0, Passes=1565, Passes completed=1223, Passing Accuracy='78.1%', Touches=2358, Crosses=46, Dribbles=68, Corners Taken=21, Tackles=62, Clearances=9

### Step 13. Select all columns except the last 3.

In [57]:
euro12.select(euro12.columns[:-3])

DataFrame[Team: string, Goals: int, Shots on target: int, Shots off target: int, Shooting Accuracy: string, % Goals-to-shots: string, Total shots: int, Hit Woodwork: int, Penalty goals: int, Penalties not scored: int, Headed goals: int, Passes: int, Passes completed: int, Passing Accuracy: string, Touches: int, Crosses: int, Dribbles: int, Corners Taken: int, Tackles: int, Clearances: int, Interceptions: int, Clearances off line: int, Clean Sheets: int, Blocks: int, Goals conceded: int, Saves made: int, Saves-to-shots ratio: string, Fouls Won: int, Fouls Conceded: int, Offsides: int, Yellow Cards: int, Red Cards: int]

### Step 14. Present only the Shooting Accuracy from England, Italy and Russia

In [60]:
euro12.select('Team', 'Shooting Accuracy').where(F.col('Team').isin(['England', 'Italy', 'Russia'])).show()

+-------+-----------------+
|   Team|Shooting Accuracy|
+-------+-----------------+
|England|            50.0%|
|  Italy|            43.0%|
| Russia|            22.5%|
+-------+-----------------+

