In [None]:
"""
SQL Practice
Part 3: Soccer Data

"""

In [1]:
import pandas as pd
import sqlite3

conn = sqlite3.connect('database.sqlite')

query = "SELECT * FROM sqlite_master"

df_schema = pd.read_sql_query(query, conn)

df_schema.tbl_name.unique()

array(['sqlite_sequence', 'Player_Attributes', 'Player', 'Match',
       'League', 'Country', 'Team', 'Team_Attributes'], dtype=object)

#### 1. Which team scored the most points when playing at home?

In [2]:
query = """
SELECT home_team_api_id,team_long_name, SUM(home_team_goal) FROM(
    SELECT * FROM Match
    JOIN Team
    ON Match.home_team_api_id = Team.team_api_id)
GROUP BY home_team_api_id
ORDER BY SUM(home_team_goal) DESC
"""

df = pd.read_sql_query(query, conn)

In [3]:
df

Unnamed: 0,home_team_api_id,team_long_name,SUM(home_team_goal)
0,8633,Real Madrid CF,505
1,8634,FC Barcelona,495
2,9925,Celtic,389
3,9823,FC Bayern Munich,382
4,8640,PSV,370
...,...,...,...
294,108893,AC Arles-Avignon,14
295,7992,Trofense,14
296,4064,Feirense,13
297,7869,Córdoba CF,12


#### 2.Did this team also score the most points when playing away?

In [4]:
query = """
SELECT away_team_api_id,team_long_name, SUM(away_team_goal) FROM(
    SELECT * FROM Match
    JOIN Team
    ON Match.away_team_api_id = Team.team_api_id)
GROUP BY home_team_api_id
ORDER BY SUM(home_team_goal) DESC
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,away_team_api_id,team_long_name,SUM(away_team_goal)
0,9864,Málaga CF,147
1,10281,Real Valladolid,101
2,9800,St. Mirren,88
3,9790,Hamburger SV,96
4,9791,Heracles Almelo,126
...,...,...,...
294,9748,Olympique Lyonnais,31
295,7841,Rio Ave FC,21
296,10214,CD Nacional,19
297,9783,RC Deportivo de La Coruña,33


In [5]:
# Yes!

#### 3. How many matches resulted in a tie?

In [6]:
query = """
SELECT COUNT(*)
FROM Match
WHERE home_team_goal = away_team_goal
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,COUNT(*)
0,6596


#### 4. How many players have Smith for their last name? How many have 'smith' anywhere in their name?

In [7]:
query = """
SELECT COUNT(*)
FROM Player
WHERE player_name LIKE '_smith%'
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,COUNT(*)
0,0


In [8]:
query = """
SELECT COUNT(*)
FROM Player
WHERE player_name LIKE '%smith%'
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,COUNT(*)
0,18


#### 5. What was the median tie score? Use the value determined in the previous question for the number of tie games. Hint: PostgreSQL does not have a median function. Instead, think about the steps required to calculate a median and use the WITH command to store stepwise results as a table and then operate on these results.

In [9]:
query = """
WITH ordered_match AS (
  SELECT
      home_team_goal,
      row_number() OVER (ORDER BY home_team_goal) AS row_id,
      (SELECT COUNT(*) FROM Match) AS ct
  FROM Match
)

SELECT AVG(home_team_goal) AS median
FROM ordered_match
WHERE row_id BETWEEN ct/2.0 AND ct/2.0 + 1
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,median
0,1.0


#### 6. What percentage of players prefer their left or right foot? Hint: Calculate either the right or left foot, whichever is easier based on how you setup the problem.

In [10]:
query = """
SELECT DISTINCT preferred_foot,
        COUNT(player_fifa_api_id) OVER(PARTITION by preferred_foot) AS "count"
FROM Player_Attributes

"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,preferred_foot,count
0,,836
1,left,44733
2,right,138409


In [11]:
query = """
WITH stats AS (
    SELECT 
        DISTINCT preferred_foot,
        COUNT(player_fifa_api_id) OVER(PARTITION by preferred_foot) AS foot_count,
        (SELECT COUNT(*) FROM Player_Attributes) AS total
    FROM Player_Attributes
    )
    
SELECT preferred_foot,
       foot_count*1.0/total AS probability
FROM stats

"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,preferred_foot,probability
0,,0.004544
1,left,0.243143
2,right,0.752313
