# Практика Pandas (merges and ranks)

# European Soccer Database
- https://www.kaggle.com/datasets/hugomathien/soccer
- https://github.com/hugomathien/football-data-collection

# Check data

## Импорт библиотек и подключение к БД

### SQLite

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)

In [2]:
from sqlalchemy import create_engine

# create engine to connect to SQLite database
engine = create_engine('sqlite:///database.sqlite')

## Список всех таблиц в БД

### SQLite

In [3]:
pd.read_sql("""select * from sqlite_master where type = 'table'""", engine)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,sqlite_sequence,sqlite_sequence,4,"CREATE TABLE sqlite_sequence(name,seq)"
1,table,Player_Attributes,Player_Attributes,11,"CREATE TABLE ""Player_Attributes"" (\n\t`id`\tIN..."
2,table,Player,Player,14,CREATE TABLE `Player` (\n\t`id`\tINTEGER PRIMA...
3,table,Match,Match,18,CREATE TABLE `Match` (\n\t`id`\tINTEGER PRIMAR...
4,table,League,League,24,CREATE TABLE `League` (\n\t`id`\tINTEGER PRIMA...
5,table,Country,Country,26,CREATE TABLE `Country` (\n\t`id`\tINTEGER PRIM...
6,table,Team,Team,29,"CREATE TABLE ""Team"" (\n\t`id`\tINTEGER PRIMARY..."
7,table,Team_Attributes,Team_Attributes,2,CREATE TABLE `Team_Attributes` (\n\t`id`\tINTE...


### Импорт всех таблиц в Pandas Dataframe

In [90]:
league = pd.read_sql("""select * from league""", engine)
country = pd.read_sql("""select * from country""", engine)
match = pd.read_sql("""select * from match""", engine)
team = pd.read_sql("""select * from team""", engine)
# team_attributes = pd.read_sql("""select * from team_attributes""", engine)
# player = pd.read_sql("""select * from player""", engine)
# player_attributes = pd.read_sql("""select * from player_attributes""", engine)

In [91]:
# league = league.rename({'id':'league_id','name':'league_name'}, axis=1)
league = league.rename(columns={'id':'league_id','name':'league_name'})
league = league.drop(columns=['country_id'])
league

Unnamed: 0,league_id,league_name
0,1,Belgium Jupiler League
1,1729,England Premier League
2,4769,France Ligue 1
3,7809,Germany 1. Bundesliga
4,10257,Italy Serie A
5,13274,Netherlands Eredivisie
6,15722,Poland Ekstraklasa
7,17642,Portugal Liga ZON Sagres
8,19694,Scotland Premier League
9,21518,Spain LIGA BBVA


In [92]:
country = country.rename(columns={'id':'country_id','name':'country_name'})
country

Unnamed: 0,country_id,country_name
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy
5,13274,Netherlands
6,15722,Poland
7,17642,Portugal
8,19694,Scotland
9,21518,Spain


In [93]:
match = match[['id','country_id','league_id','season','stage','date','match_api_id',
               'home_team_api_id','away_team_api_id','home_team_goal','away_team_goal']]
match = match.rename(columns={'id':'match_id'})
# match = match.drop(columns=['country_id'])
match.head()

Unnamed: 0,match_id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,0
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,3
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,0
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,3


In [75]:
team = team.rename(columns={'id':'team_id'})
team.head()

Unnamed: 0,team_id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB


# Batch 1. Joins

## Примеры

### Вывести список всех матчей (date, match_api_id, id) вместе с названием страны и названием лиги, в которых они проходили.
Ключи для джойна: match.league_id = league.id and match.country_id = country.id

In [None]:
df_1.merge(df_2, on=common_cols, how='left')
df_1.merge(df_2, left_on=left_cols, right_on=right_cols, how='left')
df_1.merge(df_2, left_on=left_cols, right_on=right_cols) # how='inner'
df_1.merge(df_2) #on=common_cols, how='inner'

In [99]:
match.head(1)

Unnamed: 0,match_id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1


In [100]:
league.rename(columns={'league_name':'country_id'})

Unnamed: 0,league_id,league_name
0,1,Belgium Jupiler League
1,1729,England Premier League
2,4769,France Ligue 1
3,7809,Germany 1. Bundesliga
4,10257,Italy Serie A


In [102]:
match.columns

Index(['match_id', 'country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal'],
      dtype='object')

In [103]:
list(match)

['match_id',
 'country_id',
 'league_id',
 'season',
 'stage',
 'date',
 'match_api_id',
 'home_team_api_id',
 'away_team_api_id',
 'home_team_goal',
 'away_team_goal']

In [106]:
set(match) & set(league)

{'league_id'}

In [107]:
set(match) & set(league.rename(columns={'league_name':'country_id'}))

{'country_id', 'league_id'}

In [118]:
# match.merge(league.astype({'league_id':'str'})).head(1)

In [109]:
match.merge(league.rename(columns={'league_name':'country_id'}))

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [None]:
match.merge(league.rename(columns={'league_name':'country_id'}))

In [113]:
match.head(1).dtypes

match_id             int64
country_id           int64
league_id            int64
season              object
stage                int64
date                object
match_api_id         int64
home_team_api_id     int64
away_team_api_id     int64
home_team_goal       int64
away_team_goal       int64
dtype: object

In [114]:
league.head(1).dtypes

league_id       int64
league_name    object
dtype: object

In [None]:
match.merge(league).head()

In [108]:
match.merge(league.rename(columns={'league_id':'l_id'}), on=['country_id', 'league_id'])

KeyError: 'country_id'

In [76]:
match.head(1)

Unnamed: 0,match_id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1


In [77]:
country.head(1)

Unnamed: 0,id,name
0,1,Belgium


In [78]:
league.head(1)

Unnamed: 0,id,name
0,1,Belgium Jupiler League


In [86]:
match \
    .merge(country, left_on=['country_id'], right_on=['id']) \
    .head(1) \
    .drop(columns=['id']) \
    .merge(league, left_on=['league_id'], right_on=['id']) \
    .drop(columns=['id']) \
    .rename(columns={'name_x':'country_name', 'name_y':'league_name'})

Unnamed: 0,match_id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,country_name,league_name
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1,Belgium,Belgium Jupiler League


In [None]:
pd.read_sql("""
SELECT  date,
        match_api_id,
        m.id,
        l.name AS league_name,
        c.name AS country_name
FROM Match m
INNER JOIN League l ON m.league_id = l.id
INNER JOIN Country c ON m.country_id = c.id
""", engine)

In [133]:
df = match \
    .merge(league, on='league_id') \
    .merge(country, on='country_id') \
    [['date','match_api_id','match_id','league_name','country_name']]
df.head(3)

# merge(on=[''])

Unnamed: 0,date,match_api_id,match_id,league_name,country_name
0,2008-08-17 00:00:00,492473,1,Belgium Jupiler League,Belgium
1,2008-08-16 00:00:00,492474,2,Belgium Jupiler League,Belgium
2,2008-08-16 00:00:00,492475,3,Belgium Jupiler League,Belgium


In [123]:
match.set_index('league_id')
league.set_index('league_id')
match.set_index('league_id').merge(league.set_index('league_id'), left_index=True, right_index=True).reset_index()

Unnamed: 0,league_id,match_id,country_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,league_name
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1,Belgium Jupiler League
1,1,2,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,0,Belgium Jupiler League
2,1,3,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,3,Belgium Jupiler League
3,1,4,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,0,Belgium Jupiler League
4,1,5,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,3,Belgium Jupiler League
...,...,...,...,...,...,...,...,...,...,...,...,...
25974,24558,25975,24558,2015/2016,9,2015-09-22 00:00:00,1992091,10190,10191,1,0,Switzerland Super League
25975,24558,25976,24558,2015/2016,9,2015-09-23 00:00:00,1992092,9824,10199,1,2,Switzerland Super League
25976,24558,25977,24558,2015/2016,9,2015-09-23 00:00:00,1992093,9956,10179,2,0,Switzerland Super League
25977,24558,25978,24558,2015/2016,9,2015-09-22 00:00:00,1992094,7896,10243,0,0,Switzerland Super League


In [128]:
set(match.eval('league_name = "Lionel Messi"')) & set(league)

{'league_id', 'league_name'}

In [129]:
match.eval('league_name = "Lionel Messi"').merge(league)

Unnamed: 0,match_id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,league_name


In [131]:
match.merge(league)

Unnamed: 0,match_id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,league_name
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1,Belgium Jupiler League
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,0,Belgium Jupiler League
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,3,Belgium Jupiler League
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,0,Belgium Jupiler League
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,3,Belgium Jupiler League
...,...,...,...,...,...,...,...,...,...,...,...,...
25974,25975,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992091,10190,10191,1,0,Switzerland Super League
25975,25976,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992092,9824,10199,1,2,Switzerland Super League
25976,25977,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992093,9956,10179,2,0,Switzerland Super League
25977,25978,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992094,7896,10243,0,0,Switzerland Super League


In [119]:
pd.merge(match, league)

Unnamed: 0,match_id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,league_name
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1,Belgium Jupiler League
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,0,Belgium Jupiler League
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,3,Belgium Jupiler League
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,0,Belgium Jupiler League
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,3,Belgium Jupiler League
...,...,...,...,...,...,...,...,...,...,...,...,...
25974,25975,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992091,10190,10191,1,0,Switzerland Super League
25975,25976,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992092,9824,10199,1,2,Switzerland Super League
25976,25977,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992093,9956,10179,2,0,Switzerland Super League
25977,25978,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992094,7896,10243,0,0,Switzerland Super League


### Объедините таблицы "Match" и "League" по полям ("league_id", "id") и найдите среднее количество забитых голов на матч в каждом чемпионате.

In [134]:
pd.read_sql("""
SELECT  l.name,
        AVG(m.home_team_goal + m.away_team_goal) as avg_goals
FROM Match m
JOIN League l ON m.league_id = l.id
GROUP BY l.name
""", engine)

Unnamed: 0,name,avg_goals
0,Belgium Jupiler League,2.801505
1,England Premier League,2.710526
2,France Ligue 1,2.443092
3,Germany 1. Bundesliga,2.901552
4,Italy Serie A,2.616838
5,Netherlands Eredivisie,3.080882
6,Poland Ekstraklasa,2.425
7,Portugal Liga ZON Sagres,2.5346
8,Scotland Premier League,2.633772
9,Spain LIGA BBVA,2.767105


In [149]:
df = match.merge(league, on='league_id')

df = df \
    .groupby('league_name') \
    [['home_team_goal', 'away_team_goal']] \
    .mean() \
#     .sum(axis=1) \
#     .reset_index() \
#     .rename(columns={0:'avg_goals'})
# df.columns = ['league_name', 'avg_goals']
df

Unnamed: 0_level_0,home_team_goal,away_team_goal
league_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Belgium Jupiler League,1.609375,1.19213
England Premier League,1.550987,1.159539
France Ligue 1,1.402961,1.040132
Germany 1. Bundesliga,1.626634,1.274918
Italy Serie A,1.500829,1.116009
Netherlands Eredivisie,1.77982,1.301062
Poland Ekstraklasa,1.394792,1.030208
Portugal Liga ZON Sagres,1.408382,1.126218
Scotland Premier League,1.429276,1.204496
Spain LIGA BBVA,1.63125,1.135855


In [169]:
df = match.merge(league, on='league_id')
# df = df \
#     .eval('home_team_goal + away_team_goal')
    
# df['goals'] = df \
#     .eval('home_team_goal + away_team_goal')
df = df \
    .eval('goals = home_team_goal + away_team_goal') \
    .groupby('league_name', as_index=False) \
    .agg(avg_goals=('goals','mean'),
         sum_goals=('goals','sum'))
#     ['goals'].mean()
#     .agg({'goals':['mean','sum']})

df

Unnamed: 0,league_name,avg_goals,sum_goals
0,Belgium Jupiler League,2.801505,4841
1,England Premier League,2.710526,8240
2,France Ligue 1,2.443092,7427
3,Germany 1. Bundesliga,2.901552,7103
4,Italy Serie A,2.616838,7895
5,Netherlands Eredivisie,3.080882,7542
6,Poland Ekstraklasa,2.425,4656
7,Portugal Liga ZON Sagres,2.5346,5201
8,Scotland Premier League,2.633772,4804
9,Spain LIGA BBVA,2.767105,8412


## Задачи для решения на семинаре

### Найдите 10 команд, у которых самое большое количество побед на домашнем поле.
Критерий победы на домашнем поле: хозяева забили больше голов, чем гости.

In [None]:
pd.read_sql("""
SELECT 
    t.team_long_name, 
    COUNT(distinct m.id) as home_wins_count
FROM Match m 
INNER JOIN Team t ON t.team_api_id = m.home_team_api_id
WHERE m.home_team_goal > m.away_team_goal
GROUP BY t.team_long_name
ORDER BY home_wins_count DESC
LIMIT 10
""", engine)

### Напишите запрос, который позволяет вывести общее количество голов, забитых каждой командой в каждом сезоне. Выведите результаты в порядке убывания количества забитых голов (вывести топ 10 записей)
Нужно учитывать голы, забитые как дома, так и в гостях

In [None]:
pd.read_sql("""
with u as (
    SELECT 
        m.home_team_api_id as team_api_id, 
        m.season, 
        m.home_team_goal AS goals_scored
    FROM Match m 
    UNION ALL
    SELECT 
        m.away_team_api_id as team_api_id, 
        m.season, 
        m.away_team_goal AS goals_scored
    FROM Match m
)
SELECT 
    t.team_long_name, 
    u.season, 
    SUM(u.goals_scored) AS goals_scored
FROM u
INNER JOIN Team t ON u.team_api_id = t.team_api_id
GROUP BY t.team_long_name, u.season
ORDER BY goals_scored DESC
LIMIT 10
""", engine)

In [None]:
df_home = match[['home_team_api_id','season','home_team_goal']] \
    .rename(columns={'home_team_api_id':'team_api_id','home_team_goal':'goals_scored'})
df_away = match[['away_team_api_id','season','away_team_goal']] \
    .rename(columns={'away_team_api_id':'team_api_id','away_team_goal':'goals_scored'})

df = pd.concat([df_home, df_away])

# Enter your code here
# ...

## Задачи в качестве домашней работы

### Вывести id, название страны, название лиги, сезон, этап, дату, название домашней команды, название гостевой команды, количество голов, забитых домашней и гостевой командами, для всех матчей в Испании, отсортированных по дате

### Найдите количество побед, поражений и ничьих для каждой команды в каждом сезоне. Для этого необходимо объединить информацию из таблицы Match и Team. Отобразите результаты в порядке убывания количества побед.

### Для каждой команды определите максимальное количество забитых голов на домашнем поле и на выезде за каждый сезон. Отобразите результаты в порядке убывания максимального количества забитых голов на выезде.

### Найдите среднее количество забитых голов за игру на домашнем поле и на выезде для каждой команды в каждом сезоне. Отобразите результаты в порядке убывания среднего количества забитых голов на выезде.

 # Виды рангов и сравнение SQL vs Pandas

## SQL. ROW_NUMBER vs RANK vs DENSE_RANK

- https://datageeks.medium.com/row-number-rank-and-dense-rank-functions-in-sql-dbc41acc61d5

## Pandas. RANKS (average, min, max, first, dense)

- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html

In [177]:
df = pd.DataFrame({'a': [1,2,2,3,4]})
df

Unnamed: 0,a
0,1
1,2
2,2
3,3
4,4


In [180]:
methods = ['average', 'min', 'max', 'first', 'dense']
for method in methods:
    df[f'rank_{method}'] = df['a'].rank(method=method)
df

Unnamed: 0,a,rank_average,rank_min,rank_max,rank_first,rank_dense
0,1,1.0,1.0,1.0,1.0,1.0
1,2,2.5,2.0,3.0,2.0,2.0
2,2,2.5,2.0,3.0,3.0,2.0
3,3,4.0,4.0,4.0,4.0,3.0
4,4,5.0,5.0,5.0,5.0,4.0


In [178]:
# df = pd.DataFrame({'a': [2,1,4,3,2]})

## SQL vs Pandas

In [182]:
from pandasql import sqldf

def execute_sql(query):
    return sqldf(query, globals())

sql_query = """
SELECT  df.*,
        RANK() OVER(ORDER BY a) rank,
        DENSE_RANK() OVER(ORDER BY a) dense_rank,
        ROW_NUMBER() OVER(ORDER BY a) row_number
FROM df
"""
execute_sql(sql_query)

Unnamed: 0,a,rank_average,rank_min,rank_max,rank_first,rank_dense,rank,dense_rank,row_number
0,1,1.0,1.0,1.0,1.0,1.0,1,1,1
1,2,2.5,2.0,3.0,2.0,2.0,2,2,2
2,2,2.5,2.0,3.0,3.0,2.0,2,2,3
3,3,4.0,4.0,4.0,4.0,3.0,4,3,4
4,4,5.0,5.0,5.0,5.0,4.0,5,4,5


Соответствие:
- row_number -> rank_first
- rank -> min
- dense_rank -> rank_dense

# Batch 2. Оконные функции

## Примеры

### Найдите топ-3 лиги с самым большим количеством забитых голов в каждом сезоне. Выведите название лиги, сезон и количество забитых голов. Определите ранг лиги в топ-3 по количеству забитых голов.

In [None]:
pd.read_sql("""
with t as (
    SELECT season, league_name, goals_scored, 
           RANK() OVER (PARTITION BY season ORDER BY goals_scored DESC) AS rank
    FROM (
      SELECT m.season, l.name AS league_name, 
             SUM(m.home_team_goal + m.away_team_goal) AS goals_scored
      FROM Match AS m
      JOIN League AS l ON m.league_id = l.id
      GROUP BY m.season, l.name
    ) AS goals
)
select *
from t
WHERE rank <= 3
ORDER BY season, rank
""", engine)

In [None]:
df = pd.merge(match, league, left_on="league_id", right_on="league_id")

df = df.groupby(["season", "league_name"])[["home_team_goal", "away_team_goal"]].sum().reset_index()
df["goals_scored"] = df["home_team_goal"] + df["away_team_goal"]
df["rank"] = df.groupby("season")["goals_scored"].rank(method="min", ascending=False).astype(int)
df = df[df["rank"] <= 3]
df = df.sort_values(["season", "rank"])
df = df[["season", "league_name", "goals_scored", "rank"]]

df

## Задачи для решения на семинаре

### Для каждого сезона и каждой лиги необходимо вывести топ-3 команд по количеству забитых голов за сезон. Выводить название команды, количество забитых голов, сезон и название лиги.

In [None]:
df_home = match.merge(league, left_on='league_id', right_on='league_id') \
          .merge(team, left_on='home_team_api_id', right_on='team_api_id', how='left')

df_away = match.merge(league, left_on='league_id', right_on='league_id') \
          .merge(team, left_on='away_team_api_id', right_on='team_api_id', how='left')

df = pd.concat([df_home, df_away])

# Enter your code here
# ...

## Задачи в качестве домашней работы

### Найдите топ-3 команды с самым большим количеством выигранных матчей в каждой лиге за каждый сезон. Выведите название команды, лигу, сезон и количество выигранных матчей. Определите ранг команды в топ-3 по количеству выигранных матчей.

### Найдите топ-3 (среди всех стран) команды с самым большим количеством пропущенных голов в каждом сезоне. Выведите название команды, сезон и количество пропущенных голов. Определите ранг команды в топ-3 по количеству пропущенных голов.

### Найдите топ-3 команды с наибольшим количеством набранных очков в каждой лиге за каждый сезон. Выведите название команды, лигу, сезон и количество набранных очков.

# Полезные функции для обработки данных

In [183]:
country

Unnamed: 0,country_id,country_name
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy
5,13274,Netherlands
6,15722,Poland
7,17642,Portugal
8,19694,Scotland
9,21518,Spain


In [185]:
# 1-я буква из наименования каждой страны
country['first_letter'] = country['country_name'].apply(lambda x: x[0])
country

Unnamed: 0,country_id,country_name,first_letter
0,1,Belgium,B
1,1729,England,E
2,4769,France,F
3,7809,Germany,G
4,10257,Italy,I
5,13274,Netherlands,N
6,15722,Poland,P
7,17642,Portugal,P
8,19694,Scotland,S
9,21518,Spain,S


In [187]:
def get_first_letter(x):
    return x[0]
get_first_letter('Italy')

'I'

In [193]:
# del country['first_letter']
country['first_letter'] = country['country_name'].apply(get_first_letter)
# country['first_letter'] = country['country_name'].apply(lambda x: get_first_letter(x))
# country['first_letter'] = country['country_name'].apply(lambda x: x[0])
country

Unnamed: 0,country_id,country_name,first_letter
0,1,Belgium,B
1,1729,England,E
2,4769,France,F
3,7809,Germany,G
4,10257,Italy,I
5,13274,Netherlands,N
6,15722,Poland,P
7,17642,Portugal,P
8,19694,Scotland,S
9,21518,Spain,S


In [194]:
country

Unnamed: 0,country_id,country_name,first_letter
0,1,Belgium,B
1,1729,England,E
2,4769,France,F
3,7809,Germany,G
4,10257,Italy,I
5,13274,Netherlands,N
6,15722,Poland,P
7,17642,Portugal,P
8,19694,Scotland,S
9,21518,Spain,S


In [197]:
def is_france(x):
    if x == 'France':
        return 1
    else:
        return 0
# is_france('France')
country['is_france'] = country['country_name'].apply(lambda x: is_france(x))
# country['is_france'] = country['country_name'].apply(is_france)
# country['is_france'] = country['country_name'].apply(lambda x: 1 if x == 'France' else 0)
country

Unnamed: 0,country_id,country_name,first_letter,is_france
0,1,Belgium,B,0
1,1729,England,E,0
2,4769,France,F,1
3,7809,Germany,G,0
4,10257,Italy,I,0
5,13274,Netherlands,N,0
6,15722,Poland,P,0
7,17642,Portugal,P,0
8,19694,Scotland,S,0
9,21518,Spain,S,0


In [198]:
pd.read_sql("""
select  *,
        case when name == 'France' then 1
             else 0
        end as is_france
from country
""", engine)

Unnamed: 0,id,name,is_france
0,1,Belgium,0
1,1729,England,0
2,4769,France,1
3,7809,Germany,0
4,10257,Italy,0
5,13274,Netherlands,0
6,15722,Poland,0
7,17642,Portugal,0
8,19694,Scotland,0
9,21518,Spain,0


In [None]:
# condition ? if_true : if_false

In [199]:
pd.read_sql("""
select  *,
        case when name == 'France' then 1
             when name == 'England' then 2
        else 0 end as is_france
from country
""", engine)

Unnamed: 0,id,name,is_france
0,1,Belgium,0
1,1729,England,2
2,4769,France,1
3,7809,Germany,0
4,10257,Italy,0
5,13274,Netherlands,0
6,15722,Poland,0
7,17642,Portugal,0
8,19694,Scotland,0
9,21518,Spain,0


In [200]:
def is_france_or_england(x):
    if x == 'France':
        return 1
    elif x == 'England':
#     if x == 'England':
        return 2
    else:
        return 0

country['is_france_or_england'] = country['country_name'].apply(is_france_or_england)
country

Unnamed: 0,country_id,country_name,first_letter,is_france,is_france_or_england
0,1,Belgium,B,0,0
1,1729,England,E,0,2
2,4769,France,F,1,1
3,7809,Germany,G,0,0
4,10257,Italy,I,0,0
5,13274,Netherlands,N,0,0
6,15722,Poland,P,0,0
7,17642,Portugal,P,0,0
8,19694,Scotland,S,0,0
9,21518,Spain,S,0,0


In [203]:
country['country_name'].apply(lambda x: 1 if x=='France' 2 elif x=='England' else 0)
country['country_name'].apply(is_france_or_england)

SyntaxError: invalid syntax (<ipython-input-203-ebffe7ff4150>, line 1)