In [1]:
from pandasql import sqldf
import pandas as pd

In [2]:
def fsql(q):
    """Run the SQL query ``q`` through pandasql's ``sqldf`` and return the result.

    This module's global namespace is handed to ``sqldf`` as its environment,
    which is how the queries in this notebook reach the loaded DataFrames.
    """
    namespace = globals()
    return sqldf(q, namespace)

In [3]:
# Load the three CSV files into the DataFrames used by the cells below.
# The files are read in the same order as the names on the left-hand side.
departments, employees, regions = (
    pd.read_csv(csv_path)
    for csv_path in ('departments.csv', 'upd_empl_2.csv', 'regions.csv')
)

In [4]:
# Suppose we want to create a variable 'estatus_pago' based on the following definition:

# salary < 90000 --> 'bajo'
# salary >= 90000 AND salary < 150000 --> 'bien pagado'
# otherwise: 'muy bien pagado'

# --> CASE statement (the new column is named by the trailing `END AS <column_name>`).
# NOTE: the query emits the first two labels in upper case ('BAJO', 'BIEN PAGADO'),
# unlike the lower-case definition above.

q="""
SELECT first_name, salary,
CASE
    WHEN salary < 90000 THEN 'BAJO'
    WHEN salary >= 90000 AND salary < 150000 THEN 'BIEN PAGADO'
    ELSE 'muy bien pagado'
END AS estatus_pago                 -- Termina con END AS nombre_de_variable
FROM employees
ORDER BY salary DESC;
"""
# Keep the result in `df`; a later cell calls value_counts() on it.
df = fsql(q)

In [5]:
# Count how many rows of the query result carry each 'estatus_pago' label (pandas side).
df['estatus_pago'].value_counts()

estatus_pago
BAJO               504
BIEN PAGADO        375
muy bien pagado    121
Name: count, dtype: int64

#### Queremos lo mismo que hace `value_counts()`, pero en SQL

    

In [6]:
# SQL equivalent of value_counts(): the derived table `a` labels every employee
# with the CASE expression, and the outer query groups by that label and counts rows.
# The derived table only computes the label: sorting it by salary or carrying
# first_name/salary along would be discarded by the outer GROUP BY anyway.
q="""
SELECT a.estatus_pago, COUNT(*) AS conteo
FROM(
SELECT
CASE
    WHEN salary < 90000 THEN 'BAJO'
    WHEN salary >= 90000 AND salary < 150000 THEN 'BIEN PAGADO'
    ELSE 'muy bien pagado'
END AS estatus_pago
FROM employees) a
GROUP BY a.estatus_pago;
"""
fsql(q)

Unnamed: 0,estatus_pago,conteo
0,BAJO,504
1,BIEN PAGADO,375
2,muy bien pagado,121


Supongamos que quiero transponer los datos, es decir:

    bajo      bien pagado      muy bien pagado
    504         375                 121

In [7]:
# Pivot the pay-status counts into a single row with one column per salary band.
# Each SUM(CASE ... THEN 1 ELSE 0 END) adds 1 for every row that falls in its band.
# The top band uses >= 150000 so it agrees with the ELSE branch of the CASE query
# earlier in the notebook; with > 150000 a salary of exactly 150000 is counted nowhere.
q="""
SELECT SUM(CASE WHEN salary < 90000 THEN 1 ELSE 0 END) bajo,
SUM(CASE WHEN salary >= 90000 AND salary < 150000 THEN 1 ELSE 0 END) bien_pagado,
SUM(CASE WHEN salary >= 150000 THEN 1 ELSE 0 END) muy_bien_pagado
FROM employees;
"""
fsql(q)

Unnamed: 0,bajo,bien_pagado,muy_bien_pagado
0,504,375,121


In [8]:
# Pivoted pay-status counts plus a final TOTAL column.
# The derived table `a` computes the three band counts; the outer SELECT adds them up.
# The top band uses >= 150000 so the three bands cover every salary: with > 150000
# a salary of exactly 150000 is counted in no band and TOTAL falls short of the row count.
q="""
SELECT a.bajo, a.bien_pagado, a.muy_bien_pagado, a.bajo + a.bien_pagado + a.muy_bien_pagado AS TOTAL
FROM
(SELECT SUM(CASE WHEN salary < 90000 THEN 1 ELSE 0 END) bajo,
SUM(CASE WHEN salary >= 90000 AND salary < 150000 THEN 1 ELSE 0 END) bien_pagado,
SUM(CASE WHEN salary >= 150000 THEN 1 ELSE 0 END) muy_bien_pagado
FROM employees) a;
"""
fsql(q)

Unnamed: 0,bajo,bien_pagado,muy_bien_pagado,TOTAL
0,504,375,121,1000


In [9]:
# Preview the first three rows of `employees` to recall its columns.
employees.head(3)

Unnamed: 0,employee_id,first_name,last_name,email,hire_date,department,gender,salary,region_id
0,1,Berrie,Manueau,berr@aol.com.mx,2006-04-20,Sports,F,154864,4
1,2,Aeriell,McNee,aeri@gmail.co,2009-01-26,Tools,F,56752,3
2,3,Sydney,Symonds,sydn@terra.mx,2010-05-17,Clothing,F,95313,4


Supongamos que quiero una consulta que me dé lo siguiente:

    depto 1  n1
    depto 2  n2
    depto 3  n3
    .
    .
    .

pero sólo considerando los deptos "Sports", "Tools", "Clothing"

In [10]:
# Exercise: number of employees per department, restricted to the three
# departments listed in the IN clause.
q="""
SELECT department, COUNT(*) num_empleados
FROM employees
WHERE department IN ('Sports', 'Tools', 'Clothing')
GROUP BY department;
"""
fsql(q)

Unnamed: 0,department,num_empleados
0,Clothing,49
1,Sports,34
2,Tools,39


## En caso de querer un Total:
    depto 1  n1
    depto 2  n2
    depto 3  n3
    .
    Total    N

In [11]:
# 1. Get the counts per department (previous exercise).
# 2. Give that result a name using the WITH clause.
# 3. Union the result of step 2 with an additional query that holds the Total (UNION ALL).

In [12]:
# `tabla_1` (defined in the WITH clause) holds the per-department counts.
# The first SELECT returns those rows; the second builds a single 'Total' row
# by summing them, and UNION ALL stacks it below the department rows.
q="""
WITH tabla_1 AS(
    SELECT department, COUNT(*) num_empleados
    FROM employees
    WHERE department IN ('Sports', 'Tools', 'Clothing')
    GROUP BY department
)

SELECT * FROM tabla_1
UNION ALL          --   equivale a apilar las tablas

SELECT 'Total' department, SUM(num_empleados) num_empleados
FROM tabla_1;
"""
fsql(q)

Unnamed: 0,department,num_empleados
0,Clothing,49
1,Sports,34
2,Tools,39
3,Total,122


In [13]:
# Preview the first three rows of `employees` again before the next query.
employees.head(3)

Unnamed: 0,employee_id,first_name,last_name,email,hire_date,department,gender,salary,region_id
0,1,Berrie,Manueau,berr@aol.com.mx,2006-04-20,Sports,F,154864,4
1,2,Aeriell,McNee,aeri@gmail.co,2009-01-26,Tools,F,56752,3
2,3,Sydney,Symonds,sydn@terra.mx,2010-05-17,Clothing,F,95313,4


Si quiero transponer estos resultados de la siguiente forma:

    Clothing     Sports    Tools
      49           34        39

In [14]:
# Transposed per-department counts: one SUM(CASE ...) column per department,
# each adding 1 for every employee row whose department matches.
q="""
SELECT SUM(CASE WHEN department = 'Tools' THEN 1 ELSE 0 END) As num_empl_tools,
SUM(CASE WHEN department = 'Sports' THEN 1 ELSE 0 END) As num_empl_sports,
SUM(CASE WHEN department = 'Clothing' THEN 1 ELSE 0 END) As num_empl_clothing
FROM employees;
"""
fsql(q)

Unnamed: 0,num_empl_tools,num_empl_sports,num_empl_clothing
0,39,34,49


In [15]:
# Equivalently: the same column aliases, written without the AS keyword.
q="""
SELECT SUM(CASE WHEN department = 'Tools' THEN 1 ELSE 0 END) num_empl_tools,
SUM(CASE WHEN department = 'Sports' THEN 1 ELSE 0 END)  num_empl_sports,
SUM(CASE WHEN department = 'Clothing' THEN 1 ELSE 0 END) num_empl_clothing
FROM employees;
"""
fsql(q)

Unnamed: 0,num_empl_tools,num_empl_sports,num_empl_clothing
0,39,34,49


In [16]:
# Preview the first two rows of `employees`.
employees.head(2)

Unnamed: 0,employee_id,first_name,last_name,email,hire_date,department,gender,salary,region_id
0,1,Berrie,Manueau,berr@aol.com.mx,2006-04-20,Sports,F,154864,4
1,2,Aeriell,McNee,aeri@gmail.co,2009-01-26,Tools,F,56752,3


In [17]:
# Preview the first two rows of `departments`.
departments.head(2)

Unnamed: 0,department,division
0,Clothing,Home
1,Grocery,Home


In [21]:
# Show up to the first eight rows of `regions`.
regions.head(8)

Unnamed: 0,region_id,region,country
0,1,Southwest,United States
1,2,Northeast,United States
2,3,Northwest,United States
3,4,Central,Asia
4,5,East Asia,Asia
5,6,Quebec,Canada
6,7,Nova Scotia,Canada


## Ejercicios

In [19]:
# 1. Average employee salary per department, rounded to 2 decimals.
q="""
SELECT department, ROUND(AVG(salary),2) AS salario_medio
FROM employees
GROUP BY department;
"""
fsql(q)

Unnamed: 0,department,salario_medio
0,Automotive,111046.16
1,Beauty,96487.44
2,Books,96497.41
3,Camping,97302.03
4,Children Clothing,87511.04
5,Clothing,83933.45
6,Computers,96064.57
7,Cosmetics,108381.71
8,Decor,92766.21
9,Device Repair,87480.73


In [20]:
# 2. The employees table sorted by hire_date in descending order, showing only the first 20 records:
q="""
SELECT * FROM employees
ORDER BY hire_date DESC
LIMIT 20;
"""
fsql(q)

Unnamed: 0,employee_id,first_name,last_name,email,hire_date,department,gender,salary,region_id
0,791,Barby,Daniell,barb@yahoo.com.mx,2016-12-26,Clothing,F,164588,6
1,460,Cherianne,Oxnam,cher@gmail.co,2016-12-18,Automotive,F,150821,6
2,272,Roxie,Revance,roxi@yahoo.com.mx,2016-12-16,Phones & Tablets,F,42224,2
3,380,Edik,Wardhough,edik@terra.mx,2016-12-11,Tools,M,88378,2
4,491,Eloisa,Eeles,eloi@yahoo.com,2016-12-02,Clothing,F,39200,4
5,753,Adelaide,Gubbin,adel@excite.com,2016-12-01,Toys,F,27578,7
6,876,Marlowe,Crock,marl@excite.com,2016-12-01,Jewelry,M,66384,1
7,131,Hewet,Calafate,hewe@yahoo.com,2016-11-24,Phones & Tablets,M,113979,2
8,371,Malcolm,D'Costa,malc@terra.mx,2016-11-24,Garden,M,138493,1
9,655,Darin,Pooke,dari@yahoo.com,2016-11-18,Movies,M,142023,2


In [22]:
# 3. Return the employees of the 'Central' region.
# The subquery resolves the region name to its region_id value(s) in `regions`.
# 'Central' is written with single quotes: in SQL, double quotes denote identifiers,
# and SQLite accepts "Central" as a string only as a legacy fallback when no
# column of that name exists.
q="""
SELECT * FROM employees
WHERE region_id IN (SELECT region_id FROM regions WHERE region = 'Central');
"""
fsql(q)

Unnamed: 0,employee_id,first_name,last_name,email,hire_date,department,gender,salary,region_id
0,1,Berrie,Manueau,berr@aol.com.mx,2006-04-20,Sports,F,154864,4
1,3,Sydney,Symonds,sydn@terra.mx,2010-05-17,Clothing,F,95313,4
2,23,Vanda,Marwick,vand@yahoo.com.mx,2014-02-06,Automotive,F,103570,4
3,29,Cortie,Ambrosini,cort@yahoo.com,2014-05-03,Games,M,133847,4
4,53,Alphonse,Shedd,alph@gmail.com,2014-08-17,Cosmetics,M,33272,4
...,...,...,...,...,...,...,...,...,...
132,970,Barrett,Barling,barr@gmail.co,2005-10-26,Grocery,M,128512,4
133,975,Isis,McKinn,isis@excite.com,2010-06-18,Movies,F,124444,4
134,977,Vickie,Pryn,vick@gmail.co,2012-02-24,Movies,F,33269,4
135,980,Lyndy,Tooker,lynd@gmail.com,2007-01-26,Computers,F,112796,4


In [23]:
# Display the full `regions` DataFrame.
regions

Unnamed: 0,region_id,region,country
0,1,Southwest,United States
1,2,Northeast,United States
2,3,Northwest,United States
3,4,Central,Asia
4,5,East Asia,Asia
5,6,Quebec,Canada
6,7,Nova Scotia,Canada


In [24]:
# 4. Average salary of men and women in the 'Central' region.
# The subquery resolves the region name to its region_id value(s) in `regions`;
# the outer query averages salary per gender over the matching employees.
# 'Central' is written with single quotes: in SQL, double quotes denote identifiers,
# and SQLite accepts "Central" as a string only as a legacy fallback when no
# column of that name exists.
q="""
SELECT gender, AVG(salary) FROM employees
WHERE region_id IN (SELECT region_id FROM regions WHERE region = 'Central')
GROUP BY gender;
"""
fsql(q)

Unnamed: 0,gender,AVG(salary)
0,F,90030.355263
1,M,96746.245902


In [25]:
# 5. Count of men and women (COUNT(*) counts the rows in each gender group):
q="""
SELECT gender, COUNT(*) AS conteo
FROM employees
GROUP BY gender;       -- COUNT(*) porque estamos contando sobre los renglones
"""
fsql(q)

Unnamed: 0,gender,conteo
0,F,501
1,M,499


In [27]:
# If we also want the result sorted from the lowest to the highest count:
q="""
SELECT gender, COUNT(*) AS conteo
FROM employees
GROUP BY gender
ORDER BY conteo;
"""
fsql(q)

Unnamed: 0,gender,conteo
0,M,499
1,F,501


In [28]:
# Equivalent to the previous query: GROUP BY 1 and ORDER BY 2 refer to the
# first and second columns of the SELECT list (gender and conteo).
q="""
SELECT gender, COUNT(*) AS conteo
FROM employees
GROUP BY 1
ORDER BY 2;
"""
fsql(q)

Unnamed: 0,gender,conteo
0,M,499
1,F,501


In [None]:
# 6. Now we want the count, but laid out as:
 #           F    M
 #           n1   n2

In [29]:
# One row with two columns: the number of 'F' rows as `mujeres` and the
# number of 'M' rows as `hombres`.
q="""
SELECT SUM(CASE WHEN gender = 'F' THEN 1 ELSE 0 END) AS mujeres,
SUM(CASE WHEN gender = 'M' THEN 1 ELSE 0 END) AS hombres
FROM employees;
"""
fsql(q)

Unnamed: 0,mujeres,hombres
0,501,499


### BETWEEN
Sintaxis: `BETWEEN x AND y` (incluye ambos extremos, `x` e `y`)

In [31]:
# 7. Select the employees hired between Sep 25 and Nov 9, 2016, most recent first.
# NOTE(review): hire_date is compared against 'YYYY-MM-DD' strings; presumably the
# column holds ISO-formatted date text, so string order matches date order. Confirm.

q="""
SELECT * FROM employees
WHERE hire_date BETWEEN '2016-09-25' AND '2016-11-09'
ORDER BY hire_date DESC;
"""
fsql(q)

Unnamed: 0,employee_id,first_name,last_name,email,hire_date,department,gender,salary,region_id
0,973,Dona,Murley,dona@hotmail.com,2016-11-09,Decor,F,98159,1
1,925,Alvin,McMullen,alvi@terra.mx,2016-11-01,Beauty,M,69999,3
2,879,Reg,Newland,reg@hotmail.com,2016-10-28,Beauty,M,36053,3
3,288,Buckie,Jodrellec,buck@aol.com.mx,2016-10-10,Vitamins,M,30289,4
4,672,Jeane,Partener,jean@aol.com,2016-10-09,Pharmacy,F,136466,5
5,894,Lebbie,Slaymaker,lebb@yahoo.com,2016-10-07,Grocery,F,57389,4
6,771,Berti,Randerson,bert@gmail.co,2016-10-06,Children Clothing,M,150740,3
7,696,Roma,Pfeifer,roma@aol.com,2016-10-02,Tools,M,120070,1
8,712,Ernesto,Locarno,erne@gmail.co,2016-09-26,Music,M,57633,2
9,170,Linnell,Haps,linn@excite.com,2016-09-25,Computers,F,98885,7


In [30]:
# Preview the first three rows of `employees`.
employees.head(3)

Unnamed: 0,employee_id,first_name,last_name,email,hire_date,department,gender,salary,region_id
0,1,Berrie,Manueau,berr@aol.com.mx,2006-04-20,Sports,F,154864,4
1,2,Aeriell,McNee,aeri@gmail.co,2009-01-26,Tools,F,56752,3
2,3,Sydney,Symonds,sydn@terra.mx,2010-05-17,Clothing,F,95313,4
