In [6]:
import pandas as pd
import numpy as np
import sqlite3 as sql
pd.options.display.max_columns = 150

In [7]:
conn = sql.connect("jobs.db")

In [8]:
# Creating a function to read query
def read_query(query):
    return pd.read_sql_query(query, conn)

In [9]:
q = "SELECT * FROM recent_grads LIMIT 5"

read_query(q)

Unnamed: 0,index,Rank,Major_code,Major,Major_category,Total,Sample_size,Men,Women,ShareWomen,Employed,Full_time,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,0,1,2419,PETROLEUM ENGINEERING,Engineering,2339,36,2057,282,0.120564,1976,1849,270,1207,37,0.018381,110000,95000,125000,1534,364,193
1,1,2,2416,MINING AND MINERAL ENGINEERING,Engineering,756,7,679,77,0.101852,640,556,170,388,85,0.117241,75000,55000,90000,350,257,50
2,2,3,2415,METALLURGICAL ENGINEERING,Engineering,856,3,725,131,0.153037,648,558,133,340,16,0.024096,73000,50000,105000,456,176,0
3,3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,1258,16,1123,135,0.107313,758,1069,150,692,40,0.050125,70000,43000,80000,529,102,0
4,4,5,2405,CHEMICAL ENGINEERING,Engineering,32260,289,21239,11021,0.341631,25694,23170,5180,16697,1672,0.061098,65000,50000,75000,18314,4440,972


# Group By

In [5]:
q = """SELECT Major_category, AVG(ShareWomen) 
    FROM recent_grads
    GROUP BY Major_category
""";
read_query(q)

Unnamed: 0,Major_category,AVG(ShareWomen)
0,Agriculture & Natural Resources,0.617938
1,Arts,0.561851
2,Biology & Life Science,0.584518
3,Business,0.405063
4,Communications & Journalism,0.643835
5,Computers & Mathematics,0.512752
6,Education,0.674986
7,Engineering,0.257158
8,Health,0.616857
9,Humanities & Liberal Arts,0.676193


In [15]:
# For each major category, find the percentage of graduates who are employed.
q = """SELECT Major_category, AVG(Employed)/SUM(Total) Share_Employed 
from recent_grads 
GROUP BY Major_category"""
read_query(q)

Unnamed: 0,Major_category,Share_Employed
0,Agriculture & Natural Resources,0.083699
1,Arts,0.100844
2,Biology & Life Science,0.047654
3,Business,0.064305
4,Communications & Journalism,0.210557
5,Computers & Mathematics,0.072328
6,Education,0.053637
7,Engineering,0.026964
8,Health,0.066948
9,Humanities & Liberal Arts,0.050843


In [None]:
q = """SELECT Major_category,
       SUM(Women) AS Total_women,
       AVG(ShareWomen) AS Mean_women,
       SUM(Total)*AVG(ShareWomen) AS Estimate_women
  FROM recent_grads
 GROUP BY Major_category, Major;"""

read_query(q)

Sometimes we want to select a subset of rows after performing a `GROUP BY` query. On the above, for instance, we may have wanted to select only those rows where `share_employed` is greater than `0.8`. We can't use the `WHERE` clause to do this because `share_employed` isn't a column in `recent_grads`; it's actually a virtual column generated by the `GROUP BY` statement.

When we want to filter on a column generated by a `GROUP BY` query, we can use the `HAVING` statement

In [19]:
# major categories where the share of graduates with low-wage jobs is greater than .1.

q = '''SELECT Major_category, 
ROUND(AVG(Low_wage_jobs)/AVG(Total),2) as share_low_wage
       From recent_grads
       Group by Major_category
       Having share_low_wage > 0.1'''
read_query(q)

Unnamed: 0,Major_category,share_low_wage
0,Arts,0.17
1,Communications & Journalism,0.13
2,Humanities & Liberal Arts,0.13
3,Industrial Arts & Consumer Services,0.12
4,Law & Public Policy,0.12
5,Psychology & Social Work,0.12


When executing a SQL query, the computer runs the clauses in this order:
    
1. FROM
2. WHERE
3. GROUP BY
4. HAVING
5. SELECT
6. ORDER BY
7. LIMIT

We can use the `PRAGMA TABLE_INFO()` statement by itself to return the type, along with some other information, for each column:

In [21]:
q = '''PRAGMA TABLE_INFO(recent_grads)'''
read_query(q)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,Rank,INTEGER,0,,0
2,2,Major_code,INTEGER,0,,0
3,3,Major,TEXT,0,,0
4,4,Major_category,TEXT,0,,0
5,5,Total,INTEGER,0,,0
6,6,Sample_size,INTEGER,0,,0
7,7,Men,INTEGER,0,,0
8,8,Women,INTEGER,0,,0
9,9,ShareWomen,REAL,0,,0


If we try to divide 2 integer columns (`Women` and `Total`), SQL will round down and return integer values:
We need to instead use the `CAST()` function to the **Float** type:

In [22]:
q = """SELECT Women/Total SW from recent_grads limit 5"""
read_query(q)

Unnamed: 0,SW
0,0
1,0
2,0
3,0
4,0


In [28]:
q = '''Select Major_category,Cast(sum(Women) as Float)/sum(Total) SW  
       From recent_grads 
       Group By Major_category
       Order By SW''' # We can also cast into integer and String
read_query(q)

Unnamed: 0,Major_category,SW
0,Law & Public Policy,0.030585
1,Business,0.084743
2,Industrial Arts & Consumer Services,0.160249
3,Computers & Mathematics,0.209356
4,Engineering,0.219596
5,Communications & Journalism,0.250325
6,Arts,0.393327
7,Humanities & Liberal Arts,0.490051
8,Health,0.673588
9,Interdisciplinary,0.800911
