# SQLite Jupyter Notebook Experiments

See the environment setup notes in ./README.md

This file is meant as a parallel to the queries run in notes/210829_query_notes.txt, but this notebook runs entirely on SQLite rather than PostgreSQL.

NOTE: got this code rolling with examples from: https://mungingdata.com/sqlite/create-database-load-csv-python/

In [2]:
import re
import sqlite3
from pathlib import Path

import pandas as pd
from IPython.display import display, HTML

In [3]:
# Ensure the SQLite database file is present before connecting
dbPath = Path('../data/sqlLiteCensusDb.db')
if not dbPath.is_file():
    dbPath.touch()

# Open the connection; the cursor `c` is reused by the query cells further down
conn = sqlite3.connect(dbPath)
c = conn.cursor()

# Read the census CSV into a DataFrame and show it
censusDataDF = pd.read_csv('../data/sub-est2019_all.csv')
print("Dataframe of loaded census data:")
display(censusDataDF)

# Write the DataFrame to a SQLite table, replacing any existing table of that name
censusMainTableName = "all_2019"
censusDataDF.to_sql(
    name=censusMainTableName,
    con=conn,
    if_exists='replace',
    index=False,
)
print("Data loaded into sql")

Dataframe of loaded census data:


Unnamed: 0,sumlev,state,county,place,cousub,concit,primgeo_flag,funcstat,name,stname,...,popestimate2010,popestimate2011,popestimate2012,popestimate2013,popestimate2014,popestimate2015,popestimate2016,popestimate2017,popestimate2018,popestimate2019
0,40,1,0,0,0,0,0,A,Alabama,Alabama,...,4785437,4799069,4815588,4830081,4841799,4852347,4863525,4874486,4887681,4903185
1,162,1,0,124,0,0,0,A,Abbeville city,Alabama,...,2699,2694,2643,2628,2608,2600,2584,2575,2571,2560
2,162,1,0,460,0,0,0,A,Adamsville city,Alabama,...,4500,4493,4471,4449,4420,4390,4356,4327,4308,4281
3,162,1,0,484,0,0,0,A,Addison town,Alabama,...,751,750,743,742,739,734,731,726,723,718
4,162,1,0,676,0,0,0,A,Akron town,Alabama,...,355,347,347,343,338,339,333,332,331,328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81429,157,56,43,99990,0,0,1,F,Balance of Washakie County,Wyoming,...,2782,2755,2734,2737,2690,2690,2650,2599,2554,2531
81430,50,56,45,0,0,0,0,A,Weston County,Wyoming,...,7198,7142,7077,7136,7138,7208,7220,6968,6924,6927
81431,157,56,45,56215,0,0,1,A,Newcastle city,Wyoming,...,3530,3506,3476,3483,3484,3513,3530,3409,3394,3397
81432,157,56,45,79125,0,0,1,A,Upton town,Wyoming,...,1098,1089,1081,1088,1095,1104,1105,1062,1057,1056


Data loaded into sql


In [None]:
# Report the total number of rows in the main census table
print("How many rows are in the db:")
rowCountSql = "SELECT count(*) FROM " + censusMainTableName
c.execute(rowCountSql).fetchall()

In [None]:
# States that grew the most (rows with sumlev=40), comparing the 2019 estimate
# against the 2010 census count. Both sides are cast to integer before subtracting.
stateGrowthSql = (
    "select sumlev, name, stname, popestimate2019, census2010pop, "
    "cast(popestimate2019 as integer)-cast(census2010pop as integer) as stephdiff "
    "from all_2019 where sumlev=40 order by stephdiff desc;"
)
c.execute(stateGrowthSql).fetchall()

In [None]:
# learned a lesson about data types: cast both columns to integer before subtracting
castDiffSql = (
    "select sumlev, name, stname, popestimate2019, popestimate2010, "
    "cast(popestimate2019 as integer)-cast(popestimate2010 as integer) as stephdiff "
    "from all_2019 order by stephdiff desc;"
)
c.execute(castDiffSql).fetchall()


In [None]:
# Rows whose name matches '%ownsend%', largest 2010 -> 2019 estimate change first
townsendSql = (
    "select sumlev, name, stname, popestimate2019, popestimate2010, "
    "popestimate2019-popestimate2010 as stephdiff "
    "from all_2019 where name like '%ownsend%' order by stephdiff desc;"
)
c.execute(townsendSql).fetchall()

In [None]:

# just testing with casts, works!
# Show the row count explicitly: only a cell's last expression is rendered
# automatically, so the intermediate result needs display().
display(c.execute(f"SELECT count(*) FROM {censusMainTableName}").fetchall())

# The SQL has to go through the cursor: written as a bare line it is not valid
# Python and the whole cell fails with a SyntaxError.
c.execute(
    "select sumlev, name, stname, popestimate2019, popestimate2010, "
    "popestimate2019-popestimate2010 as stephdiff "
    "from all_2019 order by stephdiff desc;"
).fetchall()


In [None]:
# trying to figure out which sumlev codes apply to a town like PT:
sumlevLookupSql = (
    "select sumlev, name, stname, popestimate2019, popestimate2010, "
    "popestimate2019-popestimate2010 as stephdiff "
    "from all_2019 where name like '%ownsend%';"
)
c.execute(sumlevLookupSql).fetchall()

# Relevant rows from the result:
# 162 | Port Townsend city | Washington    |            9831 |            9119 |       712
# 157 | Port Townsend city | Washington    |            9831 |            9119 |       712
# so 162 or 157, let's go with 162 for now

In [None]:
# Getting to some real stuff here!  For towns under 10000, who lost the most population?
# (sumlev = 162 rows only, most negative change first)
smallTownLossSql = (
    "select sumlev, name, stname, popestimate2019, census2010pop, popestimate2010, "
    "popestimate2019-popestimate2010 as stephdiff "
    "from all_2019 where popestimate2019 < 10000 and sumlev = 162 "
    "order by stephdiff asc;"
)
c.execute(smallTownLossSql).fetchall()

In [None]:

# Change as a fraction of the 2010 estimate.
# Places whose 2010 estimate was zero are filtered out (popestimate2010 > 0),
# since the ratio is undefined for them.
# The leading 1.0* forces real arithmetic: the popestimate columns are stored as
# integers, and SQLite's "/" on two integers truncates, which would collapse
# nearly every ratio to 0 (or -1).
fractionalChangeSql = (
    "select sumlev, name, stname, popestimate2019, census2010pop, popestimate2010, "
    "1.0*(popestimate2019-popestimate2010)/popestimate2010 as stephdiff "
    "from all_2019 where sumlev = 162 and popestimate2010 > 0 "
    "order by stephdiff asc;"
)
c.execute(fractionalChangeSql).fetchall()




In [None]:
# Count the rows of a single sumlev type (162):
sumlev162CountSql = "select count(*) from all_2019 where sumlev = 162;"
c.execute(sumlev162CountSql).fetchall()

# > 19502

In [None]:
# Show a count of each sumlev type:
sumlevCountsSql = "select distinct sumlev, count(*) from all_2019 group by sumlev;"
c.execute(sumlevCountsSql).fetchall()

# Result noted from an earlier run:
#  sumlev | count 
# --------+-------
#      40 |    51  #States?
#      50 |  3142  #Counties?
#      61 | 21063
#      71 | 13839
#     157 | 23714
#     162 | 19502
#     170 |     8
#     172 |   115

In [None]:
# Show all of the states (sumlev = 40) by descending growth rate, as a percentage.
# 100.0* (rather than 100*) keeps the division in real arithmetic: with integer
# operands SQLite truncates the quotient, so states with the same whole-number
# percentage would tie and be ordered arbitrarily.
stateGrowthRateSql = (
    "select sumlev, name, stname, popestimate2019, census2010pop, popestimate2010, "
    "100.0*(popestimate2019-popestimate2010)/popestimate2010 as stephdiff "
    "from all_2019 where sumlev = 40 order by stephdiff desc;"
)
c.execute(stateGrowthRateSql).fetchall()

In [None]:
# Show the total growth of the country using sums of the state data (and format the big numbers nicely):
# printf number formatting see: https://database.guide/format-numbers-with-a-comma-in-sqlite
nationalTotalsSql = (
    "select count(sumlev), count(stname), "
    "printf('%,d', sum(popestimate2019)) as pop2019, "
    "printf('%,d', sum(cast(census2010pop as integer))) as pop2010 "
    "from all_2019 where sumlev = 40;"
)
c.execute(nationalTotalsSql).fetchall()



In [None]:
# Look for towns between 7000 and 10000 that have decreased in size:
# NOTE: This query required some PostgreSQL -> SQLite REGEX conversion thinking.
# There were two routes to take: introduce a user-defined regexp function, OR do a
# number-formatting equality check to verify that a value looks like a number.

# For introducing a user defined regexp function, see: https://stackoverflow.com/a/58495085/2242421
def _sqliteRegexp(pattern, value):
    """Back SQLite's `value REGEXP pattern` operator; return 1 on a match, else 0.

    SQLite invokes the function with the pattern first and the tested value
    second. NULL on either side counts as no match. Non-string values (for
    example integer columns) are converted with str() so re.search does not
    raise TypeError.
    """
    if pattern is None or value is None:
        return 0
    text = value if isinstance(value, str) else str(value)
    return 1 if re.search(pattern, text) else 0


conn.create_function('regexp', 2, _sqliteRegexp)
# Example usage in a WHERE clause:
#     popestimate2010 REGEXP '^\\d+$'

# For the printf digit comparison approach, see: https://stackoverflow.com/a/32528946/2242421

# 100.0* keeps the percentage in real arithmetic. With integer operands SQLite
# truncates toward zero, so any decrease smaller than 1% became 0 and was
# dropped by the "stephdiff < 0" filter.
c.execute('''
    select sumlev, name, stname, popestimate2019, census2010pop, popestimate2010,
        100.0*(popestimate2019-popestimate2010)/popestimate2010 as stephdiff
    from all_2019 where
        printf('%d', popestimate2010) = popestimate2010
        and popestimate2010 > 0
        and sumlev = 162
        and popestimate2019 > 7000
        and popestimate2019 < 10000
        and stephdiff < 0
    order by stephdiff desc;
''').fetchall()


In [None]:

# Towns (sumlev = 162) with a 2019 estimate between 7000 and 10000 that shrank
# since 2010, smallest percentage loss first.
# 100.0* keeps the percentage in real arithmetic: with integer operands SQLite
# truncates toward zero, so decreases under 1% would evaluate to 0 and be
# dropped by the "stephdiff < 0" filter.
shrinkingTownsSql = (
    "select sumlev, name, stname, popestimate2019, census2010pop, popestimate2010, "
    "100.0*(popestimate2019-popestimate2010)/popestimate2010 as stephdiff "
    "from all_2019 where sumlev = 162 "
    "and popestimate2019 > 7000 and popestimate2019 < 10000 "
    "and stephdiff < 0 order by stephdiff desc;"
)
c.execute(shrinkingTownsSql).fetchall()

In [5]:
# Fetch every row of the main table as a list of raw tuples
selectAllSql = "SELECT * FROM all_2019"
c.execute(selectAllSql).fetchall()

[(40,
  1,
  0,
  0,
  0,
  0,
  0,
  'A',
  'Alabama',
  'Alabama',
  '4779736',
  4780125,
  4785437,
  4799069,
  4815588,
  4830081,
  4841799,
  4852347,
  4863525,
  4874486,
  4887681,
  4903185),
 (162,
  1,
  0,
  124,
  0,
  0,
  0,
  'A',
  'Abbeville city',
  'Alabama',
  '2688',
  2705,
  2699,
  2694,
  2643,
  2628,
  2608,
  2600,
  2584,
  2575,
  2571,
  2560),
 (162,
  1,
  0,
  460,
  0,
  0,
  0,
  'A',
  'Adamsville city',
  'Alabama',
  '4522',
  4506,
  4500,
  4493,
  4471,
  4449,
  4420,
  4390,
  4356,
  4327,
  4308,
  4281),
 (162,
  1,
  0,
  484,
  0,
  0,
  0,
  'A',
  'Addison town',
  'Alabama',
  '758',
  754,
  751,
  750,
  743,
  742,
  739,
  734,
  731,
  726,
  723,
  718),
 (162,
  1,
  0,
  676,
  0,
  0,
  0,
  'A',
  'Akron town',
  'Alabama',
  '356',
  356,
  355,
  347,
  347,
  343,
  338,
  339,
  333,
  332,
  331,
  328),
 (162,
  1,
  0,
  820,
  0,
  0,
  0,
  'A',
  'Alabaster city',
  'Alabama',
  '30352',
  31112,
  31209,
  313

In [6]:
# Same full-table fetch, wrapped in a DataFrame for tabular display
# (columns are positional because only the row tuples are passed in)
allRows = c.execute("SELECT * FROM all_2019").fetchall()
pd.DataFrame(allRows)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,40,1,0,0,0,0,0,A,Alabama,Alabama,...,4785437,4799069,4815588,4830081,4841799,4852347,4863525,4874486,4887681,4903185
1,162,1,0,124,0,0,0,A,Abbeville city,Alabama,...,2699,2694,2643,2628,2608,2600,2584,2575,2571,2560
2,162,1,0,460,0,0,0,A,Adamsville city,Alabama,...,4500,4493,4471,4449,4420,4390,4356,4327,4308,4281
3,162,1,0,484,0,0,0,A,Addison town,Alabama,...,751,750,743,742,739,734,731,726,723,718
4,162,1,0,676,0,0,0,A,Akron town,Alabama,...,355,347,347,343,338,339,333,332,331,328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81429,157,56,43,99990,0,0,1,F,Balance of Washakie County,Wyoming,...,2782,2755,2734,2737,2690,2690,2650,2599,2554,2531
81430,50,56,45,0,0,0,0,A,Weston County,Wyoming,...,7198,7142,7077,7136,7138,7208,7220,6968,6924,6927
81431,157,56,45,56215,0,0,1,A,Newcastle city,Wyoming,...,3530,3506,3476,3483,3484,3513,3530,3409,3394,3397
81432,157,56,45,79125,0,0,1,A,Upton town,Wyoming,...,1098,1089,1081,1088,1095,1104,1105,1062,1057,1056
