In [3]:
from sqlite3 import dbapi2 as sq3
import os
import pandas as pd
import urllib, json
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)
pd.set_option('display.notebook_repr_html', True)

### Create the database

In [8]:
ourschema = '''
DROP TABLE IF EXISTS 'nodes';
DROP TABLE IF EXISTS 'nodes_tags';
DROP TABLE IF EXISTS 'ways';
DROP TABLE IF EXISTS 'ways_tags';
DROP TABLE IF EXISTS 'ways_nodes';
CREATE TABLE nodes (
    id INTEGER PRIMARY KEY NOT NULL,
    lat REAL,
    lon REAL,
    user TEXT,
    uid INTEGER,
    version INTEGER,
    changeset INTEGER,
    timestamp TEXT
);

CREATE TABLE nodes_tags (
    id INTEGER,
    key TEXT,
    value TEXT,
    type TEXT,
    FOREIGN KEY (id) REFERENCES nodes(id)
);

CREATE TABLE ways (
    id INTEGER PRIMARY KEY NOT NULL,
    user TEXT,
    uid INTEGER,
    version TEXT,
    changeset INTEGER,
    timestamp TEXT
);

CREATE TABLE ways_tags (
    id INTEGER NOT NULL,
    key TEXT NOT NULL,
    value TEXT NOT NULL,
    type TEXT,
    FOREIGN KEY (id) REFERENCES ways(id)
);

CREATE TABLE ways_nodes (
    id INTEGER NOT NULL,
    node_id INTEGER NOT NULL,
    position INTEGER NOT NULL,
    FOREIGN KEY (id) REFERENCES ways(id),
    FOREIGN KEY (node_id) REFERENCES nodes(id)
);'''

In [5]:
PATHSTART="."
def get_db(dbfile):
    sqlite_db = sq3.connect(os.path.join(PATHSTART, dbfile))
    return sqlite_db

Sqlite is a text or memory based database. Connect and get a DBAPI2 connection.

Drop tables if they exist and create them.

In [28]:
def init_db(dbfile, schema):
    """Creates the database tables."""
    db = get_db(dbfile)
    db.cursor().executescript(schema)
    db.commit()
    return db

Use Pandas to read in the data

In [35]:
#Initiate the Database - 
db=init_db("Warsaw_Poland.db", ourschema)

#Read the CSV's and populate with Pandas(if_exist=replace istead of append solved the IS NULL error)
for f in os.listdir('./CSV_Files/'):
    df=pd.read_csv("./CSV_Files/{}".format(f), sep=',', encoding='utf-8')
    
    df.to_sql(f[14:-4], db, if_exists="replace", index=False, schema=ourschema)


In [6]:
db = get_db('Warsaw_Poland.db')

In [7]:
def make_query(sel):
    c=db.cursor().execute(sel)
    return c.fetchall()

In [8]:
cont_cols = [e[1] for e in make_query("PRAGMA table_info(ways_tags);")]
cont_cols

[u'id', u'key', u'value', u'type']

In [9]:
def make_frame(list_of_tuples, legend=cont_cols):
    framelist=[]
    for i, cname in enumerate(legend):
        framelist.append((cname,[e[i] for e in list_of_tuples]))
    return pd.DataFrame.from_items(framelist)

### All ways_tags keys and value counts

In [39]:
columns = ' DISTINCT key,' + ' count(key)'
query = 'SELECT '+ columns + ' FROM ways_tags GROUP BY key ORDER BY count(key) DESC'.format(columns)
pd.DataFrame(make_query(query), columns=[cols for cols in columns.split(',')])

Unnamed: 0,DISTINCT key,count(key)
0,building,324088
1,highway,175659
2,housenumber,139472
3,city,138127
4,street,137226
5,postcode,131824
6,addr,116319
7,city:simc,102973
8,street:sym_ul,97547
9,name,67942


In [19]:
columns = ' ways.user'  
query = 'SELECT '+ columns +\
            ' FROM ways, ways_tags, \
            
            (SELECT DISTINCT user, value, count(value) as ct\
            FROM ways, ways_tags\
        WHERE ways.id=ways_tags.id\
        AND key="building"\
        GROUP BY value) as sbq\
        WHERE ways.id=ways_tags.id\
        AND ct<=10\
'.format(columns)

pd.DataFrame(make_query(query), columns=[cols for cols in columns.split(',')])

KeyboardInterrupt: 

In [27]:
query ='SELECT DISTINCT user, value, count(value) as ct\
            FROM ways, ways_tags\
        WHERE ways.id=ways_tags.id\
        AND key="building"\
        GROUP BY value'
make_query(query)

[(u'KPPSPOTWOCK', u'Budynek Zabytkowy', 1),
 (u'Alkomat', u'Lokal', 1),
 (u'WiktorN-import', u'National_Monument', 1),
 (u'Edyta | Yanosik', u'Plutonowego J\xf3zefa Cie\u0107wierza', 1),
 (u'korni', u'apartments', 17998),
 (u'trzebinik', u'barn', 58),
 (u'kocio', u'belfry', 3),
 (u'zbyki', u'bridge', 2),
 (u'hormel', u'bunker', 1),
 (u'HanyuuFurude', u'cabin', 10),
 (u'Mala', u'cage', 1),
 (u'rmikke', u'castle', 2),
 (u'rmikke', u'cathedral', 1),
 (u'Mi\u0142y Kuc', u'chapel', 29),
 (u'Ciemek', u'church', 253),
 (u'Andrzej3345', u'civ', 1),
 (u'Deuar', u'civic', 153),
 (u'Rados\u0142aw Botev', u'commercial', 1580),
 (u'lemacik', u'conservatory', 4),
 (u'Mi\u0142y Kuc', u'construction', 196),
 (u'lemacik', u'container', 6),
 (u'balrog-kun', u'corridor', 2),
 (u'd3mol3k', u'cowshed', 4),
 (u'kocio', u'daycare', 1),
 (u'Javnik', u'detached', 388),
 (u'Etua', u'dormitory', 14),
 (u'masti', u'embassy', 1),
 (u'Heemskerck', u'empty', 1),
 (u'Mala', u'enclosing', 20),
 (u'WiktorN-import', u'f

To do: Delete the keys with values no and group all the entries below 10

### Value counts when key=shop

In [16]:
columns = ' DISTINCT value,' + ' count(value)'
query = 'SELECT '+ columns + '\
            FROM ways_tags\
        WHERE key = "shop"\
        GROUP BY value\
        ORDER BY count(value) DESC'.format(columns)

pd.DataFrame(make_query(query), columns=[cols for cols in columns.split(',')])

Unnamed: 0,DISTINCT value,count(value)
0,supermarket,308
1,convenience,253
2,kiosk,180
3,car_repair,142
4,car,126
5,mall,106
6,yes,80
7,florist,66
8,greengrocer,53
9,doityourself,41


Who is the uuser(or users) who is making these custom entries

In [None]:
columns = ' DISTINCT value,' + ' count(value)'
query = 'SELECT '+ columns + '\
            FROM ways_tags\
        WHERE key = "shop"\
        GROUP BY value\
        ORDER BY count(value) DESC'.format(columns)

pd.DataFrame(make_query(query), columns=[cols for cols in columns.split(',')])

In [51]:
key= '"colour"'
columns = ' DISTINCT value,' + ' count(value)'
query = 'SELECT  {} \
            FROM ways_tags\
        WHERE key = {}\
        GROUP BY value\
        ORDER BY count(value) DESC'.format(columns, key)

pd.DataFrame(make_query(query), columns=[cols for cols in columns.split(',')])

Unnamed: 0,DISTINCT value,count(value)
0,#666666,751
1,#dddddd,321
2,red,282
3,#999999,265
4,#ffffff,244
5,#333333,148
6,#e9d8c5,119
7,#222222,107
8,black,95
9,#deeded,87


Found an incosistency here with the guidelines

Let's try to load the whole OSM now

#### Or populate with SQL INSERT

In [None]:
ins="""
INSERT INTO candidates (id, first_name, last_name, middle_name, party) \
    VALUES (?,?,?,?,?);
"""
with open("candidates.txt") as fd:
    slines =[l.strip().split('|') for l in fd.readlines()]
    for line in slines[1:]:
        theid, first_name, last_name, middle_name, party = line
        print (theid, first_name, last_name, middle_name, party)
        valstoinsert = (int(theid), first_name, last_name, middle_name, party)
        print (ins, valstoinsert)
        db.cursor().execute(ins, valstoinsert)