In [12]:
from sqlalchemy import create_engine, Column, Integer, String, Float, Date, ForeignKey
from sqlalchemy.orm import sessionmaker, relationship, backref
from sqlalchemy.ext.declarative import declarative_base
import sqlite3

Connect to database using sqlite

In [24]:
# Raw DB-API connection to the SQLite file (the file is created if missing).
# NOTE(review): `con` is not used or closed by any cell visible here; the
# SQLAlchemy engine created below opens its own connections.
con = sqlite3.connect('database.db')

Set up the SQLAlchemy engine, session factory and declarative base

In [25]:
# Engine for the SQLite file database.db (path relative to the working
# directory); echo=False keeps SQLAlchemy from logging the SQL it emits.
engine = create_engine('sqlite:///database.db', echo=False)

In [26]:
# Session factory bound to the engine; calling Session() opens a new session.
Session = sessionmaker(bind = engine)

In [27]:
# Common base class for the mapped classes; each subclass registers its table
# on Base.metadata.
Base = declarative_base()

Create classes that map to tables using sqlalchemy's declarative base

In [28]:
class Station(Base):
    """ORM mapping for the ``station`` table.

    ``stations_id`` is the primary key and the target of the foreign key
    declared on ``Measurement.stations_id``. Only ``stationsname`` is declared
    NOT NULL; the remaining non-key columns are nullable. The backref on
    ``Measurement.station`` adds a ``measurements`` collection to this class.
    """
    __tablename__ = 'station'
    
    stations_id = Column(Integer, primary_key=True)
    # Column names are German. von_datum / bis_datum ("from date" / "until
    # date") presumably bound the station's period of record -- TODO confirm.
    von_datum = Column(Date)
    bis_datum = Column(Date)
    stationshoehe = Column(Integer)  # "station height"; unit not stated here
    geoBreite = Column(Float)  # "geographische Breite" = latitude
    geoLaenge = Column(Float)  # "geographische Laenge" = longitude
    stationsname = Column(String, nullable=False)  # station name (required)
    bundesland = Column(String)  # German federal state


class Measurement(Base):
    """ORM mapping for the ``measurement`` table.

    A row is identified by the composite primary key
    ``(mess_datum, stations_id)``. ``stations_id`` references
    ``station.stations_id``; the ``station`` relationship exposes the parent
    ``Station`` and its backref adds ``Station.measurements``, ordered by
    ``mess_datum``.

    NOTE(review): the short column codes (fx, fm, rsk, ...) presumably mirror
    the column names of the imported data files; their meanings and units are
    not documented in this notebook -- confirm against the data provider.
    """
    __tablename__ = 'measurement'
    
    # TODO: switch this column to Date later; it is kept as Integer for now
    # because of problems with Date on SQLite.
    mess_datum = Column(Integer, primary_key=True)
    stations_id = Column(Integer, ForeignKey('station.stations_id'), primary_key=True)
    qn_3 = Column(Integer)  # quality level of the columns that follow
    fx = Column(Float)
    fm = Column(Float)
    qn_4 = Column(Integer)  # presumably the quality level of the remaining columns -- confirm
    rsk = Column(Float)
    rskf = Column(Float)
    sdk = Column(Float)
    shk_tag = Column(Float)
    nm = Column(Float)
    vpm = Column(Float)
    pm = Column(Float)
    tmk = Column(Float)
    upm = Column(Float)
    txk = Column(Float)
    tnk = Column(Float)
    tgk = Column(Float)
    
    # Many-to-one link to Station; the backref creates Station.measurements,
    # sorted by mess_datum.
    station = relationship('Station', backref=backref('measurements', order_by=mess_datum))

This has already set up the metadata for us to create the tables.

So create them

In [29]:
# Emit CREATE TABLE for every table registered on Base.metadata; tables that
# already exist in the database are skipped (checkfirst defaults to True).
Base.metadata.create_all(engine)

In [30]:
import pandas as pd
import numpy as np

Read two measurement tables (one text file per station)

In [31]:
# Load the ';'-separated file with its first column as the index, discard the
# 'eor' column, trim whitespace from the remaining column labels and replace
# every -999 with NaN (presumably the file's missing-value marker).
data_3 = (
    pd.read_table('data_00003.txt', sep=';', index_col=0)
    .drop(columns='eor')
    .rename(columns=str.strip)
    .replace(-999, np.nan)
)

In [32]:
# Load the ';'-separated file with its first column as the index, discard the
# 'eor' column, trim whitespace from the remaining column labels and replace
# every -999 with NaN (presumably the file's missing-value marker).
data_1 = (
    pd.read_table('data_00001.txt', sep=';', index_col=0)
    .drop(columns='eor')
    .rename(columns=str.strip)
    .replace(-999, np.nan)
)

Write them to the database

In [33]:
# Append the rows to the existing 'measurement' table; the DataFrame index is
# written as a column too (to_sql default index=True).
# NOTE(review): re-running this cell appends the same rows again, which would
# presumably violate the table's composite primary key -- confirm.
data_3.to_sql('measurement', engine, if_exists='append')

In [34]:
# Append the rows to the existing 'measurement' table; the DataFrame index is
# written as a column too (to_sql default index=True).
# NOTE(review): re-running this cell appends the same rows again, which would
# presumably violate the table's composite primary key -- confirm.
data_1.to_sql('measurement', engine, if_exists='append')

Use pandas to load the whole table and extract the distinct station ids (timed as a baseline)

In [35]:
# Baseline: read the whole 'measurement' table into a DataFrame, then
# de-duplicate stations_id in memory. The assignment inside %timeit is local
# to the timed statement, so `df` is not defined afterwards.
%timeit df = pd.read_sql('measurement', engine)['stations_id'].unique()

360 ms ± 2.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
# One session opened from the factory; the cells below reuse it.
session = Session()


def distinct_ids():
    """Return the distinct ``stations_id`` values of the measurement table.

    DISTINCT is applied in the database so that each value is reported only
    once. The result is a list with one row object per distinct id.
    """
    return session.query(Measurement.stations_id).distinct().all()

In [37]:
# Time the database-side alternative: SELECT DISTINCT through the ORM session.
%timeit df = distinct_ids()

4.08 ms ± 39.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Check how measurements are automatically linked to stations by adding some dummy stations to the database

In [38]:
# Stage two dummy stations in the session; only the required columns are set,
# all other columns stay NULL. Ids 1 and 3 presumably match the stations_id
# values in data_00001.txt / data_00003.txt -- confirm.
session.add_all([Station(stations_id = 1, stationsname = 'Aach'),
                 Station(stations_id = 3, stationsname = 'Aachen')])

In [40]:
# Flush the pending Station rows to the database and commit the transaction.
session.commit()

In [41]:
# For every station named 'Aach', print how many measurement rows the
# `measurements` backref collection holds.
for s in session.query(Station).filter(Station.stationsname == 'Aach'):
    print(len(s.measurements))

17348


In [42]:
# Done with the ORM session: close it.
session.close()