In [1]:
import pymongo
import pandas as pd
import json
import csv
from dotenv import load_dotenv, find_dotenv
import os
import pprint
from bson.objectid import ObjectId
from datetime import datetime as dt

In [2]:
# Load variables from a local .env file (if one is found) before reading the
# password: load_dotenv/find_dotenv are imported at the top of the notebook
# but were never called, so a .env-only password used to come back as None.
load_dotenv(find_dotenv())
password = os.environ.get("MONGODB_PWD")
if not password:
    # Fail here instead of later with an opaque authentication error caused by
    # the literal text "None" being interpolated into the connection string.
    raise RuntimeError("MONGODB_PWD is not set (checked the process environment and .env)")

In [3]:
# Atlas SRV connection string for user "hubruk"; the password is interpolated as-is.
# NOTE(review): pymongo expects reserved characters in the password to be
# percent-escaped (urllib.parse.quote_plus) - confirm the stored value is URI-safe.
client_string = f"""mongodb+srv://hubruk:{password}@cluster0.2lppi7r.mongodb.net/myFirstDatabase"""

In [4]:
# Create the MongoDB client from the connection string built in the previous cell.
client = pymongo.MongoClient(client_string)

#

In [5]:
def csv2dict(path):
    """Read a CSV file into sparse row dicts plus the list of first-column values.

    The first line of the file is used as the header. Every following
    non-blank line becomes one dict mapping header name -> cell text, with
    empty cells left out entirely.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A ``(rows, first_column)`` tuple of two parallel lists: ``rows[i]`` is
        the dict built from the i-th data line and ``first_column[i]`` is the
        raw text of that line's first cell. An empty file yields ``([], [])``.

    Raises:
        IndexError: If a data line has a non-empty cell beyond the header width.
    """
    rows = []
    first_column = []
    with open(path, newline='') as f_input:
        reader = csv.reader(f_input)
        header = next(reader, None)
        if header is None:
            # Completely empty file: nothing to load.
            return rows, first_column
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; row[0] would raise on them.
                continue
            first_column.append(row[0])
            rows.append({header[i]: val for i, val in enumerate(row) if val})
    return rows, first_column

In [6]:
def load_all_from_csv(data_dir='national_health'):
    """Load the six survey CSV files found in ``data_dir``.

    Paths are built with ``os.path.join`` so they work on every platform; the
    previous ``'national_health\\...'`` literals relied on invalid escape
    sequences and on a Windows path separator.

    Args:
        data_dir: Directory holding demographic.csv, diet.csv, examination.csv,
            labs.csv, medications.csv and questionnaire.csv.

    Returns:
        A 12-tuple ``(rows, ids)`` pair per file, in the order demographic,
        diet, examination, labs, medications, questionnaire, exactly as
        returned by ``csv2dict`` for each file.
    """
    file_stems = ('demographic', 'diet', 'examination', 'labs', 'medications', 'questionnaire')
    loaded = []
    for stem in file_stems:
        rows, ids = csv2dict(os.path.join(data_dir, stem + '.csv'))
        loaded.extend((rows, ids))
    return tuple(loaded)

In [42]:
# Load every CSV once into notebook-level names: for each file, one list of row
# dicts and one parallel list of first-column values (the *_SEQNs names).
# Later scratch cells read these names directly.
demographic, dem_SEQNs, diet, diet_SEQNs, examination, exam_SEQNs, labs, labs_SEQNs, medications, meds_SEQNs, questionnaire, ques_SEQNs =  load_all_from_csv()

In [8]:
# Handle to the "national_health" database; the helper functions below use it as a global.
db = client.national_health

In [9]:
print(db)

Database(MongoClient(host=['ac-ltqtr0s-shard-00-00.2lppi7r.mongodb.net:27017', 'ac-ltqtr0s-shard-00-01.2lppi7r.mongodb.net:27017', 'ac-ltqtr0s-shard-00-02.2lppi7r.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-balcde-shard-0', tls=True), 'national_health')


[{},{}]
Przy insert_many pętla przejdzie po liście, wpisując każde {} jako osobny dokument.

In [10]:
def collections_in_db():
    """Return the names of the collections currently present in ``db``."""
    return db.list_collection_names()

In [11]:
collections_in_db()

['labs', 'examination', 'questionnaire', 'diet', 'demographic', 'medications']

Dobra, czyli mam teraz po 2 listy, np. demographic i dem_SEQNs – obie listy są równej długości, nie ma tutaj unikalnych identyfikatorów.

In [18]:
def removekey(d, key):
    """Return a shallow copy of ``d`` that no longer contains ``key``.

    ``d`` itself is left untouched. Raises KeyError when ``key`` is absent.
    """
    trimmed = dict(d)
    trimmed.pop(key)
    return trimmed

[
    {
        "SEQN" : "nr",
        "demographic_id" : "nr",
        ...
    },
    ...
]

In [73]:
def insert_2_many_collections(do = True):
    """Rebuild the six per-file collections and the ``patient_relations`` collection.

    Each CSV row is inserted (without its 'SEQN' key) into the collection named
    after its file. ``patient_relations`` then receives one document per
    distinct SEQN of the form ``{"SEQN": ..., "<file>_id": ObjectId, ...}``
    linking the patient to the inserted rows.

    Args:
        do: When falsy nothing is touched and ``None`` is returned.

    Returns:
        ``(dem_ids, diet_ids, ex_ids, l_ids, m_ids, q_ids, relation_ids)`` -
        the inserted ObjectId lists, or ``None`` when ``do`` is falsy.

    NOTE(review): when a file has several rows for one SEQN (medications does),
    only the id of the last such row is kept in the relation document, as
    before. Confirm whether a list of ids is wanted instead.
    """
    if not do:
        return None

    names = ('demographic', 'diet', 'examination', 'labs', 'medications', 'questionnaire')

    # Drop whatever a previous run left behind. The earlier checks were written
    # as `'x' in collections_in_db() == True`, a chained comparison that is
    # always False, so nothing was dropped and every re-run duplicated the data.
    # patient_relations is dropped too: its documents point at ids of rows that
    # are being replaced.
    existing = set(collections_in_db())
    for name in names + ('patient_relations',):
        if name in existing:
            db[name].drop()

    # load_all_from_csv returns (rows, ids) pairs in the same order as `names`.
    loaded = load_all_from_csv()
    rows_by_name = dict(zip(names, loaded[0::2]))
    seqns_by_name = dict(zip(names, loaded[1::2]))

    # Strip the patient id from every row first, then insert file by file.
    docs_by_name = {
        name: [removekey(row, 'SEQN') for row in rows_by_name[name]]
        for name in names
    }
    ids_by_name = {
        name: db[name].insert_many(docs_by_name[name]).inserted_ids
        for name in names
    }

    # Single pass per file instead of rescanning every id list for every
    # patient: inserted_ids is parallel to the rows, hence to the SEQN list.
    relations = {}
    for name in names:
        for seqn, mongo_id in zip(seqns_by_name[name], ids_by_name[name]):
            relations.setdefault(seqn, {"SEQN" : seqn})[name + "_id"] = mongo_id

    patient_relation_mongo_ids = db.patient_relations.insert_many(list(relations.values())).inserted_ids

    return (
        ids_by_name['demographic'],
        ids_by_name['diet'],
        ids_by_name['examination'],
        ids_by_name['labs'],
        ids_by_name['medications'],
        ids_by_name['questionnaire'],
        patient_relation_mongo_ids,
    )

In [26]:
# insert_2_many_collections()

10175 10175


In [74]:
demographic_mongo_ids, diet_mongo_ids, examination_mongo_ids, labs_mongo_ids, medications_mongo_ids, questionnaire_mongo_ids, patient_relation_mongo_ids = insert_2_many_collections(True)

In [76]:
meds_SEQNs

['73557',
 '73557',
 '73558',
 '73558',
 '73558',
 '73558',
 '73559',
 '73559',
 '73559',
 '73559',
 '73559',
 '73560',
 '73561',
 '73561',
 '73561',
 '73561',
 '73562',
 '73562',
 '73562',
 '73562',
 '73562',
 '73562',
 '73562',
 '73563',
 '73564',
 '73564',
 '73564',
 '73565',
 '73566',
 '73566',
 '73566',
 '73567',
 '73568',
 '73569',
 '73570',
 '73571',
 '73571',
 '73571',
 '73572',
 '73573',
 '73574',
 '73575',
 '73575',
 '73575',
 '73576',
 '73577',
 '73578',
 '73579',
 '73580',
 '73581',
 '73582',
 '73583',
 '73584',
 '73585',
 '73586',
 '73587',
 '73588',
 '73589',
 '73590',
 '73591',
 '73592',
 '73593',
 '73594',
 '73595',
 '73596',
 '73597',
 '73597',
 '73597',
 '73598',
 '73599',
 '73600',
 '73601',
 '73602',
 '73603',
 '73604',
 '73604',
 '73604',
 '73604',
 '73604',
 '73604',
 '73605',
 '73606',
 '73607',
 '73607',
 '73607',
 '73607',
 '73607',
 '73608',
 '73609',
 '73610',
 '73611',
 '73612',
 '73613',
 '73613',
 '73613',
 '73613',
 '73613',
 '73613',
 '73613',
 '73614',


In [58]:
len(medications_mongo_ids), len(meds_SEQNs)

(20194, 20194)

In [70]:
# Scratch check of the relation-building idea, restricted to medications:
# every known patient gets an entry, and patients that appear in the
# medications file also get a "medications" marker (their own SEQN for now,
# standing in for the Mongo id).
all_patients_id_set = set(dem_SEQNs + diet_SEQNs + exam_SEQNs + labs_SEQNs + meds_SEQNs + ques_SEQNs)
patients_with_meds = set(meds_SEQNs)  # O(1) membership instead of rescanning the list per patient

relations_list = []
for patient in all_patients_id_set:
    relation = {"SEQN" : patient}
    if patient in patients_with_meds:
        relation["medications"] = patient
    relations_list.append(relation)


In [71]:
relations_list

[{'SEQN': '74887', 'medications': '74887'},
 {'SEQN': '74564', 'medications': '74564'},
 {'SEQN': '80753', 'medications': '80753'},
 {'SEQN': '81691', 'medications': '81691'},
 {'SEQN': '74124', 'medications': '74124'},
 {'SEQN': '78631', 'medications': '78631'},
 {'SEQN': '81685', 'medications': '81685'},
 {'SEQN': '75083', 'medications': '75083'},
 {'SEQN': '79213', 'medications': '79213'},
 {'SEQN': '80567', 'medications': '80567'},
 {'SEQN': '79137', 'medications': '79137'},
 {'SEQN': '78297', 'medications': '78297'},
 {'SEQN': '82332', 'medications': '82332'},
 {'SEQN': '82952', 'medications': '82952'},
 {'SEQN': '83113', 'medications': '83113'},
 {'SEQN': '73874', 'medications': '73874'},
 {'SEQN': '80901', 'medications': '80901'},
 {'SEQN': '74236', 'medications': '74236'},
 {'SEQN': '77665', 'medications': '77665'},
 {'SEQN': '74962', 'medications': '74962'},
 {'SEQN': '81961', 'medications': '81961'},
 {'SEQN': '80838', 'medications': '80838'},
 {'SEQN': '73662', 'medications'

In [45]:
demographic_mongo_ids

[ObjectId('639b424e5f96bff7a12fa03a'),
 ObjectId('639b424e5f96bff7a12fa03b'),
 ObjectId('639b424e5f96bff7a12fa03c'),
 ObjectId('639b424e5f96bff7a12fa03d'),
 ObjectId('639b424e5f96bff7a12fa03e'),
 ObjectId('639b424e5f96bff7a12fa03f'),
 ObjectId('639b424e5f96bff7a12fa040'),
 ObjectId('639b424e5f96bff7a12fa041'),
 ObjectId('639b424e5f96bff7a12fa042'),
 ObjectId('639b424e5f96bff7a12fa043'),
 ObjectId('639b424e5f96bff7a12fa044'),
 ObjectId('639b424e5f96bff7a12fa045'),
 ObjectId('639b424e5f96bff7a12fa046'),
 ObjectId('639b424e5f96bff7a12fa047'),
 ObjectId('639b424e5f96bff7a12fa048'),
 ObjectId('639b424e5f96bff7a12fa049'),
 ObjectId('639b424e5f96bff7a12fa04a'),
 ObjectId('639b424e5f96bff7a12fa04b'),
 ObjectId('639b424e5f96bff7a12fa04c'),
 ObjectId('639b424e5f96bff7a12fa04d'),
 ObjectId('639b424e5f96bff7a12fa04e'),
 ObjectId('639b424e5f96bff7a12fa04f'),
 ObjectId('639b424e5f96bff7a12fa050'),
 ObjectId('639b424e5f96bff7a12fa051'),
 ObjectId('639b424e5f96bff7a12fa052'),
 ObjectId('639b424e5f96bf

In [44]:
demographic_mongo_ids

[ObjectId('639b424e5f96bff7a12fa03a'),
 ObjectId('639b424e5f96bff7a12fa03b'),
 ObjectId('639b424e5f96bff7a12fa03c'),
 ObjectId('639b424e5f96bff7a12fa03d'),
 ObjectId('639b424e5f96bff7a12fa03e'),
 ObjectId('639b424e5f96bff7a12fa03f'),
 ObjectId('639b424e5f96bff7a12fa040'),
 ObjectId('639b424e5f96bff7a12fa041'),
 ObjectId('639b424e5f96bff7a12fa042'),
 ObjectId('639b424e5f96bff7a12fa043'),
 ObjectId('639b424e5f96bff7a12fa044'),
 ObjectId('639b424e5f96bff7a12fa045'),
 ObjectId('639b424e5f96bff7a12fa046'),
 ObjectId('639b424e5f96bff7a12fa047'),
 ObjectId('639b424e5f96bff7a12fa048'),
 ObjectId('639b424e5f96bff7a12fa049'),
 ObjectId('639b424e5f96bff7a12fa04a'),
 ObjectId('639b424e5f96bff7a12fa04b'),
 ObjectId('639b424e5f96bff7a12fa04c'),
 ObjectId('639b424e5f96bff7a12fa04d'),
 ObjectId('639b424e5f96bff7a12fa04e'),
 ObjectId('639b424e5f96bff7a12fa04f'),
 ObjectId('639b424e5f96bff7a12fa050'),
 ObjectId('639b424e5f96bff7a12fa051'),
 ObjectId('639b424e5f96bff7a12fa052'),
 ObjectId('639b424e5f96bf

Żeby zrobić tę jedną kolekcję z danymi, chcę otrzymać listę, gdzie każdy jej element to dict o strukturze:

In [27]:
# {
#     pacjent_id : ... ,
#     data : {
#         demographic : {
#             ...
#         } ,
#         ...
#     }
# }

In [28]:
# {
#     pacjent_id : ... ,
#     data : {
#       ...
#     }
# }

Czyli tak: chcę przejść przez listę wszystkich pacjentów (all_patients_id_set), dla każdego pliku znaleźć tego konkretnego pacjenta i dodać do data dane (w zależności od struktury ^^) – oczywiście wszystkie dane prócz id pacjenta.

In [29]:
# demographic, diet, examination, labs, medications, questionnaire

In [30]:
# list_for_sigle_collection_structure = []
# for pacjent_id in all_patients_id_set:
#     list_for_sigle_collection_structure.append(
#         {'SEQN' : pacjent_id
#             , 'data' : {
#                 #i tutaj wszystkie dane jak leci
#             }
#         })

Jednak trzeba trochę zmienić strukturę, żeby można było dodać wiele badań danego typu dla konkretnego użytkownika.

In [31]:
# {
#     pacjent_id : ... ,
#     data : {
#         demographic : {
#             ...
#         } ,
#         ...
#     }
# }
#->
# {
#     pacjent_id : ... ,
#     data : {
#         demographic : 
#           [   {
#                    ...
#               }
#           ,
#               {
#                    ...
#               }
#         
#           ] ,
#         ...
#     }
# }

In [35]:
def single_collection_2mongo(all_patients_id_set, demographic, diet, examination, labs, medications, questionnaire):
    """Build one nested document per patient for the single-collection layout.

    Args:
        all_patients_id_set: Iterable of patient ids (SEQN values); one
            document is produced per element, in iteration order.
        demographic, diet, examination, labs, medications, questionnaire:
            Lists of row dicts; every row must carry a 'SEQN' key.

    Returns:
        A list of documents shaped like::

            {'SEQN': <id>,
             'data': {'demographic': [...], 'diet': [...], 'examination': [...],
                      'labs': [...], 'medications': [...], 'questionnaire': [...]}}

        Each inner list holds that patient's rows (in input order) with the
        'SEQN' key removed, and is empty when the patient has none.

    Raises:
        KeyError: If a row has no 'SEQN' key.

    The input lists are no longer modified. Rows are grouped by SEQN once,
    replacing the per-patient copy-and-remove scan of all six lists.
    """
    sources = (
        ('demographic', demographic),
        ('diet', diet),
        ('examination', examination),
        ('labs', labs),
        ('medications', medications),
        ('questionnaire', questionnaire),
    )

    # source name -> {SEQN -> [row without its 'SEQN' key, ...]}
    grouped = {}
    for name, rows in sources:
        by_patient = {}
        for row in rows:
            seqn = row['SEQN']
            stripped = {k: v for k, v in row.items() if k != 'SEQN'}
            by_patient.setdefault(seqn, []).append(stripped)
        grouped[name] = by_patient

    return [
        {
            'SEQN' : pacjent_id,
            'data' : {name: grouped[name].get(pacjent_id, []) for name, _ in sources},
        }
        for pacjent_id in all_patients_id_set
    ]

In [36]:
def insert_2_pacjent_data_single_collection(do = True):
    """Recreate ``pacjent_data_single_collection`` from the CSV files.

    Drops the collection when it already exists, reloads all CSVs, builds one
    nested document per patient id and inserts them.

    Returns the list of inserted ids, or None when ``do`` is falsy.
    """
    if not do:
        return None

    if 'pacjent_data_single_collection' in collections_in_db():
        db.pacjent_data_single_collection.drop()

    (dem_rows, dem_keys, diet_rows, diet_keys, exam_rows, exam_keys,
     lab_rows, lab_keys, med_rows, med_keys, ques_rows, ques_keys) = load_all_from_csv()

    # Union of every id seen in any of the six files.
    patient_ids = set(dem_keys + diet_keys + exam_keys + lab_keys + med_keys + ques_keys)
    documents = single_collection_2mongo(patient_ids, dem_rows, diet_rows, exam_rows, lab_rows, med_rows, ques_rows)
    return db.pacjent_data_single_collection.insert_many(documents).inserted_ids

In [38]:
pacjent_single_collection_ids = insert_2_pacjent_data_single_collection(True)

In [None]:
# Shared pretty-printer used by the query helper functions below.
printer = pprint.PrettyPrinter()

# Single collection

In [None]:
def find_all_SEQN():
    """Pretty-print every document stored in ``pacjent_data_single_collection``."""
    # find() hands back a pymongo cursor, so iterate it directly (or wrap it in list()).
    for document in db.pacjent_data_single_collection.find():
        printer.pprint(document)

In [None]:
#find_all_SEQN()

In [None]:
def find_SEQN(SEQN_nr):
    """Pretty-print the result of ``find_one`` for the document whose "SEQN" equals ``SEQN_nr``."""
    printer.pprint(db.pacjent_data_single_collection.find_one({"SEQN" : SEQN_nr}))

In [None]:
find_SEQN("77432")

{'SEQN': '77432',
 '_id': ObjectId('639b345777f138c88f8778d2'),
 'data': {'demographic': [{'DMDBORN4': '1',
                           'DMDCITZN': '1',
                           'DMDFMSIZ': '4',
                           'DMDHHSIZ': '4',
                           'DMDHHSZA': '1',
                           'DMDHHSZB': '0',
                           'DMDHHSZE': '1',
                           'DMDHRAGE': '62',
                           'DMDHRBR4': '2',
                           'DMDHREDU': '1',
                           'DMDHRGND': '2',
                           'DMDHRMAR': '4',
                           'RIAGENDR': '1',
                           'RIDAGEYR': '4',
                           'RIDEXAGM': '59',
                           'RIDEXMON': '2',
                           'RIDRETH1': '2',
                           'RIDRETH3': '2',
                           'RIDSTATR': '2',
                           'SDDSRVYR': '8',
                           'SDMVPSU': '2',
           

In [None]:
def find_DMDBORN4():
    """Pretty-print single-collection patients with a demographic row whose "DMDHRAGE" is "69".

    ``data.demographic`` is stored as an array of row documents, so the filter
    uses dot notation. The previous nested-dict filter asked for ``data`` to be
    *exactly* ``{"demographic": {"DMDHRAGE": "69"}}`` and could not match the
    stored documents.

    NOTE(review): despite the name, this queries DMDHRAGE, not DMDBORN4. The
    name is kept for callers, and the same name is redefined in later cells.
    """
    matches = db.pacjent_data_single_collection.find({"data.demographic.DMDHRAGE" : "69"})
    for document in matches:
        printer.pprint(document)

In [None]:
find_DMDBORN4()

# DEMOGRAPHIC

In [None]:
def find_DMDBORN4():
    """Pretty-print every ``demographic`` document whose "DMDHRAGE" is "69".

    NOTE(review): the function name is reused by several cells of this
    notebook; whichever definition ran last is the one in effect.
    """
    for document in db.demographic.find( { "DMDHRAGE" : "69" }):
        printer.pprint(document)

In [None]:
find_DMDBORN4()

{'AIALANGA': '1',
 'DMDBORN4': '1',
 'DMDCITZN': '1',
 'DMDEDUC2': '3',
 'DMDFMSIZ': '3',
 'DMDHHSIZ': '3',
 'DMDHHSZA': '0',
 'DMDHHSZB': '0',
 'DMDHHSZE': '2',
 'DMDHRAGE': '69',
 'DMDHRBR4': '1',
 'DMDHREDU': '3',
 'DMDHRGND': '1',
 'DMDHRMAR': '4',
 'DMDMARTL': '4',
 'DMQADFC': '1',
 'DMQMILIZ': '1',
 'FIAINTRP': '2',
 'FIALANG': '1',
 'FIAPROXY': '2',
 'INDFMIN2': '4',
 'INDFMPIR': '0.84',
 'INDHHIN2': '4',
 'MIAINTRP': '2',
 'MIALANG': '1',
 'MIAPROXY': '2',
 'RIAGENDR': '1',
 'RIDAGEYR': '69',
 'RIDEXMON': '1',
 'RIDRETH1': '4',
 'RIDRETH3': '4',
 'RIDSTATR': '2',
 'SDDSRVYR': '8',
 'SDMVPSU': '1',
 'SDMVSTRA': '112',
 'SEQN': '73557',
 'SIAINTRP': '2',
 'SIALANG': '1',
 'SIAPROXY': '2',
 'WTINT2YR': '13281.237386',
 'WTMEC2YR': '13481.042095',
 '_id': ObjectId('639514e5470610b8d0ec08b2')}
{'DMDBORN4': '2',
 'DMDCITZN': '1',
 'DMDEDUC2': '4',
 'DMDFMSIZ': '7',
 'DMDHHSIZ': '7',
 'DMDHHSZA': '1',
 'DMDHHSZB': '1',
 'DMDHHSZE': '2',
 'DMDHRAGE': '69',
 'DMDHRBR4': '2',
 'DMDHREDU'

In [None]:
def find_DMDBORN4():
    """Pretty-print how many ``demographic`` documents have "DMDHRAGE" equal to "69".

    NOTE(review): the function name is reused by several cells of this
    notebook; whichever definition ran last is the one in effect.
    """
    age_filter = { "DMDHRAGE" : "69" }
    printer.pprint(db.demographic.count_documents(filter = age_filter))

In [None]:
find_DMDBORN4()

308


In [None]:
def find_demographic_by_object_id(demographic_id):
    """Pretty-print the ``demographic`` document whose ``_id`` is ``ObjectId(demographic_id)``.

    ``demographic_id`` is passed to ``ObjectId`` unchanged (the notebook calls
    it with a 24-character hex string).
    """
    document = db.demographic.find_one( {"_id" : ObjectId(demographic_id)} )
    printer.pprint(document)

In [None]:
find_demographic_by_object_id("639514e5470610b8d0ec08b2")

{'AIALANGA': '1',
 'DMDBORN4': '1',
 'DMDCITZN': '1',
 'DMDEDUC2': '3',
 'DMDFMSIZ': '3',
 'DMDHHSIZ': '3',
 'DMDHHSZA': '0',
 'DMDHHSZB': '0',
 'DMDHHSZE': '2',
 'DMDHRAGE': '69',
 'DMDHRBR4': '1',
 'DMDHREDU': '3',
 'DMDHRGND': '1',
 'DMDHRMAR': '4',
 'DMDMARTL': '4',
 'DMQADFC': '1',
 'DMQMILIZ': '1',
 'FIAINTRP': '2',
 'FIALANG': '1',
 'FIAPROXY': '2',
 'INDFMIN2': '4',
 'INDFMPIR': '0.84',
 'INDHHIN2': '4',
 'MIAINTRP': '2',
 'MIALANG': '1',
 'MIAPROXY': '2',
 'RIAGENDR': '1',
 'RIDAGEYR': '69',
 'RIDEXMON': '1',
 'RIDRETH1': '4',
 'RIDRETH3': '4',
 'RIDSTATR': '2',
 'SDDSRVYR': '8',
 'SDMVPSU': '1',
 'SDMVSTRA': '112',
 'SEQN': '73557',
 'SIAINTRP': '2',
 'SIALANG': '1',
 'SIAPROXY': '2',
 'WTINT2YR': '13281.237386',
 'WTMEC2YR': '13481.042095',
 '_id': ObjectId('639514e5470610b8d0ec08b2')}


In [None]:
def DMDHRAGE_range(min_v, max_v):
    """Pretty-print ``demographic`` documents with min_v <= DMDHRAGE <= max_v, sorted by DMDHRAGE.

    The CSV loader stores every value as a string, so a plain ``$gte``/``$lte``
    filter and sort compare text ("9" > "70"). A collation with
    ``numericOrdering`` makes MongoDB compare digit strings by numeric value
    for both the filter and the sort.

    Args:
        min_v: Inclusive lower bound, as a string of digits or an int.
        max_v: Inclusive upper bound, as a string of digits or an int.
    """
    # Bounds are normalised to text because the stored values are text.
    query = {"DMDHRAGE" : {"$gte" : str(min_v), "$lte" : str(max_v)}}
    numeric_strings = {"locale" : "en_US", "numericOrdering" : True}

    cursor = db.demographic.find(query).sort("DMDHRAGE").collation(numeric_strings)
    for document in cursor:
        printer.pprint(document)

In [None]:
DMDHRAGE_range("69", "70")

{'AIALANGA': '1',
 'DMDBORN4': '1',
 'DMDCITZN': '1',
 'DMDEDUC2': '3',
 'DMDFMSIZ': '3',
 'DMDHHSIZ': '3',
 'DMDHHSZA': '0',
 'DMDHHSZB': '0',
 'DMDHHSZE': '2',
 'DMDHRAGE': '69',
 'DMDHRBR4': '1',
 'DMDHREDU': '3',
 'DMDHRGND': '1',
 'DMDHRMAR': '4',
 'DMDMARTL': '4',
 'DMQADFC': '1',
 'DMQMILIZ': '1',
 'FIAINTRP': '2',
 'FIALANG': '1',
 'FIAPROXY': '2',
 'INDFMIN2': '4',
 'INDFMPIR': '0.84',
 'INDHHIN2': '4',
 'MIAINTRP': '2',
 'MIALANG': '1',
 'MIAPROXY': '2',
 'RIAGENDR': '1',
 'RIDAGEYR': '69',
 'RIDEXMON': '1',
 'RIDRETH1': '4',
 'RIDRETH3': '4',
 'RIDSTATR': '2',
 'SDDSRVYR': '8',
 'SDMVPSU': '1',
 'SDMVSTRA': '112',
 'SEQN': '73557',
 'SIAINTRP': '2',
 'SIALANG': '1',
 'SIAPROXY': '2',
 'WTINT2YR': '13281.237386',
 'WTMEC2YR': '13481.042095',
 '_id': ObjectId('639514e5470610b8d0ec08b2')}
{'DMDBORN4': '2',
 'DMDCITZN': '1',
 'DMDEDUC2': '4',
 'DMDFMSIZ': '7',
 'DMDHHSIZ': '7',
 'DMDHHSZA': '1',
 'DMDHHSZB': '1',
 'DMDHHSZE': '2',
 'DMDHRAGE': '69',
 'DMDHRBR4': '2',
 'DMDHREDU'

In [None]:
def demographic_col(*columns):
    """Pretty-print every ``demographic`` document, projected onto ``columns``.

    ``_id`` is excluded from the projection unless it is listed explicitly.
    """
    projection = {"_id" : 0}
    projection.update(dict.fromkeys(columns, 1))
    for document in db.demographic.find({}, projection):
        printer.pprint(document)

In [None]:
demographic_col('SEQN', 'AIALANGA')

{'AIALANGA': '1', 'SEQN': '73557'}
{'AIALANGA': '1', 'SEQN': '73558'}
{'SEQN': '73559'}
{'AIALANGA': '1', 'SEQN': '73560'}
{'SEQN': '73561'}
{'AIALANGA': '1', 'SEQN': '73562'}
{'SEQN': '73563'}
{'AIALANGA': '1', 'SEQN': '73564'}
{'SEQN': '73565'}
{'AIALANGA': '1', 'SEQN': '73566'}
{'AIALANGA': '1', 'SEQN': '73567'}
{'AIALANGA': '1', 'SEQN': '73568'}
{'SEQN': '73569'}
{'AIALANGA': '1', 'SEQN': '73570'}
{'SEQN': '73571'}
{'AIALANGA': '1', 'SEQN': '73572'}
{'AIALANGA': '1', 'SEQN': '73573'}
{'AIALANGA': '1', 'SEQN': '73574'}
{'SEQN': '73575'}
{'AIALANGA': '1', 'SEQN': '73576'}
{'AIALANGA': '2', 'SEQN': '73577'}
{'AIALANGA': '1', 'SEQN': '73578'}
{'AIALANGA': '1', 'SEQN': '73579'}
{'AIALANGA': '1', 'SEQN': '73580'}
{'AIALANGA': '1', 'SEQN': '73581'}
{'SEQN': '73582'}
{'SEQN': '73583'}
{'AIALANGA': '1', 'SEQN': '73584'}
{'AIALANGA': '1', 'SEQN': '73585'}
{'SEQN': '73586'}
{'AIALANGA': '1', 'SEQN': '73587'}
{'AIALANGA': '1', 'SEQN': '73588'}
{'AIALANGA': '1', 'SEQN': '73589'}
{'SEQN': '73590

# propozycje na potem

funkcja, że jak insertuje dane to dodaje się data insertu

Jak byłyby odczytywane dane, to w przypadku tych, które już były (zestaw z Kaggle 2013–2014), wyświetla się właśnie ta data. ////// Może nawet coś w stylu, że jak nie ma takiego klucza jak data insertu, to wyświetla się 2013–14.