In [1]:
import pandas as pd
import numpy as np
import os
import psycopg2

### Find CSV files in directory

In [2]:
# Folder that csv_files() scans for CSV exports; this is a relative path, so it is
# resolved against the current working directory of the kernel.
directory = '../airbnb_data_vienna/csv/'

def csv_files(data_path=None, prefix='reviews'):
    """List the CSV files in a directory whose name starts with ``prefix``.

    Parameters
    ----------
    data_path : str or path-like, optional
        Directory to scan. Defaults to the module-level ``directory``.
    prefix : str, optional
        Required start of the file name. Defaults to ``'reviews'``; pass e.g.
        ``'calendar'`` to pick up a different family of exports.

    Returns
    -------
    list of str
        Matching file names (without the directory part), sorted so that the
        processing order is the same on every run and platform.
    """
    if data_path is None:
        data_path = directory

    # os.listdir returns entries in arbitrary order, hence the explicit sort.
    return sorted(
        name
        for name in os.listdir(data_path)
        if name.startswith(prefix) and name.endswith('.csv')
    )

In [3]:
def dataset_directory(csv_files, data_path):
    """Read each listed CSV file into a DataFrame.

    Parameters
    ----------
    csv_files : iterable of str
        File names to load, relative to ``data_path``.
    data_path : str or path-like
        Directory containing the files; a trailing separator is optional.

    Returns
    -------
    dict
        Maps each file name to the DataFrame read from it.

    Notes
    -----
    Files are read with pandas' default encoding first; if that raises
    ``UnicodeDecodeError`` the file is re-read as ISO-8859-1. Each file name
    is printed once it has been loaded.
    """
    frames = {}
    for file in csv_files:
        # Join instead of concatenating so a missing trailing slash on
        # data_path does not produce a wrong path.
        csv_path = os.path.join(data_path, file)
        try:
            frames[file] = pd.read_csv(csv_path)
        except UnicodeDecodeError:
            frames[file] = pd.read_csv(csv_path, encoding='ISO-8859-1')
        print(file)
    return frames

reviews_07_06.csv
reviews_10_03.csv
reviews_11_09.csv


In [7]:
def clean_tbl_name(filename):
    """Turn a file name into a lower-case table name.

    Spaces, hyphens, slashes and backslashes become underscores; the
    characters ``? % ( ) $`` are removed. Everything from the first dot
    onward (normally the ``.csv`` extension) is cut off.
    """
    # Characters mapped to "_" versus characters dropped outright.
    substitutions = {ch: "_" for ch in " -/\\"}
    substitutions.update({ch: None for ch in "?%)($"})

    normalised = filename.lower().translate(str.maketrans(substitutions))

    # Keep only the part before the first dot.
    stem, _, _ = normalised.partition(".")
    return stem


In [8]:
# Clean header names: lower-case them, replace spaces and separators with underscores, and drop other special characters
def clean_colname(dataframe):
    """Normalise the frame's column names and build the CREATE TABLE column list.

    Column labels are converted to ``str``, lower-cased, have spaces, hyphens,
    slashes and backslashes replaced by underscores, and have ``? % ( ) $``
    removed. The cleaned names are assigned back to ``dataframe.columns``, so
    the frame passed in is modified in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are to be cleaned and described.

    Returns
    -------
    col_str : str
        Comma-separated ``"<name> <sql type>"`` pairs, one per column.
    columns : pandas.Index
        The cleaned column labels (the frame's new ``columns``).
    """
    char_map = str.maketrans({
        " ": "_", "-": "_", "/": "_", "\\": "_",
        "?": None, "%": None, ")": None, "(": None, "$": None,
    })
    # str() guards against non-string labels such as integer column names.
    dataframe.columns = [str(label).lower().translate(char_map) for label in dataframe.columns]

    # Replacement dictionary that maps pandas dtype names to SQL types.
    replacements = {
        'object': 'varchar',
        'bool': 'boolean',
        'float32': 'real',
        'float64': 'float',
        'int32': 'int',
        # A 64-bit integer does not fit Postgres' 4-byte "int".
        'int64': 'bigint',
        'datetime64[ns]': 'timestamp',
    }

    # Columns for the Postgres table schema. Any dtype without an explicit
    # mapping falls back to varchar, which accepts every CSV text value,
    # instead of leaking a pandas dtype name into the DDL.
    col_str = ", ".join(
        "{} {}".format(name, replacements.get(str(dtype), 'varchar'))
        for name, dtype in zip(dataframe.columns, dataframe.dtypes)
    )

    return col_str, dataframe.columns

In [15]:
def upload_to_db(host, dbname, user, password, tbl_name, col_str, file, dataframe, dataframe_columns):
    """Recreate table ``tbl_name`` in Postgres and bulk-load ``dataframe`` into it.

    Steps, all inside one transaction that is committed at the end:

    1. drop any existing table called ``tbl_name`` and create it from ``col_str``;
    2. write ``dataframe`` to ``file`` as UTF-8 CSV (header taken from
       ``dataframe_columns``, no index column);
    3. stream that file to the server with ``COPY ... FROM STDIN``;
    4. grant SELECT on the table to ``public``.

    Parameters
    ----------
    host, dbname, user, password : str
        Connection settings handed to ``psycopg2.connect``.
    tbl_name : str
        Name of the table to (re)create.
    col_str : str
        Column definitions placed between the parentheses of CREATE TABLE.
    file : str
        Path of the CSV file that is written and then uploaded.
    dataframe : pandas.DataFrame
        Data to upload.
    dataframe_columns : sequence of str
        Header row written to the CSV file.

    Raises
    ------
    Exception
        Any error from psycopg2 or file I/O propagates. In that case nothing
        is committed, and the cursor, file and connection are still closed.

    Notes
    -----
    ``tbl_name`` and ``col_str`` are interpolated directly into the SQL text,
    so they must come from a trusted source.
    """
    sql_query = """
    COPY %s FROM STDIN WITH
        CSV
        HEADER
        DELIMITER AS ','
    """

    # Keyword arguments avoid hand-building a DSN string, which breaks when a
    # value (typically the password) contains spaces or quotes.
    conn = psycopg2.connect(host=host, dbname=dbname, user=user, password=password)
    try:
        cursor = conn.cursor()
        try:
            print('Opened database successfully')

            # Drop tables with the same name.
            cursor.execute("drop table if exists %s;" % (tbl_name))

            # Create table.
            cursor.execute("create table %s (%s)" % (tbl_name, col_str))
            print('{0} was created successfully'.format(tbl_name))

            # Save to csv.
            dataframe.to_csv(file, header=dataframe_columns, index=False, encoding='utf-8')

            # Re-open the CSV just written and stream it to the server; the
            # with-block guarantees the handle is closed afterwards.
            with open(file, encoding='utf-8') as reviews_file:
                print('File opened in memory')
                cursor.copy_expert(sql=sql_query % tbl_name, file=reviews_file)
            print('File copied to db')

            cursor.execute("grant select on table %s to public" % tbl_name)
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Closing without a prior commit discards the open transaction, so a
        # failure above leaves the database unchanged.
        conn.close()
    print('Table {0} immported to db completed'.format(tbl_name))

reviews_07_06
Opened database successfully
reviews_07_06 was created successfully
File opened in memory
File copied to db
Table reviews_07_06 immported to db completed
reviews_10_03
Opened database successfully
reviews_10_03 was created successfully
File opened in memory
File copied to db
Table reviews_10_03 immported to db completed
reviews_11_09
Opened database successfully
reviews_11_09 was created successfully
File opened in memory
File copied to db
Table reviews_11_09 immported to db completed
All table have been successfully imported into the db
