In [152]:
import numpy as np
import pandas as pd
import psycopg2 as pg
#import sql
from sqlalchemy import create_engine
import sqlalchemy
import sys
import os

#import matplotlib.pyplot as plt
#%matplotlib inline

In [45]:
data_dir = '/mnt/data/mvesc/PartnerData/' # top level data directory
absence_folder = data_dir +  "AbsenceDaysDetail/" # 2nd level data directory
absence_file_names = os.listdir(absence_folder)

In [69]:
absence_file_names.sort()

In [70]:
absence_file_names

['CCFRRWRVabsence09_16.txt',
 'MATVWMAbsences1415.txt',
 'MATVWMAbsences1516.txt']

### Connect to database

In [194]:
# Connect to database; doesn't required for the next session of dumping csv to sql
pass_file = "/mnt/data/mvesc/pgpass" # username, db information
with open(pass_file, 'r') as f:
    passinfo = f.read()
passinfo = passinfo.strip().split(':')

host_address = passinfo[0]
port = passinfo[1]
user_name = passinfo[2]
name_of_database = passinfo[3]
user_password = passinfo[4]
sqlcmd_table_names = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'"
connection = pg.connect(host=host_address, database=name_of_database, user=user_name, password=user_password)
table_names = pd.read_sql(sqlcmd_table_names, connection)
#sql alchemy format dialect+driver://username:password@host:port/database
# 'postgresql://scott:tiger@localhost:5432/mydatabase'
sqlalchemy_eng = "postgresql://"+user_name+":"+user_password+"@"+host_address+'/'+name_of_database
engine = create_engine(sqlalchemy_eng)
print(table_names)

                  table_name
0          DistrictSchoolIDs
1                 all_lookup
2         MATVWMAbsences1415
3       CCFRRWRVabsence09_16
4         MATVWMAbsences1516
5            CurrentMobility
6            CurrentStudents
7              ASQ_Preschool
8                  ActScores
9           AllDistricts1112
10                  HSGrades
11          AllDistricts1213
12                 AIRScores
13          AllDistricts1314
14  CurrentAbsenceDiscipline
15                  DIBELSv2
16          AllDistricts1415
17             AllGradsTotal
18                    OAAOGT
19                     PARCC
20                  StarRead
21                    StarEL
22                  StarMath
23                 TerraNova


## Dumping CSV to postgresql
No column names changed

In [216]:
def read_csv_noheader(filepath):
    """ read a csv file with no header
    
    :param str filepath: file path name
    :return pandas.DataFrame with header 'col1', 'col2', ...
    :rtype pandas.DataFrame
    """
    df = pd.read_csv(filepath, header=None, low_memory=False) # read csv data with no header
    colnames = {i:'col'+str(i) for i in df.columns} # column names of col0, col1, col2, ... 
    df = df.rename(columns=colnames)
    return df

def csv2postgres_file(filepath, header=False, nrows=-1, if_exists='fail'):
    """ dump csv file to postgres database
    
    :param str filepath: file path name
    :param bool header: True means there is header;
    :return str table_name: table name of the sql table
    :rtype str
    """
    # read the data frame 
    if header:
        df = pd.read_csv(filepath, low_memory=False)
    else:
        df = read_csv_noheader(filepath) # header: col0, col1, col2
    
    # create a postgresql engine to wirte to postgres
    from sqlalchemy import create_engine
    pass_file = "/mnt/data/mvesc/pgpass" # username, db information
    with open(pass_file, 'r') as f:
        passinfo = f.read()
    passinfo = passinfo.strip().split(':')
    host_address = passinfo[0]
    port = passinfo[1]
    user_name = passinfo[2]
    name_of_database = passinfo[3]
    user_password = passinfo[4]
    sqlalchemy_eng = "postgresql://"+user_name+":"+user_password+"@"+host_address+'/'+name_of_database
    engine = create_engine(sqlalchemy_eng)
    
    #write the data frame to postgres
    table_name = filepath.split('/')[-1].split('.')[0] # table name is filename without .txt or other extension
    if nrows==-1:
        df.to_sql(table_name, engine, schema='public', index=False, if_exists=if_exists)
    else:
        df.iloc[:nrows, :].to_sql(table_name, engine, schema='public', index=False, if_exists=if_exists)
    return table_name


def csv2postgres_dir(directory, header=False, nrows=-1, if_exists='fail'):
    """ dump a directory of csv files to postgres database
    
    :param str filepath: file path name
    :param bool header: True means there is header;
    :return str table_name: table name of the sql table
    :rtype str
    """
    data_dir = directory
    data_file_names = os.listdir(data_dir)
    # full path name of filenames
    fnames = [data_dir + fn for fn in data_file_names]
    table_names = []
    for filepath in fnames:
        print("working on ", filepath)
        tab_name = csv2postgres_file(filepath, header=header, nrows=nrows, if_exists=if_exists)
        table_names.append(tab_name)
    return table_names
    

data_dir = '/mnt/data/mvesc/PartnerData/AbsenceDaysDetail/'
abs_table_names = csv2postgres_dir(data_dir, header=False, nrows=-1, if_exists='replace')  
print(abs_table_names)

data_dir = '/mnt/data/mvesc/PartnerData/DistrictGrades2006_16/'
grade_talbe_names = csv2postgres_dir(data_dir, header=True, nrows=-1, if_exists='replace')
print(grade_talbe_names)

# data_dir = '/mnt/data/mvesc/PartnerData/AbsenceDaysDetail/' # top level data directory
# data_file_names = os.listdir(data_dir)
# # full path name of filenames
# fnames = [data_dir + fn for fn in data_file_names]
# table_names = []
# for filepath in fnames:
#     print("working on ", filepath)
#     tab_name = csv2postgres_file(filepath, header=False, nrows=-1, if_exists='replace')
#     table_names.append(tab_name)
    
# table_names

working on  /mnt/data/mvesc/PartnerData/AbsenceDaysDetail/MATVWMAbsences1415.txt
working on  /mnt/data/mvesc/PartnerData/AbsenceDaysDetail/CCFRRWRVabsence09_16.txt
working on  /mnt/data/mvesc/PartnerData/AbsenceDaysDetail/MATVWMAbsences1516.txt
['MATVWMAbsences1415', 'CCFRRWRVabsence09_16', 'MATVWMAbsences1516']
working on  /mnt/data/mvesc/PartnerData/DistrictGrades2006_16/Maysvillegrades2006_16.txt
working on  /mnt/data/mvesc/PartnerData/DistrictGrades2006_16/CoshoctonGrades2006_16.txt
working on  /mnt/data/mvesc/PartnerData/DistrictGrades2006_16/Ridgewoodgrades2007_2016.txt
working on  /mnt/data/mvesc/PartnerData/DistrictGrades2006_16/WestMuskingumgrades2006_16.txt
working on  /mnt/data/mvesc/PartnerData/DistrictGrades2006_16/Franklingrades2006_16.txt
['Maysvillegrades2006_16', 'CoshoctonGrades2006_16', 'Ridgewoodgrades2007_2016', 'WestMuskingumgrades2006_16', 'Franklingrades2006_16']


In [126]:
fnames = [absence_folder + fn for fn in absence_file_names]
df_CCFRRWRVabsence09_16 = read_csv_noheader(fnames[0])
df_MATVWMAbsences1415 = read_csv_noheader(fnames[1])
df_MATVWMAbsences1516 = read_csv_noheader(fnames[2])

In [115]:
head_names_CCFRRWRVabsence09_16 = ['StudentLookup', 'Timestamp', 'Absence_length', 'Absence_code', 'Abscence_desc', 'col5','District']

In [127]:
print(df_MATVWMAbsences1415.shape)
df_MATVWMAbsences1415.head()

(91407, 7)


Unnamed: 0,col0,col1,col2,col3,col4,col5,col6
0,5672,2014-09-02 00:00:00,0.0,R,UNEXCUSED TARDY,MAES,Maysville
1,5672,2014-11-04 00:00:00,0.5,A,EXCUSED ABSENCE,MAES,Maysville
2,5672,2014-11-05 00:00:00,1.0,A,EXCUSED ABSENCE,MAES,Maysville
3,5672,2014-12-17 00:00:00,0.5,A,EXCUSED ABSENCE,MAES,Maysville
4,5672,2015-02-23 00:00:00,1.0,A,EXCUSED ABSENCE,MAES,Maysville


In [193]:
def read_csv_noheader(filepath):
    """ read a csv file with no header
    
    :param str filepath: file path name
    :return pandas.DataFrame with header 'col1', 'col2', ...
    :rtype pandas.DataFrame
    """
    df = pd.read_csv(filepath, header=None) # read csv data with no header
    colnames = {i:'col'+str(i) for i in df.columns} # column names of col0, col1, col2, ... 
    df = df.rename(columns=colnames)
    return df

def csv2postgres_file(filepath, header=False, nrows=-1, if_exists='fail'):
    """ dump csv file to postgres database
    :param str filepath: file path name
    :param bool header: True means there is header;
    :return None
    :rtype None
    """
    # read the data frame 
    if header:
        df = pd.read_csv(filepath)
    else:
        df = read_csv_noheader(filepath) # header: col0, col1, col2
    
    # create a postgresql engine to wirte to postgres
    from sqlalchemy import create_engine
    pass_file = "/mnt/data/mvesc/pgpass" # username, db information
    with open(pass_file, 'r') as f:
        passinfo = f.read()
    passinfo = passinfo.strip().split(':')
    host_address = passinfo[0]
    port = passinfo[1]
    user_name = passinfo[2]
    name_of_database = passinfo[3]
    user_password = passinfo[4]
    sqlalchemy_eng = "postgresql://"+user_name+":"+user_password+"@"+host_address+'/'+name_of_database
    engine = create_engine(sqlalchemy_eng)
    
    #write the data frame to postgres
    table_name = filepath.split('/')[-1].split('.')[0] # table name is filename without .txt or other extension
    if nrows==-1:
        df.to_sql(table_name, engine, schema='public', index=False, if_exists=if_exists)
    else:
        df.iloc[:nrows, :].to_sql(table_name, engine, schema='public', index=False, if_exists=if_exists)
    return None

data_dir = '/mnt/data/mvesc/PartnerData/AbsenceDaysDetail/' # top level data directory
data_file_names = os.listdir(data_dir)
fnames = [data_dir + fn for fn in data_file_names]
for filepath in fnames:
    print("working on ", filepath)
    csv2postgres_file(filepath, header=False, nrows=20, if_exists='replace')

working on  /mnt/data/mvesc/PartnerData/AbsenceDaysDetail/MATVWMAbsences1415.txt
working on  /mnt/data/mvesc/PartnerData/AbsenceDaysDetail/CCFRRWRVabsence09_16.txt
working on  /mnt/data/mvesc/PartnerData/AbsenceDaysDetail/MATVWMAbsences1516.txt


In [190]:
df_CCFRRWRVabsence09_16.dtypes

col0      int64
col1     object
col2    float64
col3     object
col4     object
col5     object
dtype: object

In [191]:
df_MATVWMAbsences1415.dtypes

col0      int64
col1     object
col2    float64
col3     object
col4     object
col5     object
col6     object
dtype: object

In [192]:
df_MATVWMAbsences1516.dtypes

col0      int64
col1     object
col2    float64
col3     object
col4     object
col5     object
col6     object
dtype: object

In [179]:
fnames

['/mnt/data/mvesc/PartnerData/AbsenceDaysDetail/CCFRRWRVabsence09_16.txt',
 '/mnt/data/mvesc/PartnerData/AbsenceDaysDetail/MATVWMAbsences1415.txt',
 '/mnt/data/mvesc/PartnerData/AbsenceDaysDetail/MATVWMAbsences1516.txt']

In [201]:
df_MATVWMAbsences1415.iloc[:, 6].unique()

array(['Maysville', 'TriValley', 'West Muskingum'], dtype=object)

In [202]:
df_MATVWMAbsences1516.iloc[:, 6].unique()

array(['Maysville', 'TriValley', 'West Muskingum'], dtype=object)

In [204]:
df_CCFRRWRVabsence09_16.iloc[:, 5].unique()

array(['RWAR', 'within the two hour window', 'RCON', 'RVHS', 'RVJH',
       'RUNI', 'RKEE', 'RIJH', 'RIHS', 'RIWE', 'FPJH', 'FRPR', 'FDFP',
       'FPHS', 'FRMS', 'COMI', 'COCS', 'COHI', 'COSL', 'COLS'], dtype=object)

In [207]:
df_MATVWMAbsences1516.iloc[:, 5].unique()

array(['MAES', 'MAHS', 'MAMS', 'TVEA', 'TVEF', 'TVEJ', 'TVEN', 'TVHS',
       'TVMS', 'WMED', 'WMEH', 'WMHS', 'WMMS'], dtype=object)

In [205]:
df_CCFRRWRVabsence09_16.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5
0,2540,2016-01-20 00:00:00,1.0,A,Excused Absence,RWAR
1,2540,2016-02-02 00:00:00,1.0,A,Excused Absence,RWAR
2,2540,2015-12-03 00:00:00,1.0,A,Excused Absence,RWAR
3,2540,2016-02-18 00:00:00,1.0,A,Excused Absence,RWAR
4,2540,2016-02-23 00:00:00,1.0,A,Excused Absence,RWAR
