In [77]:
import os
import math
from datetime import datetime, timedelta

from helpers import general_helpers
import pytest 
import pytz

project_home = os.environ['PACKAGE_HOME']
import configparser
config = configparser.ConfigParser()
config.read(project_home + '/Development/config.ini')
start_date = config['DATES']['START_DATE']
end_date = config['DATES']['END_DATE']


# set folders
rawxml_folder = '{}/raw_data/'.format(config['FOLDERS']['WORKING_FOLDER'])
cleanxml_folder = '{}/clean_data/'.format(config['FOLDERS']['WORKING_FOLDER'])
parsedxml_folder = '{}/parsed_data/'.format(config['FOLDERS']['WORKING_FOLDER'])

In [70]:
qa_db = 'patent_QA'
new_db = config['DATABASE']['NEW_DB']
db_con = general_helpers.connect_to_db(config['DATABASE']['HOST'], config['DATABASE']['USERNAME'], config['DATABASE']['PASSWORD'], qa_db)

In [136]:
# REQUIRES: start and end timestamps (database format "YYYYMMDD")
# MODIFIES: nothing 
# EFFECTS: returns number of weeks data update spans
def get_num_weeks(start_timestamp, end_timestamp):
    
    time_period = end_timestamp - start_timestamp
    num_weeks = math.ceil(time_period.days // 7)
    
    return num_weeks

# REQUIRES: data update start & end dates (from config file, database format "YYYYMMDD")
# MODIFIES: nothing 
# EFFECTS: returns number of weeks data update spans, all tuesdays of data
def get_tuesdays(start_date, end_date):
    
    # convert dates to datetime objects
    start_timestamp = datetime.strptime(start_date,'%Y%m%d')
    end_timestamp = datetime.strptime(end_date,'%Y%m%d')

    # see how many weeks data spans ( get # of files that should be downloaded)
    num_weeks = get_num_weeks(start_timestamp, end_timestamp)
    
    # now get dates of all tuesdays part of data update
    all_tuesdays = []

    # need to -1 from start timestamp since it will always be a wednesday
    ts = start_timestamp - timedelta(days=1)

    for i in range(0, num_weeks):
        # get date of next tuesday
        ts = ts + timedelta(days=7)
        ts_str = datetime.strftime(ts, '%Y%m%d')
        all_tuesdays.append(ts_str)
        
    return num_weeks, all_tuesdays


# REQUIRES: db connection, qa_db name, new_db name, data folder path, num_weeks, filetype('raw, clean, or parsed xml')
# MODIFIES: nothing
# EFFECTS: asserts if # of folders matches expected # for data update
def test_num_xmlfiles(db_con, qa_db, new_db, db_table, folder_path, num_weeks, file_type):
    
    test_type = 'num_files'
    
    num_xml_files = len(os.listdir(folder_path))

    # static - table columns are fixed
    stmt_pt1 = "insert into {0}.{1} (`db_timestamp`,`test_type`, `file_type`, `expected_value`, `actual_value`)".format(qa_db,db_table)

    # dynamic - row to insert changes
    stmt_pt2 = "values ('{0}', '{1}', '{2}', {3}, {4});".format(new_db, test_type, file_type, num_weeks, num_xml_files)

    # insert row into qa table
    insert_stmt = stmt_pt1 + stmt_pt2
    db_con.execute(insert_stmt)
      
    assert(num_xml_files == num_weeks)
    return

# REQUIRES: db connection, qa_db name, new_db name, data folder path, filetype('raw, clean, or parsed xml')
# MODIFIES: nothing
# EFFECTS: returns 
def test_size_xml(db_con, qa_db, new_db, db_table, folder_path, file_type):
    
    test_type = 'size_files'
    xml_files = os.listdir(folder_path)

    # check each file's size
    for fle in xml_files:
        
        
        fle_size = os.stat(folder_path + fle).st_size
        
        fle_size = round((fle_size/math.pow(10,3)),2)
        
        # static - table columns are fixed
        stmt_pt1 = "insert into {0}.{1} (`db_timestamp`,`test_type`, `file_type`, `file_name`, `actual_value`)".format(qa_db,db_table)

        # dynamic - row to insert changes
        stmt_pt2 = "values ('{0}', '{1}', '{2}', '{3}', {4});".format(new_db, test_type, file_type, fle, fle_size)

        # insert row into qa table
        insert_stmt = stmt_pt1 + stmt_pt2
        db_con.execute(insert_stmt)
        
        
        assert(fle_size > 0)
        
    
    return
    
# REQUIRES: db connection, qa_db name, new_db name, data folder path, tuesdays
# MODIFIES: nothing
# EFFECTS: returns 
def test_num_parsedxml(db_con, qa_db, new_db, db_table, folder_path, tuesdays, tables):
    test_type = 'num_files'
    file_type = 'parsed_xml'
    
    for tues in tuesdays:
        num_parsed_tues_files = len(os.listdir(folder_path + tues + '/'))

        # static - table columns are fixed
        stmt_pt1 = "insert into {0}.{1} (`db_timestamp`,`test_type`, `file_type`, `file_name`,  `expected_value`, `actual_value`)".format(qa_db,db_table)

        # dynamic - row to insert changes
        stmt_pt2 = "values ('{0}', '{1}', '{2}', '{3}', {4}, {5});".format(new_db, test_type, file_type, tues, len(tables), num_parsed_tues_files)

        # insert row into qa table
        insert_stmt = stmt_pt1 + stmt_pt2

        db_con.execute(insert_stmt)

        assert num_parsed_tues_files == len(tables)
        
    return


# REQUIRES: db connection, qa_db name, new_db name, data folder path, tuesdays
# MODIFIES: nothing
# EFFECTS: returns 
def test_size_parsedxml(db_con, qa_db, new_db, db_table, folder_path, tuesdays, tables):
    test_type = 'size_files'
    file_type = 'parsed_xml'
    # TODO: assess error_counts.csv and error_data.csv
    tables = ['application.csv', 'botanic.csv','brf_sum_text.csv','claim.csv','detail_desc_length.csv','detail_desc_text.csv',
 'draw_desc_text.csv','figures.csv','foreign_priority.csv','foreigncitation.csv','government_interest.csv','ipcr.csv','mainclass.csv','non_inventor_applicant.csv','otherreference.csv','patent.csv','pct_data.csv','rawassignee.csv','rawexaminer.csv','rawinventor.csv','rawlawyer.csv','rawlocation.csv','rel_app_text.csv','subclass.csv','us_term_of_grant.csv','usapplicationcitation.csv','uspatentcitation.csv', 'uspc.csv','usreldoc.csv']

    
    for tues in tuesdays:
        fp = folder_path + tues + '/'
        parsed_tues_files = os.listdir(fp)
        
        for tbl in parsed_tues_files:
            if tbl in tables:
                full_tbl_name = tues + '/' + tbl

                tbl_size = os.stat(fp + tbl).st_size
                print(tbl)
                print(tbl_size)
                tbl_size = round((tbl_size/math.pow(10,9)),2)

                # static - table columns are fixed
                stmt_pt1 = "insert into {0}.{1} (`db_timestamp`,`test_type`, `file_type`, `file_name`, `actual_value`)".format(qa_db,db_table)

                # dynamic - row to insert changes
                stmt_pt2 = "values ('{0}', '{1}', '{2}', '{3}', {4});".format(new_db, test_type, file_type, full_tbl_name, tbl_size)

                # insert row into qa table
                insert_stmt = stmt_pt1 + stmt_pt2
                db_con.execute(insert_stmt)

        
                assert(tbl_size > 0)
        
    return

In [137]:
#1. Get span of data update
num_weeks, all_tuesdays = get_tuesdays(start_date, end_date)

print("Number of weeks of data update: ", num_weeks)
print("Tuesdays are: ", all_tuesdays)

all_tuesdays = ['20180717', '20181023', '20181030', '20181106', '20181113', '20181120', '20181127']

# #2. Check raw XML files were downloaded
# test_num_xmlfiles(db_con, qa_db, new_db, '01_xml', rawxml_folder, num_weeks + 1, 'raw_xml')

# #3. Check that raw XML files have size > 0MB
# test_size_xml(db_con, qa_db, new_db, '01_xml', rawxml_folder, 'raw_xml')

# #4. Check that clean XML files were created
# test_num_xmlfiles(db_con, qa_db, new_db, '01_xml', cleanxml__folder, num_weeks + 1, 'clean_xml')

# #5. Check that clean XML files have size > 0MB
# test_size_xml(db_con, qa_db, new_db, '01_xml', rawxml_folder, 'clean_xml')

#6. Check that a parsed XML folder was created for each Tuesday
#test_num_xmlfiles(db_con, qa_db, new_db, '01_xml', parsedxml__folder, num_weeks + 1, 'parsed_xml')
# convert format from '20180717' to '180717'
all_tuesdays = ['20190723'] # for testing
abbrev_tuesdays = [x[2:] for x in all_tuesdays]

#7. Check that each parsed .csv is in each parsed folder 
test_num_parsedxml(db_con, qa_db, new_db, '01_xml', parsedxml_folder, abbrev_tues, tables)

#8. Check that each parsed .csv in each parsed folder has a size > 0 MB
test_size_parsedxml(db_con, qa_db, new_db, '01_xml', parsedxml_folder, abbrev_tuesdays, tables)

Number of weeks of data update:  6
Tuesdays are:  ['20190827', '20190903', '20190910', '20190917', '20190924', '20191001']


FileNotFoundError: [Errno 2] No such file or directory: '/dbupdate_data//parsed_data/180717/'

In [None]:


db_con.execute("insert into {0}.test_num_xmlfiles (`db_timestamp`, `expected_value`, `actual_value`) values ('{1}', 5, 4);".format(qa_db, new_db))

In [102]:
test = all_tuesdays[1] 

In [None]:

downloaded_xml_files = os.listdir(rawdata_folder)


In [105]:

abbrev_tues

['180717', '181023', '181030', '181106', '181113', '181120', '181127']

In [133]:
z = 447786
z/math.pow(10,3)

447.786

In [135]:
                                                                                                                                   
round(z/math.pow(10,3),3)

447.786