In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os, glob, re, datetime

In [2]:
# Matches unit-test result lines such as "Testing [SomeTest] Ok".
# Group 1 is the test name, group 2 is the outcome ("Ok" or "Failed").
# A raw string is used so the regex escapes (\[, \w, \s) are not treated as
# invalid Python string escapes.
unit_test_regex = re.compile(r'Testing \[(\w*)\]\s*(Ok|Failed)')

def parse_unit_log(filename):
    """Parse a unit-test log into a mapping of test name -> passed.

    Parameters
    ----------
    filename : str
        Path of the log file to read.

    Returns
    -------
    dict
        ``{test_name: bool}`` where the value is True when the line reported
        ``Ok`` and False when it reported ``Failed``. Lines that do not match
        ``unit_test_regex`` are ignored. If a test name occurs more than once,
        the last occurrence wins.
    """
    results = {}
    with open(filename) as fh:
        for line in fh:
            match = unit_test_regex.search(line)
            if match:
                results[match.group(1)] = (match.group(2) == 'Ok')
    return results

In [3]:
# "Testing [<app>]" header lines; group 1 is the application name.
app_test_regex = re.compile(r'Testing \[(\w*)\]')
# Per-case result lines; group 1 is the outcome, group 2 the case name.
# NOTE(review): the three dots are unescaped, so they match any three
# characters, not only a literal "..." -- kept as-is to preserve matching.
app_test_case_regex = re.compile(r'(OK|Failed) ... Case \[(\w*)\]')

def parse_app_log(filename):
    """Parse an application-test log into a mapping of test id -> passed.

    The log is read line by line. A line matching ``app_test_regex`` sets the
    current application; a line matching ``app_test_case_regex`` records a
    result under the key ``'<app>_<case>'``.

    Parameters
    ----------
    filename : str
        Path of the log file to read.

    Returns
    -------
    dict
        ``{'<app>_<case>': bool}``, True when the case line reported ``OK``.
        A repeated key keeps the last result seen.
    """
    results = {}
    # A case line seen before any "Testing [...]" header used to raise
    # UnboundLocalError; such results are now filed under an 'unknown' prefix.
    current_app = 'unknown'
    with open(filename) as fh:
        for line in fh:
            app_match = app_test_regex.search(line)
            if app_match:
                current_app = app_match.group(1)
            case_match = app_test_case_regex.search(line)
            if case_match:
                results['{}_{}'.format(current_app, case_match.group(2))] = (case_match.group(1) == 'OK')
    return results

In [4]:
# "Working on Package [<package>]" header lines; group 1 is the package name.
cat_package_regex = re.compile(r'Working on Package \[(\w*)\]')
# Per-case result lines; group 1 is the outcome, group 2 the case name.
# NOTE(review): the three dots are unescaped, so they match any three
# characters, not only a literal "..." -- kept as-is to preserve matching.
cat_test_regex = re.compile(r'(OK|Failed) ... Case \[(\w*)\]')

def parse_cat_log(filename):
    """Parse a category-test log into a mapping of test id -> passed.

    The log is read line by line. A line matching ``cat_package_regex`` sets
    the current package; a line matching ``cat_test_regex`` records a result
    under the key ``'<package>_<case>'``.

    Parameters
    ----------
    filename : str
        Path of the log file to read.

    Returns
    -------
    dict
        ``{'<package>_<case>': bool}``, True when the case line reported
        ``OK``. A repeated key keeps the last result seen.
    """
    results = {}
    # A case line seen before any "Working on Package [...]" header used to
    # raise UnboundLocalError; such results are now filed under 'unknown'.
    current_package = 'unknown'
    with open(filename) as fh:
        for line in fh:
            package_match = cat_package_regex.search(line)
            if package_match:
                current_package = package_match.group(1)
            test_match = cat_test_regex.search(line)
            if test_match:
                results['{}_{}'.format(current_package, test_match.group(2))] = (test_match.group(1) == 'OK')
    return results

In [5]:
# Extracts year, month and day from names ending in "Test<YYYY>-<MM>-<DD>.log".
# Raw string, and the dot before "log" is escaped so it only matches a
# literal '.' (previously it matched any character).
date_regex = re.compile(r'Test(\d{4})-(\d{2})-(\d{2})\.log')

def get_date(filename):
    """Return the date embedded in a test-log file name.

    Parameters
    ----------
    filename : str
        File name or path containing ``Test<YYYY>-<MM>-<DD>.log``.

    Returns
    -------
    datetime.date or None
        The parsed date, or None when the name does not match ``date_regex``.

    Raises
    ------
    ValueError
        If the digits match the pattern but do not form a valid calendar
        date (raised by ``datetime.date``).
    """
    match = date_regex.search(filename)
    if match is None:
        return None
    year, month, day = (int(part) for part in match.groups())
    return datetime.date(year, month, day)

In [6]:
# Glob patterns selecting each kind of test log inside a system's directory.
app_test_glob = "appTest*.log"
cat_test_glob = "catTest*.log"
unit_test_glob = "unitTest*.log"

def _build_results_frame(directory, pattern, parser):
    """Parse every log in ``directory`` matching ``pattern`` into one DataFrame.

    Each log contributes one column keyed by the date from ``get_date``; the
    rows are the test ids returned by ``parser``. Columns are returned sorted.
    """
    results = {}
    # Sorted so that, if two logs share a date, which one wins is deterministic.
    for log_file in sorted(glob.glob(os.path.join(directory, pattern))):
        log_date = get_date(log_file)
        if log_date is None:
            # The glob is looser than the date pattern. Undated logs used to be
            # keyed under None, overwriting each other and making the column
            # sort fail when mixed with real dates, so they are skipped.
            continue
        results[log_date] = parser(log_file)
    frame = pd.DataFrame.from_dict(results)
    return frame.reindex(sorted(frame.columns), axis=1)

def get_test_data_frames(directory):
    """Load the app, cat and unit test logs found in ``directory``.

    Parameters
    ----------
    directory : str
        Directory containing ``appTest*.log``, ``catTest*.log`` and
        ``unitTest*.log`` files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(app_df, cat_df, unit_df)``. In each frame the index holds test ids,
        the columns hold log dates in ascending order, and each cell is the
        pass/fail boolean from that log (missing where the test was absent).
        A frame is empty when no matching dated log exists.
    """
    app_df = _build_results_frame(directory, app_test_glob, parse_app_log)
    cat_df = _build_results_frame(directory, cat_test_glob, parse_cat_log)
    unit_df = _build_results_frame(directory, unit_test_glob, parse_unit_log)
    return app_df, cat_df, unit_df

In [7]:
def compute_failure_frequencies(df):
    """Return, per row, the fraction of recorded results that are failures.

    A failure is a cell equal to False; recorded results are the non-null
    cells of the row. Rows with no recorded results yield NaN (0 / 0).
    """
    failures = df.eq(False).sum(axis=1)
    recorded = df.count(axis=1)
    return failures.div(recorded)

In [8]:
def compute_log_failure_frequencies(df):
    """Return, per row, the failure count divided by log(recorded results).

    A failure is a cell equal to False; recorded results are the non-null
    cells of the row. Because log(1) == 0, a row with exactly one recorded
    result is divided by zero (inf if it failed, NaN otherwise).
    """
    failures = df.eq(False).sum(axis=1)
    log_recorded = np.log(df.count(axis=1))
    return failures / log_recorded

In [9]:
def compute_changes(df):
    """Flag cells whose value differs from the previous column in the same row.

    A cell is True only when both it and its left-hand neighbour are non-null
    and their values differ; the first column is therefore always False.
    """
    previous = df.shift(axis=1)
    differs = df.ne(previous)
    both_recorded = df.notna() & previous.notna()
    return differs & both_recorded

In [10]:
def compute_change_frequencies(df):
    """Return, per row, the number of changes divided by recorded results.

    Changes are the True cells of ``compute_changes(df)``; recorded results
    are the non-null cells of the row.
    """
    change_counts = compute_changes(df).sum(axis=1)
    recorded = df.count(axis=1)
    return change_counts.div(recorded)

In [11]:
def compute_log_change_frequencies(df):
    """Return, per row, the number of changes divided by log(recorded results).

    Changes are the True cells of ``compute_changes(df)``; recorded results
    are the non-null cells of the row. Because log(1) == 0, a row with exactly
    one recorded result is divided by zero.
    """
    change_counts = compute_changes(df).sum(axis=1)
    log_recorded = np.log(df.count(axis=1))
    return change_counts / log_recorded

In [12]:
def compute_stats(df):
    """Build a per-row statistics table for a results frame.

    Returns a DataFrame indexed like ``df`` with four columns, in order:
    'freq', 'log_freq', 'change' and 'log_change', each produced by the
    correspondingly named ``compute_*`` function.
    """
    stats = compute_failure_frequencies(df).to_frame('freq')
    remaining_columns = (
        ('log_freq', compute_log_failure_frequencies),
        ('change', compute_change_frequencies),
        ('log_change', compute_log_change_frequencies),
    )
    for column, compute in remaining_columns:
        stats[column] = compute(df)
    return stats

In [13]:
def create_csvs(input_dir, output_dir):
    """Parse the logs in ``input_dir`` and write six CSV files to ``output_dir``.

    Three files hold the raw result frames (app, cat, unit) and three hold the
    matching statistics from ``compute_stats``. All frames are computed before
    anything is written. ``output_dir`` is not created here.
    """
    app_df, cat_df, unit_df = get_test_data_frames(input_dir)
    app_stats, cat_stats, unit_stats = (
        compute_stats(frame) for frame in (app_df, cat_df, unit_df)
    )

    # File name -> frame, in the order the files are written.
    outputs = [
        ('app_tests.csv', app_df),
        ('cat_tests.csv', cat_df),
        ('unit_tests.csv', unit_df),
        ('app_stats_tests.csv', app_stats),
        ('cat_stats_tests.csv', cat_stats),
        ('unit_stats_tests.csv', unit_stats),
    ]
    for file_name, frame in outputs:
        frame.to_csv(os.path.join(output_dir, file_name))

In [14]:
def process_all_data(input_dir, output_dir, sub_dirs):
    """Run ``create_csvs`` for each sub-directory, mirroring the layout.

    For every name in ``sub_dirs`` the logs are read from
    ``input_dir/<name>`` and the CSVs are written to ``output_dir/<name>``,
    which is created if it does not exist. A progress line is printed per
    sub-directory.
    """
    for sub_dir in sub_dirs:
        print('processing {}'.format(sub_dir))
        source = os.path.join(input_dir, sub_dir)
        target = os.path.join(output_dir, sub_dir)
        os.makedirs(target, exist_ok=True)
        create_csvs(source, target)

In [15]:
def compute_all_dataframes(input_dir, sub_dirs):
    """Parse logs and compute statistics for each sub-directory, in memory.

    Returns a dict keyed by sub-directory name. Each value is the list
    ``[app_df, cat_df, unit_df, app_stats, cat_stats, unit_stats]``.
    Progress messages are printed as each stage starts.
    """
    results = {}
    for sub_dir in sub_dirs:
        print('processing {}'.format(sub_dir))
        logs_path = os.path.join(input_dir, sub_dir)
        print('  parsing logs')
        raw_frames = get_test_data_frames(logs_path)
        print('  computing stats')
        stat_frames = [compute_stats(frame) for frame in raw_frames]
        # Raw frames first, then their stats, in the same app/cat/unit order.
        results[sub_dir] = list(raw_frames) + stat_frames
    return results

In [16]:
# Root directory holding one sub-directory of test logs per system.
# NOTE(review): hard-coded absolute path; the notebook only runs where it exists.
logs_dir = '/usgs/cpkgs/isis3/logs/builds-tests/'
# Destination root for CSV export; not referenced by the cells visible here.
output_dir = '/work/users/jmapel/ISIS3_test_stats'

# Sub-directory names under logs_dir, one per system. The variable names
# presumably give the OS of each build/test machine -- confirm.
Ubuntu_14 = 'Linux_x86_64_prog22'
Debian = 'Linux_unknown_prog23'
CentOS_7 = 'Linux_x86_64_prog24'
Fedora_25 = 'Linux_x86_64_prog25'
Mac10_11 = 'Darwin_i386_prog26'

# Every system to process, in processing order.
systems = [Ubuntu_14, Debian, CentOS_7, Fedora_25, Mac10_11]

In [17]:
# Maps each system's sub-directory name to
# [app_df, cat_df, unit_df, app_stats, cat_stats, unit_stats].
frames = compute_all_dataframes(logs_dir, systems)

processing Linux_x86_64_prog22
  parsing logs
  computing stats
processing Linux_unknown_prog23
  parsing logs
  computing stats
processing Linux_x86_64_prog24
  parsing logs
  computing stats
processing Linux_x86_64_prog25
  parsing logs
  computing stats
processing Darwin_i386_prog26
  parsing logs
  computing stats


In [36]:
# Tally how often each app test lands in a top-10 list, across every system
# and every metric.
app_dict = {}
metrics = ['freq', 'log_freq', 'change', 'log_change']
for system in systems:
    for metric in metrics:
        # Index 3 of each frames entry is the app statistics table.
        tests = frames[system][3][metric].nlargest(10).index.tolist()
        for test in tests:
            app_dict[test] = app_dict.get(test, 0) + 1

In [37]:
# Series indexed by test id; each value is that test's top-10 appearance count.
worst_app_tests = pd.Series(app_dict)

In [38]:
# Display the tests with the most top-10 appearances first.
worst_app_tests.sort_values(ascending=False)

tgocassisrdrgen_default          13
isis2pds_pds4                    12
tgocassis2isis_exportedFile      11
hyb1pds4gen_nirsitokawa3drefl    10
hyb1pds4gen_mars                 10
hyb1pds4gen_amicaitokawa2diof    10
tgocassismos_default              9
photcorri_mcewen                  9
photcorri_lommelSeeliger          8
tgocassisrdrgen_raw               7
photcorri_rolo                    6
photcorri_minnaert                6
csv2table_errors                  6
tgocassis2isis_errors             6
tgocassis2isis_blu                5
ckwriter_lronac                   4
sumspice_dawn                     4
findimageoverlaps_nooverlap       4
photomet_useDem                   4
cnettable_append                  4
cnettable_allowErrors             3
edrget_http                       3
tgocassis2isis_default            3
cnetcombinept_messenger           3
mdisedrinfo_kernelchk             3
cnetcombinept_noclean             3
findfeatures_debug                2
cnetcombinept_setapriori    