## Reading of ASCII files created for cams diagnostics tool

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import os
from helper_funcs import read_file_custom

data_dir = "./data/michael_ascii_read/"
case_ok = "./data/from_ada/table_GLBL_ANN_obs_FIXED.asc"

VERBOSE = True

In [2]:
files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".webarchive")]

for file in files:
    print(file)
    
test_file = files[0]

print("TEST FILE: {}".format(os.path.basename(test_file)))

./data/michael_ascii_read/N1850_f19_tn14_r227_ctrl (yrs 310-340).webarchive
./data/michael_ascii_read/N1850_f19_tn14_r227_ctrl (yrs 80-110).webarchive
./data/michael_ascii_read/N1850_f09_tn14_230218 (yrs 1-20).webarchive
./data/michael_ascii_read/N1850_f19_tn14_r227_ctrl (yrs 185-215).webarchive
./data/michael_ascii_read/N1850C53CLM45L32_f09_tn11_191017 (yrs 71-100).webarchive
./data/michael_ascii_read/N1850_f19_tn14_r265_ctrl_20180411 (yrs 90-120).webarchive
TEST FILE: N1850_f19_tn14_r227_ctrl (yrs 310-340).webarchive


Try read first file as is with pandas

In [3]:
try:
    frame = pd.read_csv(test_file, encoding="latin-1")
except Exception as e:
    print(repr(e))
frame.head()

Unnamed: 0,bplist00Ñ_WebMainResourceÕ
0,_WebResourceTextEncodingName^WebResourceUR...
1,TEST CASE: N1850_f19_tn14_r227_ctrl (yrs 310-340)
2,CONTROL CASE: OBS data
3,Variable N1850_f19_tn14_r227_ctrl OBS...
4,...


In [4]:
frame = pd.read_csv(case_ok)
frame.head()
frame.shape

(67, 1)

This did not really work since the data is not splitted by columns but includes one column containing the content of each row. The reading has to be done from scratch, especially also because there is some variables with too long names (e.g. L.25 and L. 28) that stick together the first two columns. 

This folder contains a file ``helper_funcs.py`` in which I defined a custom read function ``read_file_custom`` that can convert these files into pandas dataframes.

In [5]:
help(read_file_custom)

Help on function read_file_custom in module helper_funcs:

read_file_custom(fpath, verbose=False)
    Custom ASCII conversion method 
    
    Parameters
    ----------
    fpath : str
        path to file location
    verbose : bool
        if True, print output (defaults to False)
    Returns
    -------
    Dataframe 
        pandas data frame ready for further analysis



Now put the results into dataframe.

In [6]:
df = read_file_custom(test_file, VERBOSE)
df.head()

Ignoring line: bplist00Ñ_WebMainResourceÕ	
Ignoring line: 
Ignoring line: 
Ignoring line: _WebResourceTextEncodingName^WebResourceURL_WebResourceFrameName_WebResourceData_WebResourceMIMETypeUUTF-8_}http://ns2345k.web.sigma2.no/noresm_diagnostics/N1850_f19_tn14_r227_ctrl/CAM_DIAG/yrs310to340-obs/set1/table_GLBL_ANN_obs.ascPOf<html><head></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">DIAG SET 1: ANN MEANS GLOBAL
Ignoring line:  
Ignoring line:  
Ignoring line:  
Problem case FSNTOA_CERES-EBAF
Problem case FSNTOAC_CERES-EBAF
Test case: N1850_f19_tn14_r227_ctrl 
Control case: OBS data


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Model,Obs,Bias,RMSE
Variable,Run,Years,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RESTOM,N1850_f19_tn14_r227_ctrl,310-340,0.327,,,
RESSURF,N1850_f19_tn14_r227_ctrl,310-340,0.337,,,
RESTOA_CERES-EBAF,N1850_f19_tn14_r227_ctrl,310-340,2.412,0.992,1.42,8.937
RESTOA_ERBE,N1850_f19_tn14_r227_ctrl,310-340,2.412,0.059,2.353,8.906
SOLIN_CERES-EBAF,N1850_f19_tn14_r227_ctrl,310-340,340.2,340.054,0.146,0.417


See if the corrected file also works.

In [7]:
df1 = read_file_custom(case_ok, VERBOSE)

Ignoring line: DIAG SET 1: ANN MEANS GLOBAL
Ignoring line:  
Ignoring line:  
Ignoring line:  
Problem case FSNTOA_CERES-EBAF
Problem case FSNTOAC_CERES-EBAF
Test case: N1850_f19_tn14_r265_ctrl_20180411 
Control case: OBS data


Reading worked, check, if both dataframes have the same dimension.

In [8]:
print(df.shape)
print(df1.shape)

(63, 4)
(63, 4)


In [46]:
dfs = []
#only load first two for now
for k in range(2):
    df = read_file_custom(files[k], verbose=False)
    #print(df.head(), "\n\n")
    dfs.append(df)
    if VERBOSE:
        print("Number of variables / values")
        print(df.shape)        

Number of variables / values
(63, 4)
Number of variables / values
(63, 4)


In [47]:
merged = pd.concat(dfs)
merged

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Model,Obs,Bias,RMSE
Variable,Run,Years,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RESTOM,N1850_f19_tn14_r227_ctrl,310-340,0.327,,,
RESSURF,N1850_f19_tn14_r227_ctrl,310-340,0.337,,,
RESTOA_CERES-EBAF,N1850_f19_tn14_r227_ctrl,310-340,2.412,0.992,1.420,8.937
RESTOA_ERBE,N1850_f19_tn14_r227_ctrl,310-340,2.412,0.059,2.353,8.906
SOLIN_CERES-EBAF,N1850_f19_tn14_r227_ctrl,310-340,340.200,340.054,0.146,0.417
SOLIN_CERES,N1850_f19_tn14_r227_ctrl,310-340,340.200,341.479,-1.279,1.296
CLDTOT_ISCCP,N1850_f19_tn14_r227_ctrl,310-340,68.234,66.800,1.435,13.078
CLDTOT_CLOUDSAT,N1850_f19_tn14_r227_ctrl,310-340,68.234,66.824,1.411,10.952
FLDS_ISCCP,N1850_f19_tn14_r227_ctrl,310-340,354.846,343.347,11.499,17.664
FLNS_ISCCP,N1850_f19_tn14_r227_ctrl,310-340,56.272,49.425,6.847,14.174


In [48]:
bias_mean = merged.groupby(["Variable", "Run", "Years", "Bias"]).mean()
print(bias_mean.shape)
bias_mean

(114, 3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Model,Obs,RMSE
Variable,Run,Years,Bias,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CLDTOT_CLOUDSAT,N1850_f19_tn14_r227_ctrl,310-340,1.411,68.234,66.824,10.952
CLDTOT_CLOUDSAT,N1850_f19_tn14_r227_ctrl,80-110,2.133,68.956,66.824,10.894
CLDTOT_ISCCP,N1850_f19_tn14_r227_ctrl,310-340,1.435,68.234,66.800,13.078
CLDTOT_ISCCP,N1850_f19_tn14_r227_ctrl,80-110,2.157,68.956,66.800,12.869
FLDS_ISCCP,N1850_f19_tn14_r227_ctrl,310-340,11.499,354.846,343.347,17.664
FLDS_ISCCP,N1850_f19_tn14_r227_ctrl,80-110,5.162,348.508,343.347,16.720
FLNS_ISCCP,N1850_f19_tn14_r227_ctrl,310-340,6.847,56.272,49.425,14.174
FLNS_ISCCP,N1850_f19_tn14_r227_ctrl,80-110,7.167,56.592,49.425,14.516
FLUTC_CERES,N1850_f19_tn14_r227_ctrl,310-340,0.599,267.477,266.878,5.884
FLUTC_CERES,N1850_f19_tn14_r227_ctrl,80-110,-1.614,265.265,266.878,7.486


In [45]:
bias_mean.unstack(level=(1,2))

Unnamed: 0_level_0,Unnamed: 1_level_0,Model,Obs,RMSE
Unnamed: 0_level_1,Run,N1850_f19_tn14_r265_ctrl_20180411,N1850_f19_tn14_r265_ctrl_20180411,N1850_f19_tn14_r265_ctrl_20180411
Unnamed: 0_level_2,Years,90-120,90-120,90-120
Variable,Bias,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
CLDTOT_CLOUDSAT,3.923,70.747,66.824,10.561
CLDTOT_ISCCP,3.947,70.747,66.8,12.485
FLDS_ISCCP,4.507,347.854,343.347,15.278
FLNS_ISCCP,6.925,56.35,49.425,13.988
FLUTC_CERES,-2.009,264.869,266.878,6.708
FLUTC_CERES-EBAF,-1.182,264.869,266.051,4.778
FLUTC_ERBE,0.44,264.869,264.429,4.878
FLUT_CERES,-0.221,238.742,238.963,6.571
FLUT_CERES-EBAF,-0.832,238.742,239.574,6.598
FLUT_ERBE,4.796,238.742,233.946,8.126
