### Import the necessary modules and packages

In [0]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from zipfile import ZipFile   
from glob import glob
from datetime import datetime
from matplotlib.dates import DateFormatter, MonthLocator
# timestring is used below to parse the time stamps of the level files
!pip install timestring
import timestring

### Mount Google Drive to read and save files

In [0]:
# Colab helper module that exposes the Google Drive mount function
from google.colab import drive
# Mount the Drive at /content/drive; the data paths used below start there
drive.mount('/content/drive')

### Data source directory 

In [0]:
# Folder on the mounted Drive that serves as the data root for this notebook
data_source = '/content/drive/My Drive/ewin_mex_data/'
# Names of the entries directly inside the data root (os.listdir is not recursive)
file_names = os.listdir(data_source)

### Create an empty folder to unzip files

In [0]:
# Create the folder for the unzipped files (if it doesn't exist yet)
unzip_data_folder = os.path.join(data_source,"unzipped_data")
try:
    os.mkdir(unzip_data_folder)
except FileExistsError:
    # Only an existing path is reported as "exists"; any other OSError
    # (permission denied, missing parent folder, ...) is allowed to surface
    print ("folder exists already %s:" % unzip_data_folder)
else:
    print ("Successfully created the directory %s " % unzip_data_folder)

### Unzip all documents recursively 

In [0]:
def unzip_files(data_source,output_file):
  """Extract every ``.zip`` archive found under ``data_source``.

  Walks ``data_source`` recursively, prints the path of each file whose name
  ends with ``.zip`` and extracts the full content of that archive into
  ``output_file``.

  Parameters
  ----------
  data_source : str
      Directory that is searched recursively for zip archives.
  output_file : str
      Directory that receives the extracted content of all archives
      (despite its name this is a folder, not a file).

  Returns
  -------
  None
  """
  for path, _dir_list, file_list in os.walk(data_source):
    for file_name in file_list:
      if not file_name.endswith(".zip"):
        continue
      abs_file_path = os.path.join(path, file_name)
      print(abs_file_path)
      # the context manager closes the archive even if the extraction fails
      with ZipFile(abs_file_path, 'r') as zip_obj:
        zip_obj.extractall(output_file)
  print("Successfully unzipped!")

In [0]:
# Extract every archive found under data_source into the unzipped_data folder
unzip_files(data_source=data_source, output_file=unzip_data_folder)

### RiverCore data
Find all file paths 

In [0]:
# Collect the full path of every unzipped file whose name contains "Node"
substring = "Node"#_10
all_lvl_files = [
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk(unzip_data_folder)
    for entry in entries
    if substring in entry
]

print(all_lvl_files)

### Datetime management libraries 

In [0]:
# Walk through the cleaning steps on one example level file
example_file = all_lvl_files[7]
# `error_bad_lines` was replaced by `on_bad_lines` in newer pandas releases:
# try the new keyword first and fall back to the old one for older installs
try:
    data = pd.read_csv(example_file, sep='\t', on_bad_lines='skip', engine='python') #, header=None
except TypeError:
    data = pd.read_csv(example_file, sep='\t', error_bad_lines=False, engine='python')
# Data summary
print(data.dtypes)
print(data.describe())
print(data.columns)
# rename variables
new_columns = ['time_stamp','wl', 'sm']
data.columns = new_columns
# Check for NaNs (printed: a bare expression in the middle of a cell shows nothing)
print(data.isna().any())
# remove the four characters that precede the last character of time_stamp
data['time_stamp'] = data['time_stamp'].str[:-5]+data['time_stamp'].str[-1:]
# correct the month name
if 'September' in example_file:
  data['time_stamp'] = data['time_stamp'].str.replace('Septeber','september')
# apply timestring function
data['time_stamp'] = data['time_stamp'].apply(timestring.Date)
# convert date to string
data['time_stamp'] = data['time_stamp'].apply(str)
# parse datetime
data['time_stamp'] = pd.to_datetime(data['time_stamp'], format='%Y-%m-%d %H:%M:%S')
# convert both measurement columns to numeric in one vectorised call each;
# values that cannot be parsed become NaN
data['wl'] = pd.to_numeric(data['wl'], errors='coerce')
data['sm'] = pd.to_numeric(data['sm'], errors='coerce')
# set time_stamp as index
data = data.set_index('time_stamp')
# print dtypes
print(data.dtypes)
# aggregate to hourly ts
hourly_lvl = data.resample('h').mean()

In [0]:
# Quick visual check of the hourly example series: both columns on one axes
fig, ax = plt.subplots()
ax.plot(hourly_lvl['wl'])
ax.plot(hourly_lvl['sm'])

### Define a function that reads the level files and exports tidy files 

In [0]:
def lvl_data_prep(file_name):
  """Read one raw level file and return its hourly means.

  The tab-separated file is read (malformed lines are skipped), its three
  columns are named ``time_stamp``, ``wl`` and ``sm``, the time stamps are
  cleaned and parsed, ``wl``/``sm`` are converted to numbers (values that
  cannot be parsed become NaN) and everything is averaged per hour.

  Parameters
  ----------
  file_name : str
      Path of the raw level file. If the path contains ``'September'`` the
      misspelled month name ``'Septeber'`` is corrected before parsing.

  Returns
  -------
  pandas.DataFrame
      Hourly mean of ``wl`` and ``sm``, indexed by ``time_stamp``.
  """
  # `error_bad_lines` was replaced by `on_bad_lines` in newer pandas releases:
  # try the new keyword first and fall back to the old one for older installs
  try:
    data = pd.read_csv(file_name, sep='\t', on_bad_lines='skip', engine='python') #, header=None
  except TypeError:
    data = pd.read_csv(file_name, sep='\t', error_bad_lines=False, engine='python')
  # rename variables
  data.columns = ['time_stamp','wl', 'sm']
  # remove the four characters that precede the last character of time_stamp
  data['time_stamp'] = data['time_stamp'].str[:-5]+data['time_stamp'].str[-1:]
  # correct September
  if 'September' in file_name:
    data['time_stamp'] = data['time_stamp'].str.replace('Septeber','september')
  # remove rows whose time stamp is just "e"; .copy() makes the filtered frame
  # independent so the column assignments below do not write into a slice
  data = data[data.time_stamp!="e"].copy()
  # apply the timestring function
  data['time_stamp'] = data['time_stamp'].apply(timestring.Date)
  # convert date to string
  data['time_stamp'] = data['time_stamp'].apply(str)
  # parse dates
  data['time_stamp'] = pd.to_datetime(data['time_stamp'], format='%Y-%m-%d %H:%M:%S')
  # convert both measurement columns to numeric (vectorised, unparseable -> NaN)
  data['wl'] = pd.to_numeric(data['wl'], errors='coerce')
  data['sm'] = pd.to_numeric(data['sm'], errors='coerce')
  # set time_stamp as index
  data = data.set_index('time_stamp')
  # aggregate to hourly data
  hourly_data = data.resample('h').mean()
  return hourly_data

In [0]:
# create new folder for the processed hourly level data
hourly_level = os.path.join(data_source,"hourly_level")
try:
    os.mkdir(hourly_level)
except FileExistsError:
    # Only an existing path is reported as "exists"; any other OSError
    # (permission denied, missing parent folder, ...) is allowed to surface
    print ("file exists already %s:" % hourly_level)
else:
    print ("Successfully created the directory %s " % hourly_level)

### aggregate to hourly and save files by station

In [0]:
# Station labels searched for in each file path (Node1 ... Node14)
rc_stations = ['Node'+str(i) for i in range(1,15)]
# iterate through all level files
for lvl_file in all_lvl_files:
  print(lvl_file)
  # Pick the station label contained in the path. The last match wins, so a
  # "Node10" file is not mistaken for "Node1" (which is also a substring).
  fname = None
  for nd in rc_stations:
    if nd in lvl_file:
      fname = nd
  if fname is None:
    # without this guard the station name of the previous file would be
    # reused and the data written to the wrong station file
    print("no station name found, file skipped: %s" % lvl_file)
    continue
  hourly_lvl = lvl_data_prep(lvl_file)
  # define path
  new_name_path = os.path.join(hourly_level,f'rc_{fname}.csv')
  # if the file exists append, if not create new
  # (NOTE: re-running this cell appends the same rows again)
  if os.path.isfile(new_name_path):
    hourly_lvl.to_csv(new_name_path,   mode='a', header=False)
  else:
    hourly_lvl.to_csv(new_name_path)

### plot levels

In [0]:
# list files
lvl_file_names = os.listdir(hourly_level)
# Initialize the figure. The seaborn style sheets were renamed in newer
# matplotlib releases, so use whichever name this installation provides.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
  if style_name in plt.style.available:
    plt.style.use(style_name)
    break
# create a color palette
palette = plt.get_cmap('Set3')

# Size the subplot grid from the number of files: a fixed 3x3 grid raises
# as soon as there are more than nine station files.
n_cols = 3
n_rows = max(3, -(-len(lvl_file_names) // n_cols))  # ceiling division

fig = plt.figure(figsize=(20, 12))
# general title (set once for the whole figure)
fig.suptitle("EWIN Mex Water level ", fontsize=25, fontweight=0,
             color='black', style='italic', y=1.02)

for num, i in enumerate(lvl_file_names, start=1):
  this_file = os.path.join(hourly_level,i)
  # parse_dates=True parses the index column (time_stamp) as datetimes
  df = pd.read_csv(this_file, parse_dates=True, index_col='time_stamp')
  # sort by datetime (files written in append mode are not guaranteed sorted)
  df = df.sort_index()
  df = df.resample('h').mean()
  ax = fig.add_subplot(n_rows, n_cols, num)
  ax.plot(df['wl'], marker='', color='blue', linewidth=1.9, alpha=0.9, label=i)
  # Add title: file name without its 4-character extension
  ax.set_title(i[:-4], loc='center', fontsize=12, fontweight=0, color="black")
  # one tick per month, labelled like "Sep-2019"; each axes gets its own
  # locator/formatter instance
  ax.xaxis.set_major_locator(MonthLocator())
  ax.xaxis.set_major_formatter(DateFormatter("%b-%Y"))

### Atmos data

In [0]:
# Same path as assigned earlier in the notebook; re-assigned here before the Atmos section
unzip_data_folder = os.path.join(data_source,"unzipped_data")

### Find all ATMOS file paths 

In [0]:
# Collect the full path of every unzipped file whose name contains "_10"
substring = "_10"
all_prcp_files = [
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk(unzip_data_folder)
    for entry in entries
    if substring in entry
]

all_prcp_files

In [0]:
# create new folder for the processed hourly atmos data
hourly_atmos = os.path.join(data_source,"hourly_atmos")
try:
    os.mkdir(hourly_atmos)
except FileExistsError:
    # Only an existing path is reported as "exists"; any other OSError
    # (permission denied, missing parent folder, ...) is allowed to surface
    print ("file exists already %s:" % hourly_atmos)
else:
    print ("Successfully created the directory %s " % hourly_atmos)

In [0]:
def _read_atmos_file(path):
  """Read one raw ATMOS text file, indexed by its parsed ``Date`` column.

  The file is read as tab-separated text and malformed lines are skipped.
  Files whose path contains ``'sep'`` are decoded as UTF-16; all other files
  use the pandas default encoding. The ``Date`` column becomes the index and
  is parsed with the day-first format ``%d/%m/%Y %H:%M``.
  """
  read_kwargs = {'sep': '\t', 'index_col': 'Date'}
  if 'sep' in path:
    read_kwargs['encoding'] = 'utf-16'
  # `error_bad_lines` was replaced by `on_bad_lines` in newer pandas releases:
  # try the new keyword first and fall back to the old one for older installs
  try:
    table = pd.read_csv(path, on_bad_lines='skip', **read_kwargs)
  except TypeError:
    table = pd.read_csv(path, error_bad_lines=False, **read_kwargs)
  # parse the dates after reading (the `date_parser` argument of read_csv is
  # deprecated); an explicit format keeps the day/month order unambiguous
  table.index = pd.to_datetime(table.index, format="%d/%m/%Y %H:%M")
  return table


# Station labels searched for in each file path (Atmos1 ... Atmos8)
atmos_stations = ['Atmos'+str(i) for i in range(1,9)]

for p in all_prcp_files:
  print(p)
  # pick the station label contained in the path (last match wins)
  fname = None
  for s in atmos_stations:
    if s in p:
      fname = s
  if fname is None:
    # without this guard the station name of the previous file would be
    # reused and the data written to the wrong station file
    print("no station name found, file skipped: %s" % p)
    continue
  # read txt file
  data = _read_atmos_file(p)
  # rename variables
  new_columns = ['air_temp','relative_hum', 'prcp','solar_rad','vapor_press']
  data.columns = new_columns
  # name the datetime index time_stamp
  data.index.names = ['time_stamp']
  print('missing data per column:\n',data.isnull().sum())
  # rows without an air temperature value (kept for inspection)
  missing_data = data[data['air_temp'].isnull()]
  # quick plot
  data['prcp'].plot()

  # to hourly: mean for every column except prcp, sum for prcp;
  # min_count=1 leaves hours without any valid value as NaN instead of 0
  set1_noPrcp = data.drop(columns='prcp').resample('h').mean()
  set2_Prcp = data.prcp.resample('h').sum(min_count=1)
  atmos_hourly = pd.concat([set1_noPrcp, set2_Prcp], axis=1)
  # define path
  new_name_path = os.path.join(hourly_atmos,f'{fname}.csv')
  # if the file exists append, if not create new
  # (NOTE: re-running this cell appends the same rows again)
  if os.path.isfile(new_name_path):
    atmos_hourly.to_csv(new_name_path,mode='a', header=False)
  else:
    atmos_hourly.to_csv(new_name_path)



In [0]:
# list files
prcp_file_names = os.listdir(hourly_atmos)
# Initialize the figure. The seaborn style sheets were renamed in newer
# matplotlib releases, so use whichever name this installation provides.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
  if style_name in plt.style.available:
    plt.style.use(style_name)
    break
# variable to plot across stations
var = "prcp"

# Size the subplot grid from the number of files so that more than nine
# station files do not overflow a fixed 3x3 grid.
n_cols = 3
n_rows = max(3, -(-len(prcp_file_names) // n_cols))  # ceiling division

# figure config
fig = plt.figure(figsize=(20, 12))
# general title (set once for the whole figure)
fig.suptitle("EWIN Mex Atmos ", fontsize=25, fontweight=0,
             color='black', style='italic', y=1.02)

for num, i in enumerate(prcp_file_names, start=1):
  this_file = os.path.join(hourly_atmos,i)
  # parse_dates=True parses the index column (time_stamp) as datetimes
  df = pd.read_csv(this_file, parse_dates=True, index_col='time_stamp')
  # sort by datetime (files written in append mode are not guaranteed sorted)
  df = df.sort_index()
  df = df.resample('h').mean()
  ax = fig.add_subplot(n_rows, n_cols, num)
  ax.plot(df[var], marker='', color='blue', linewidth=1.9, alpha=0.9, label=i)
  # Add title: file name without its 4-character extension, followed by var
  ax.set_title(i[:-4]+var, loc='center', fontsize=12, fontweight=0, color="black")
  # one tick per month, labelled like "Sep-2019"; each axes gets its own
  # locator/formatter instance
  ax.xaxis.set_major_locator(MonthLocator())
  ax.xaxis.set_major_formatter(DateFormatter("%b-%Y"))

'/content'