# Exploring new German data

Information from Philipp about the file naming:
    
%proj%=%spec%=%conn%=%tset%=%date%=%ptst%=%test%=%equi%=%tid%

%proj%  Project Title<br>
%spec%  Specimen Name<br>
%conn%  Connection Name<br>
%tset%  Testset Name<br>
%date%  Start Date<br>
%ptst%  Parent Test Name (empty if not existing)<br>
%test%  Test Name<br>
%equi%  Equipment Name<br>
%tid%   Unique Id for Test based on data location
        (available if test was imported to ahjo)<br>


In [1]:
import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import sys
from tqdm import tqdm

In [3]:
# Set up the translation into English for the column headers
# German column headers
german = ["Schritt","Zustand","Zeit","Programmdauer","Schrittdauer","Zyklus",
          "Zyklusebene","Prozedur","Prozedurebene","AhAkku","AhLad","AhEla",
          "AhStep","Energie","WhStep","Spannung","Strom","Temp13"]

# English translations
english = ["step", "state", "time", "programme duration", "step duration",
           "cycle", "cycle level", "procedure", "procedure level", "Qacc",
           "Qcha", "Qdch", "AhStep", "energy", "WhStep", "voltage",
           "current", "temp13"]

# Check list lengths match
assert(len(german) == len(english))

# Create a dictionary and view a test entry
translate = dict(zip(german, english))
print(translate['Zeit'])

time


In [5]:
# Define where the CSV files are stored and get a list of their paths
file_dir = "D:/Dropbox/UoE_Batteries/new_german_data/"
files = glob.glob(file_dir + "**/*.csv", recursive=True)
# Make a list of just the CSV file names (not paths) for easier file locating
csv_names = [file.split("\\")[1] for file in files]


# Specify a converter dictionary for use with pd.read_csv, to specify data types
# of columns contained within the CSV. Use the original German column names.
# Need to find out what to do with the time column. Leave as "object" for now.
dtypes = [int, str, object, float, float, int, int, str, int,
          float, float, float, float, float, float, float, float, float]

converter = dict(zip(german, dtypes))

## Build a function to read the data from a file and handle the translation

In [13]:
def load_from_csv(fpath, converter, fields=None, translation=True): 
    '''
    Load data from CSV files provided by Philipp at Aachen.
    Handle data type conversion, loading a specific set of columns and translating column headers.
    
    Inputs:
        fpath (type: str)
            Path to the CSV file you want to load
        
        converter (type: dict)
            Dict to map data in columns to desired data types.
            Keys: German column names.
            Values: Data type of column
        
        fields (type: list)
            A list of German column names that should be loaded from the CSV file
            
        translation (type: bool, default: True)
            Whether or not to translate the German column headers into English
            (requires a dict called "translate", defined outside the scope of this function)
        
    '''
    
    df = pd.read_csv(fpath, skiprows=[1], header=0, dtype=converter)
    
    # Initialise variable name so there's something to return irrespective of translation bool state
    translation_error_log = None
    
    # Go through translation routine if required. Check for failed translations.
    if translation:
        # Translate German columns where a translation exists in the dictionary, else leave the German.
        translated_cols = [translate[ger_col] if ger_col in translate.keys() else ger_col for ger_col in df.columns]
        
        # Store any column names that haven't been translated
        failed_translations = np.where([col not in translate.values() for col in translated_cols])[0]
        
        # This condition fails if len(failed_translations)==0
        if np.any(failed_translations):
            # Add the file path, as well as all failed translations and their column index
            translation_error_log = [fpath, [(idx, df.columns[idx]) for idx in failed_translations]]
        
        # Replace the column names with the translated names
        df.columns = translated_cols
        
    
    # Get rid of null rows, if present
    df.dropna(inplace=True)
    # Reset the indices in case of null row deletion
    df.reset_index(inplace=True, drop=True)
    
    return df, translation_error_log



In [14]:
# Load an example file using the function
data_from_fn, _ = load_from_csv(files[10], converter, translation=True)

### Test the load_from_csv function and examine translation results
Some files have additional columns. Find all unique column names and see if we need these extra ones. If so, add the German and English to the translate dictionary

In [15]:
# Initialise a list to store filepaths for files that raise an exception inside the function
failed_files = []
# Initialise a set to store unique German column names
unique_columns = set()

for i, f in tqdm(enumerate(files)):
    try:
        # Set translation to False so we get the German column names
        data, trans_error_log = load_from_csv(f, converter, translation=False)
        # Get the German column names and add them to the set
        for col in data.columns:
            # Add every column name from every file to the set
            unique_columns.add(col)
            
    except:
        e = sys.exc_info()[1]
        failed_files.append([f, e])

69it [00:08,  7.91it/s]


In [16]:
# Now let's look at the column names that are not already contained in our "german" list
# Get the column names that are present in both "unique_columns" and "german" variables
intersection = np.intersect1d(list(unique_columns), german)

# Find the column names in "unique_columns" but NOT in "german".
# symmetric_difference is a method of the set class. It returns a set
new_cols = list(unique_columns.symmetric_difference(german))

# We can see that for this first batch of files, at least, these additional columns
# don't seem to be important for us. They are mostly related to temperatures.
# TODO - find out what Agilent is. Translator doesn't work.
print(new_cols)

['Temp23', 'Temp0029', 'ActTemp', 'ClimaEN', 'Temp0028', 'Temp', 'SetTemp', 'Temp0030', 'Agilent', 'ClimaOn', 'Temp0027']


## Look at the data from an example file

In [35]:
data, _ = load_from_csv(files[12], converter, translation=True)
print(f"{len(data)} rows")
print(data.head())

25791 rows
   step state                    time  ...   voltage  current   temp13
0     4   PAU  2013-03-11 08:39:05.56  ...  3.853787      0.0  28.0000
1     4   PAU  2013-03-11 08:39:15.59  ...  3.853787      0.0  28.0000
2     4   PAU  2013-03-11 08:39:25.59  ...  3.853787      0.0  27.9375
3     4   PAU  2013-03-11 08:39:35.58  ...  3.853420      0.0  28.0000
4     4   PAU  2013-03-11 08:39:45.61  ...  3.853420      0.0  28.0000

[5 rows x 18 columns]


In [36]:
# Find the unique step values
print(data['step'].unique())

# Find the unique state values
print(data['state'].unique())

# Look at the state value for each of these steps
for step_num in data['step'].unique():
    temp_df = data[data['step'] == step_num]
    print(step_num)
    print(temp_df.head(1))
    print()

[   4    7    8    1    2    3    6    5    9   13   14   15   17   18
   19   23   24   25   27   28   29   33   34   35   37   38   39   43
   44   45   47   48   49   36 9999]
['PAU' 'DCH' 'CHA' 'STO']
4
   step state                    time  ...   voltage  current  temp13
0     4   PAU  2013-03-11 08:39:05.56  ...  3.853787      0.0    28.0

[1 rows x 18 columns]

7
     step state                    time  ...   voltage  current   temp13
362     7   PAU  2013-03-11 09:39:05.76  ...  3.854154      0.0  28.0625

[1 rows x 18 columns]

8
     step state                    time  ...   voltage  current   temp13
364     8   PAU  2013-03-11 09:39:05.96  ...  3.854154      0.0  28.0625

[1 rows x 18 columns]

1
     step state                    time  ...   voltage   current   temp13
366     1   DCH  2013-03-11 09:39:06.37  ...  3.786102 -1.712262  28.0625

[1 rows x 18 columns]

2
      step state                    time  ...   voltage   current   temp13
1219     2   CHA  2013-03-11 11:31

In [39]:
# Plot some data
# Get a DataFrame for a CHA step
cha_df = data[data['state']=='CHA']
V = cha_df['voltage'].to_numpy()
I = cha_df['current'].to_numpy()

fig, ax = plt.subplots(1,2)
ax[0].plot(V)
ax[0].plot(I)

plt.show()

In [38]:
# Find out how many cycles there are
print("Number of cycles: ", len(cha_df['cycle'].unique()))
print("Min. cycle number: ", np.min(cha_df['cycle']))
print("Max cycle number: ", np.max(cha_df['cycle']))

Number of cycles:  1
Min. cycle number:  0
Max cycle number:  0


In [42]:
# Get the data from a particular step
step_44_data = data[data['step']==44]
V = step_44_data['voltage'].to_numpy()
I = step_44_data['current'].to_numpy()
Q = step_44_data['Qcha'].to_numpy()

fig, ax = plt.subplots(1,2)
ax[0].plot(V)
ax[0].plot(I)
ax[1].plot(Q)
plt.show()