To open a `.zip` archive containing `.tar.gz` files, extract the desired file, read it into a Pandas DataFrame, discard the header and extra lines, and then convert it into a Polars DataFrame, you can use the following code:

In [2]:
import os 
import zipfile
import tarfile
import pandas as pd
import polars as pl
import io
import re
import shutil
# Root folder holding the per-instrument data sub-folders (machine-specific absolute path).
root = "C:/Users/localadmin/Documents/git/thermo/data"
# Scratch folder beneath root that the archives are extracted into; created up front if missing.
tmp = os.path.join(root, "tmp")
os.makedirs(tmp, exist_ok=True)

In [None]:
# Unpack every zip archive found in the instrument's data folder into tmp.
instr = "tei49c"
files = os.listdir(os.path.join(root, instr))
# extract all archives to temporary directory
for file in files:
    archive_path = os.path.join(root, instr, file)
    # os.listdir also returns sub-directories and unrelated files; skip
    # anything that is not a zip archive instead of aborting the whole run
    # with BadZipFile / IsADirectoryError.
    if not zipfile.is_zipfile(archive_path):
        print(f"skipping non-zip entry: {archive_path}")
        continue
    with zipfile.ZipFile(archive_path, "r") as zfh:
        zfh.extractall(tmp)


In [81]:
def _parse_tei49c(source) -> pl.DataFrame:
    """Parse one space-separated tei49c payload given as a file path or raw bytes."""
    # assumes the "standard" format with columns
    # "pcdate", "pctime", "time", "date", "o3", "flags", "cellai", "cellbi", "bncht", "lmpt", "o3lt", "flowa", "flowb", "pres"
    df = pl.read_csv(source=source, has_header=True, separator=" ", comment_char="l")

    # fix error in header in some older files: an empty column name means the
    # header line cannot be trusted, so re-read the data with explicit names
    if not df.columns[2]:
        # TODO: these files carry no "pcdate"/"pctime" columns; add them as
        # null columns if a uniform schema across files is required.
        df = pl.read_csv(source=source, has_header=False, separator=" ", comment_char="l", skip_rows=1,
                         new_columns=["time", "date", "o3", "flags", "cellai", "cellbi", "bncht", "lmpt", "o3lt", "flowa", "flowb", "pres"])
    elif not df.columns[4]:
        df = pl.read_csv(source=source, has_header=False, separator=" ", comment_char="l", skip_rows=1,
                         new_columns=["pcdate", "pctime", "time", "date", "o3", "flags", "cellai", "cellbi", "bncht", "lmpt", "o3lt", "flowa", "flowb", "pres"])
        # NOTE(review): with_columns is handed a dict here; polars presumably
        # iterates its keys and treats them as existing column names, which
        # would make this a no-op rather than nulling the two columns -
        # confirm the intent before changing it.
        df = df.with_columns({"pcdate": [None for i in range(len(df))], "pctime": [None for i in range(len(df))]})

    return df


def read_tei49c_dat(source: str) -> pl.DataFrame:
    """Read a tei49c data file into a Polars DataFrame.

    Args:
        source: Path to a ``.dat`` file, or to a ``.zip`` archive that
            contains one. For an archive, the first member whose name ends
            in ``.dat`` is read; any further members are ignored.

    Returns:
        The parsed data. An empty ``pl.DataFrame`` is returned (and the
        error printed) when the file cannot be opened or parsed, or when an
        archive holds no ``.dat`` member, so callers can test ``is_empty()``.
    """
    try:
        # Suffix test instead of re.search(".zip", ...): the unescaped dot
        # matched any character and the pattern matched anywhere in the path.
        if source.lower().endswith(".zip"):
            with zipfile.ZipFile(source, "r") as archive:
                members = [name for name in archive.namelist() if name.lower().endswith(".dat")]
                if not members:
                    print(f"no .dat member found in {source}")
                    return pl.DataFrame()
                # Raw bytes can be re-read, which the header fix-up in the
                # parser relies on.
                payload = archive.read(members[0])
            return _parse_tei49c(payload)
        return _parse_tei49c(source)
    except Exception as err:
        # Best-effort boundary: report the problem and signal failure with
        # an empty frame rather than aborting a batch run.
        print(err)
        return pl.DataFrame()

In [None]:
# Parse every tei49c file found under tmp, stack the successfully parsed
# frames into df_tot, and sort the processed files into tmp/failed and
# tmp/success.
df_tot = pl.DataFrame()
failed_dir = os.path.join(tmp, "failed")
success_dir = os.path.join(tmp, "success")
os.makedirs(failed_dir, exist_ok=True)
os.makedirs(success_dir, exist_ok=True)
# "[zip|dat]" was a character class matching a single letter; use a real
# alternation, an escaped dot and an end anchor.
tei49c_file = re.compile(r"tei49c-\d+\.(zip|dat)$")
for base, dirs, files in os.walk(tmp):
    # Do not descend into the result folders, otherwise files sorted by an
    # earlier run would be processed again.
    dirs[:] = [d for d in dirs if os.path.join(base, d) not in (failed_dir, success_dir)]
    for file in files:
        source = os.path.join(base, file)
        print(source)
        if not tei49c_file.search(file):
            continue
        df = read_tei49c_dat(source)
        # is_empty is a method; without the call the bound method is always
        # truthy and every file was treated as failed.
        if df.is_empty():
            # Join with the bare file name: joining with the absolute
            # `source` discards the preceding components, so the file would
            # be "moved" onto itself.
            shutil.move(src=source, dst=os.path.join(failed_dir, file))
        else:
            # The initial frame has no columns, so adopt the first parsed
            # frame as-is and only stack the following ones onto it.
            df_tot = df if df_tot.is_empty() else df_tot.vstack(df)
            shutil.move(src=source, dst=os.path.join(success_dir, file))


In [82]:
# Smoke test: parse one extracted .dat file and show its schema, column
# names and first rows.
source = "C:/Users/localadmin/Documents/git/thermo/data/tmp/tei49c-202108190910.dat"
df1 = read_tei49c_dat(source)
print(df1.schema)
print(df1.columns)
df1.head()
# source = "C:/Users/localadmin/Documents/git/thermo/data/tmp/tei49c-202108261710.dat"
# df2 = read_tei49c_dat(source)
# source = "C:/Users/localadmin/Documents/git/thermo/data/tmp/tei49c-202206010050.dat"
# df3 = read_tei49c_dat(source)

# df = pl.DataFrame()
# # df.vstack(df1, in_place=True)
# print(df.schema)
# print(df.columns)
# df.head()

# df.vstack(df2, in_place=True)
# print(df.schema)
# print(df.columns)
# df.head()

# df.vstack(df3, in_place=True)
# print(df.schema)
# print(df.columns)
# df.head()


PanicException: index out of bounds: the len is 0 but the index is 0

In [13]:
# Show the name of the third column; read_tei49c_dat tests this name for
# being empty to detect the malformed header of some older files.
df.columns[2]

''

In [None]:
# Print the schema of the first few tei49c members of the archive bound to
# `zfh`, looking inside nested zip archives as well.
# NOTE(review): assumes `zfh` is an *open* zipfile.ZipFile - the `with`
# statement that opens it is not part of this cell; confirm before running.
i = 0  # number of members inspected so far
for file in zfh.namelist():
    if i > 3:
        break
    if re.search(r"tei49c-\d+\.dat", file):
        with zfh.open(file, "r") as obj:
            print(file)
            df = pl.read_csv(obj)
            print(df.schema)
        i = i + 1
    elif re.search(r"tei49c-\d+\.zip", file):
        # A member stream has no .open(); a nested archive must be wrapped
        # in its own ZipFile. Buffer it in memory so it is seekable.
        with zfh.open(file, "r") as obj:
            inner_bytes = io.BytesIO(obj.read())
        with zipfile.ZipFile(inner_bytes, "r") as inner:
            print(file)
            inner_members = inner.namelist()
            if inner_members:
                with inner.open(inner_members[0], "r") as member:
                    df = pl.read_csv(member)
                    print(df.schema)
        i = i + 1
                # zfh.extract(zfiles[15])

        # with zipfile.ZipFile(os.path.join(root, "tei49c", zfiles[15].filename), "r") as zf:
        #     df = pl.read_csv(zf)



In [None]:

  # Fragment of an experiment: open one archive picked from `zfiles` and try
  # to load it with polars.
  # NOTE(review): the statement this indented body belongs to, and the
  # definition of `zfiles`, are not part of this cell - confirm the context.
  print("ok")
  name = os.path.join(root, zfiles[15].filename)
  with zipfile.ZipFile(name, "r") as zfh:
    # NOTE(review): `name` is the path of the archive itself, yet it is passed
    # to extract() where a member name is presumably expected - verify.
    df = pl.read_csv(zfh.extract(name))

In [None]:
# Path to the ZIP archive containing .tar.gz files
zip_file_path = 'path/to/your/archive.zip'

# Extract the .tar.gz file from the ZIP archive
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Assuming you know the name of the .tar.gz file you want to extract
    # Replace 'your_file.tar.gz' with the actual file name
    tar_gz_file_name = 'your_file.tar.gz'
    # extract() returns the path it actually wrote to (member names are
    # normalised on extraction), so open that path rather than the raw
    # member name.
    tar_gz_path = zip_ref.extract(tar_gz_file_name)

# Open the .tar.gz file and read its contents
with tarfile.open(tar_gz_path, 'r:gz') as tar:
    # Assuming 'A11.csv' is the name of the CSV file you want to read
    # Replace it with the actual file name if necessary
    csv_file_name = 'A11.csv'
    csv_file = tar.extractfile(csv_file_name)
    # extractfile() returns None for members that are not regular files;
    # fail with a clear message instead of an AttributeError on .read().
    if csv_file is None:
        raise ValueError(f"{csv_file_name} in {tar_gz_file_name} is not a regular file")

    # Read the CSV file into a Pandas DataFrame. Nothing is skipped by
    # default: pass skiprows= / header= to pd.read_csv to drop a header or
    # extra leading lines.
    pd_df = pd.read_csv(io.BytesIO(csv_file.read()))

    # Now convert the Pandas DataFrame into a Polars DataFrame
    pl_df = pl.from_pandas(pd_df)

# You can now work with the 'pl_df' Polars DataFrame as needed


Make sure to replace `'path/to/your/archive.zip'` with the actual path to your `.zip` archive and adjust the file names as necessary for your specific use case. This code will extract the `.tar.gz` file, read the CSV file from it into a Pandas DataFrame, and then convert it to a Polars DataFrame.