In [19]:
# DESCRIPTION:
# Generalize inspect.ipynb to create a parent data frame for all SOLs available

In [20]:
# Imports
from IPython.display import display
import numpy as np
import pandas as pd
import os

In [21]:
# Dictionary generating function for weather columns
def create_weather_dict(series, descriptive_stat):
    """Build the labelled weather-field dictionary for one descriptive statistic.

    Every value is looked up in ``series`` by integer key (0 through 7) and
    stored under a label that is prefixed with ``descriptive_stat``.

    :param series: <class 'pandas.core.series.Series'> holding the eight values, keyed 0-7.
    :param descriptive_stat: <class 'str'> label prefix, e.g. "Minimum", "Mean" or "Maximum".
    :return: <class 'dict'> mapping "<descriptive_stat> <field name>" to the value read from series.
    """
    # (field label, key into series) pairs in output order. The seven features come
    # first; the output (ambient air temperature) is read from key 5 but listed last.
    field_keys = (
        ("Wind Speed (m/s)", 0),
        ("Wind Direction (degrees)", 1),
        ("Ground Temperature (K)", 2),
        ("Air Temperature (K) for Boom 1", 3),
        ("Air Temperature (K) for Boom 2", 4),
        ("Local Relative Humidity (%)", 6),
        ("Atmospheric Pressure (Pa)", 7),
        ("Ambient Air Temperature (K)", 5),
    )

    return {f"{descriptive_stat} {label}": series[key] for label, key in field_keys}

In [22]:
# List of files in data directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the row order of everything derived from this list reproducible between runs.
# NOTE(review): data_dir is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
data_dir = "C:\\Dev\\python\\MachineLearning\\BMI\\data\\raw_data"
data_file_list = sorted(os.listdir(data_dir))

In [23]:
# List of sols in directory: for every file name, keep the four characters that
# immediately follow the "RMD" marker. sol_list is index-aligned with data_file_list.
# NOTE(review): str.find returns -1 when "RMD" is absent, in which case the slice
# silently falls back to characters 2-5 of the file name.
sol_starts = [file_name.find("RMD") + 3 for file_name in data_file_list]
sol_list = [
    file_name[begin:begin + 4]
    for file_name, begin in zip(data_file_list, sol_starts)
]

In [24]:
# Parent dataframe: starts out empty; once the aggregation loop over data_file_list
# has run, it holds one row of min/mean/max descriptive statistics per SOL file.
parent_df = pd.DataFrame()

In [25]:
# Read through all files in data_dir, reduce each one to a single row of
# min/mean/max statistics, then build parent_df from all rows in one step.
#
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# appending inside a loop copies the whole frame on every iteration. Collecting
# plain dict records and constructing the frame once avoids both problems and
# makes this cell idempotent (re-running it no longer duplicates rows).

# Positional columns of the raw file that are kept. After reset_index(drop=True)
# they become keys 0-7, which create_weather_dict labels as: wind speed, wind
# direction, ground temperature, air temperature (boom 1), air temperature
# (boom 2), ambient air temperature, relative humidity, atmospheric pressure.
relevant_columns = [3, 5, 7, 11, 13, 15, 30, 37]

sol_records = []
for index, data_file in enumerate(data_file_list):
    # Encompassing <class 'pd.DataFrame'> for all columns in data
    data_path = os.path.join(data_dir, data_file)
    df = pd.DataFrame(np.genfromtxt(data_path, delimiter=","))

    # <class 'pd.DataFrame'> for only relevant features/output
    new_df = df[relevant_columns]

    # Column-wise descriptive statistics as <class 'pd.Series'>.
    # reset_index(drop=True) replaces the [3, 5, 7, ...] labels with 0-7 so that
    # create_weather_dict can look the values up by those keys.
    min_series = new_df.min().reset_index(drop=True)
    mean_series = new_df.mean().reset_index(drop=True)
    max_series = new_df.max().reset_index(drop=True)

    # One flat record per file: all three statistic dictionaries plus the SOL field
    sol_records.append({
        **create_weather_dict(min_series, "Minimum"),
        **create_weather_dict(mean_series, "Mean"),
        **create_weather_dict(max_series, "Maximum"),
        "SOL": sol_list[index],
    })

# A list of dicts yields one row per record with a 0..n-1 index, matching what
# the repeated append(..., ignore_index=True) used to produce.
parent_df = pd.DataFrame(sol_records)

In [33]:
# Information about the data: summary statistics first, then the number of
# missing values per column, separated by two blank lines.
summary_stats = parent_df.describe()
missing_counts = parent_df.isna().sum()

display(summary_stats)
print("\n")  # "\n" plus print's own newline gives the two blank lines
display(missing_counts)

Unnamed: 0,Minimum Wind Speed (m/s),Minimum Wind Direction (degrees),Minimum Ground Temperature (K),Minimum Air Temperature (K) for Boom 1,Minimum Air Temperature (K) for Boom 2,Minimum Local Relative Humidity (%),Minimum Atmospheric Pressure (Pa),Minimum Ambient Air Temperature (K),Mean Wind Speed (m/s),Mean Wind Direction (degrees),...,Mean Atmospheric Pressure (Pa),Mean Ambient Air Temperature (K),Maximum Wind Speed (m/s),Maximum Wind Direction (degrees),Maximum Ground Temperature (K),Maximum Air Temperature (K) for Boom 1,Maximum Air Temperature (K) for Boom 2,Maximum Local Relative Humidity (%),Maximum Atmospheric Pressure (Pa),Maximum Ambient Air Temperature (K)
count,1394.0,1397.0,2710.0,2710.0,2710.0,2704.0,2711.0,2710.0,1394.0,1397.0,...,2711.0,2710.0,1394.0,1397.0,2710.0,2710.0,2710.0,2704.0,2711.0,2710.0
mean,3.74208,31.931625,152.816077,198.323199,109.634292,0.870847,784.982062,197.317066,6.304692,168.234235,...,829.760166,223.308457,9.383737,326.549943,275.212018,262.410402,262.962768,23.518362,866.562243,258.874114
std,0.772088,32.37433,25.064603,6.491262,28.709935,0.337284,56.259129,6.406104,1.090384,68.639513,...,58.406511,8.257959,2.40232,34.169267,9.161776,12.135932,11.7123,15.831049,58.62516,11.050283
min,0.96,0.02,72.17,182.53,75.05,0.0,662.09,124.59,3.548889,15.627636,...,702.118063,198.601546,4.23,137.5,211.89,213.81,214.34,0.0,706.18,213.81
25%,3.2,7.1,138.0225,193.6525,91.6175,0.61,741.565,193.24,5.598396,115.738889,...,780.463594,216.489194,7.98,307.33,267.7025,252.05,252.015,11.76,815.91,248.8475
50%,3.71,26.2,158.215,198.46,99.52,0.83,796.5,197.325,6.269697,157.292391,...,844.983629,223.87163,9.145,337.24,275.885,263.505,263.695,19.08,883.56,260.0
75%,4.19,50.68,170.6175,202.91,118.7125,1.12,830.945,201.755,6.998407,221.751159,...,874.922395,229.431363,10.44,353.38,282.495,272.735,273.055,29.8725,910.515,268.14
max,7.18,330.24,270.1,274.48,259.9,6.15,921.39,259.9,10.317833,341.09,...,934.730147,267.976294,21.79,359.99,297.53,288.96,286.69,104.43,969.65,285.81






Minimum Wind Speed (m/s)                  1319
Minimum Wind Direction (degrees)          1316
Minimum Ground Temperature (K)               3
Minimum Air Temperature (K) for Boom 1       3
Minimum Air Temperature (K) for Boom 2       3
Minimum Local Relative Humidity (%)          9
Minimum Atmospheric Pressure (Pa)            2
Minimum Ambient Air Temperature (K)          3
Mean Wind Speed (m/s)                     1319
Mean Wind Direction (degrees)             1316
Mean Ground Temperature (K)                  3
Mean Air Temperature (K) for Boom 1          3
Mean Air Temperature (K) for Boom 2          3
Mean Local Relative Humidity (%)             9
Mean Atmospheric Pressure (Pa)               2
Mean Ambient Air Temperature (K)             3
Maximum Wind Speed (m/s)                  1319
Maximum Wind Direction (degrees)          1316
Maximum Ground Temperature (K)               3
Maximum Air Temperature (K) for Boom 1       3
Maximum Air Temperature (K) for Boom 2       3
Maximum Local

In [1]:
# Write unclean data to csv (path is built relative to the current working directory).
# Requires parent_df from the aggregation cell; running this cell on a fresh kernel
# raises NameError, so execute the notebook top to bottom.
unclean_csv_path = os.path.join(os.getcwd(), '../data/unclean.csv')
parent_df.to_csv(unclean_csv_path, index=False)

NameError: name 'parent_df' is not defined