# Importing Libraries

The FitnessGram™ Pacer Test is a multistage aerobic capacity test that progressively gets more difficult as it continues. The 20 meter pacer test will begin in 30 seconds. Line up at the start. The running speed starts slowly, but gets faster each minute after you hear this signal. <i>beep</i> A single lap should be completed each time you hear this sound. <i>ding</i> Remember to run in a straight line, and run as long as possible. The second time you fail to complete a lap before the sound, your test is over. The test will begin on the word start. <b>On your mark, get ready, start.</b>

In [None]:
import ase.io # for reading *.xyz
import pandas as pd # for dataframes?
import matplotlib.pyplot as plt # for plotting error distribution

# Setting Variables and Functions

In [None]:
# Path to the extended-XYZ file holding the structures and their energy errors.
# It is a relative path, so it is resolved against the kernel's working directory.
DATASET_LOCATION = "./data_structures_errors_head.xyz"

In [None]:
def sort_df(df: pd.DataFrame, sort_method: str) -> None:
    """Plot the energy error against one column, ordered by that column.

    A sorted copy of ``df`` is made (the caller's frame is left untouched) and
    its ``"energy (error)"`` values are drawn as a connected line with a marker
    on every row, together with a dashed red reference line at zero error.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by ``sort_method`` and an
        ``"energy (error)"`` column.
    sort_method : str
        Name of the column to sort by and to use as the x-axis. Its
        capitalised form is used as the x-axis label.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``; nothing is returned.
    """
    error_column = "energy (error)"
    df_sorted = df.sort_values(by=[sort_method])

    # Largest absolute error, computed once; it defines y-limits that are
    # symmetric around zero with a 10 % margin.
    max_abs_error = df[error_column].abs().max()

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.set_xlabel(sort_method.capitalize())
    ax.set_ylabel("Energy (Error)")

    # An empty frame or an all-NaN column yields NaN here, and all-zero errors
    # yield 0. Neither gives usable explicit limits, so fall back to
    # matplotlib's autoscaling in those cases instead of passing them on.
    if 0 < max_abs_error < float("inf"):
        ax.set_ylim(-1.1 * max_abs_error, 1.1 * max_abs_error)

    ax.plot(df_sorted[sort_method], df_sorted[error_column])
    ax.scatter(df_sorted[sort_method], df_sorted[error_column])
    ax.axhline(y=0, color='r', linestyle='--')  # zero-error reference

    plt.show()

# Reading and Importing Dataset

In [None]:
# ase.io.iread returns a generator over the frames of the *.xyz file; list()
# materialises it so the frames can be traversed several times below.
dataset_list = list(ase.io.iread(filename=DATASET_LOCATION, format="extxyz"))
df = pd.DataFrame(dataset_list)

# Derived per-structure columns: one value per frame, in file order.
# Iterate over the frames directly rather than indexing by position.
df["number of atoms"] = [atoms.get_global_number_of_atoms() for atoms in dataset_list]
df["chemical formula"] = [atoms.get_chemical_formula() for atoms in dataset_list]
df["Lattice (unit cell with three vectors)"] = [atoms.get_cell()[:] for atoms in dataset_list]
df["volume of unit cell (derived from Lattice)"] = [atoms.get_cell().volume for atoms in dataset_list]
# The "energy" entry of each frame's info dict is stored as the energy error.
df["energy (error)"] = [atoms.info["energy"] for atoms in dataset_list]
# The "Properties" field is not important here; pbc is "T T T" for every frame.

display(df.head())

# possible way of extracting data out from Atom object
# print(df[0][0])
# print(df[0][0].symbol)
# print(df[0][0].position)

# Exploratory Analysis

In [None]:
# Summary statistics first, then one error plot per structural descriptor.
display(df.describe())
for descriptor in ("number of atoms", "volume of unit cell (derived from Lattice)"):
    sort_df(df, descriptor)


: 