# Dataframes

## Importing libraries

In [None]:
''' Required if dataframes should be used '''
import pandas as pd

''' Not required but good to have '''
import numpy as np

## Loading files into memory

A variety of file formats can be imported into Python. To ensure consistency between different users and operating systems, .csv is the usual choice for data in sheet format (a 2D array). pandas is commonly used for this step. NumPy offers similar functionality, but the two libraries handle the data very differently, so keep this in mind!

In [None]:
# Read the CSV file into a dataframe ('path/to/file.csv' is a placeholder path)
dataframe = pd.read_csv(filepath_or_buffer='path/to/file.csv')

Print out the dataframe to get a rough idea of whether the data has been successfully imported.

In [None]:
# Show the dataframe. `display` is not defined in this file — presumably the
# IPython/Jupyter built-in; confirm when running outside a notebook.
display(dataframe)

Set a column as the index. This prevents the column from being altered, and it is carried over to all sublists. It is useful to think of the index as identifying the "thing" to which all data in the same row relate.

In [None]:
# Promote the column named 'index' to the row index; the result is re-assigned
# to the same name.
dataframe = dataframe.set_index(keys='index')

## Manipulating Dataframes

Manipulating dataframes works differently from NumPy arrays, which both expands their functionality and makes them a bit harder to work with.

In [None]:
''' Initializing a dataframe '''
# Each dict key becomes a column name; every column holds six values.
dataframe = pd.DataFrame(
    data={
        "x1": ["x", "y", "x", "y", "x", "x"],
        "x2": range(15, 21),
        "x3": ["a", "b", "c", "d", "e", "f"],
        "x4": range(20, 8, -2),
    }
)

''' Creating a copy (Do this before major alterations)'''
# Deep copy: later edits to dataframe_modified leave `dataframe` untouched.
dataframe_modified = dataframe.copy(deep=True)

''' Removing columns '''
# NOTE: the column names used in this cell are placeholders — substitute labels
# that exist in the frame, otherwise drop() raises a KeyError.
dataframe_modified = dataframe_modified.drop('column name', axis = 1) # older spelling: label plus axis = 1 (columns)
dataframe_modified = dataframe_modified.drop(columns = ['column name 1', 'column name 2']) # cleaner spelling: columns= is equivalent to passing the labels with axis = 1

''' Add new column with data '''
# placeholder values; the list needs exactly one entry per row of the dataframe
new_list = [a, b, c, d]

# this method always adds the new column at the end
dataframe_modified['new column name'] = new_list

# this method inserts the new column at integer position `pos` (placeholder);
# the final positional argument is allow_duplicates, so True permits a column
# name that already exists in the frame
dataframe_modified.insert(pos, 'new column name', new_list, True)

''' Merging two frames by column '''
dataframe_merged = pd.merge(dataframe,                                # Merge two pandas DataFrames (new_dataframe is a placeholder)
                      new_dataframe,
                      on = "column/index with matching values",       # key to join on; if None, pandas joins on the columns common to both frames
                      how = "outer")                                  # keeps the keys of both frames; be careful with this and read up on SQL merging techniques

''' Rename columns '''
# replaces ALL column labels at once, so the list must contain exactly one name per column
dataframe_modified.columns = ['column name 1', 'column name 2', 'column name 3']

''' Remove row '''
# Boolean mask: keeps only the rows whose value in column1 differs from
# "condition", i.e. removes the rows where "condition" is met.
dataframe_modified = dataframe_modified[dataframe_modified.column1 != "condition"]

''' Add new row '''
# placeholder values; the list needs exactly one entry per column
new_row = [a, b, c, d]
# .loc assignment modifies the frame in place, so nothing is re-assigned here.
# (A chained `x = df.loc[6] = new_row` would bind x to the plain list and write
# the row into the wrong frame.) 6 is the index label of the new row; if that
# label already exists, the existing row is overwritten instead.
dataframe_modified.loc[6] = new_row

''' Merge two frames by rows '''
data_append = pd.concat([dataframe,                             # Append DataFrames (new_rows_dataframe is a placeholder)
                        new_rows_dataframe],
                        ignore_index = True,                    # discard the old row labels and renumber 0..n-1
                        sort = False)                           # do not sort the columns when the two frames differ

''' Sort rows '''
# sort_values returns a new, sorted frame; the source frame keeps its order
dataframe_sorted = dataframe_modified.sort_values("column by which to sort")

''' Replace data '''
# Every occurrence of the first value anywhere in the frame is swapped for the second.
dataframe_modified = dataframe_modified.replace(
    to_replace='value to be replaced',
    value='what it should be replaced with',
)

''' drop NaN values '''
# axis="index" with how="any": discard every row that contains at least one NaN
dataframe_modified = dataframe_modified.dropna(axis="index", how="any")

# CBFV

Composition-based feature vectors (CBFV): a lookup table is used to generate compositional vectors based on the "chemical" formula.

## Extract Formula in compliant format

In [None]:
# Name of the column that holds the parsed formula strings
parse_form_str = 'COMP_PARSED FORMULA'

'CBFV features (composition-based feature vector)'
# Copy the formula column, then move the index into a regular column so that
# both are available as columns of df_temp.
df_temp = df_import[parse_form_str].copy().reset_index()

# Preview the first rows
df_temp.head()

## Execute the CBFV function

In [None]:
# Generate the feature table: formulas come from the parse_form_str column,
# the 'group' column supplies the target that ends up as the index.
feat_CBFV = gen_CBFV(df=df_temp, form_col=parse_form_str, target_col='group')
display(feat_CBFV)

## Clean up unknown values

In [None]:
'drop NaN which where created in the feature generation'
# Discard every row that contains at least one NaN value.
feat_CBFV = feat_CBFV.dropna(axis="index", how="any")

## restrict features and CTE to overlapping entries

In [None]:
# Print both shapes first so the effect of the restriction can be judged.
print(df_import.shape, feat_CBFV.shape)
# Keep only the rows of df_import whose label occurs in the feature table's index.
df_labeled = material_group_select(df=df_import, group_list=feat_CBFV.index)

## Functions

Function that generates the CBFV features (X and y are returned separately) and combines X and y into a single dataframe.

In [None]:
def gen_CBFV(df,form_col,target_col):
    """Generate composition-based features and index them by the target values.

    A copy of ``df`` is made in which ``form_col`` is renamed to 'formula' and
    ``target_col`` to 'target'. The copy is handed to
    ``composition.generate_features``. ``composition`` is not defined in this
    file — presumably the CBFV package's module; confirm the import.

    Args:
        df: Input dataframe; it is copied, not modified.
        form_col: Name of the column holding the formula strings.
        target_col: Name of the column holding the target values.

    Returns:
        A copy of the generated feature frame with the returned target values
        stored under ``target_col`` and set as the index.
    """
    renamed = df.copy().rename(columns={form_col: 'formula', target_col: 'target'})

    features, targets, _formulae, skipped = composition.generate_features(
        renamed,
        elem_prop='oliynyk',
        drop_duplicates=False,
        extend_features=False,
        sum_feat=True,
    )
    # Only the first 15 skipped entries are shown to keep the output short.
    print(f'\n the skipped elements are: {skipped[:15]}...')

    # Re-attach the targets to the features and use them as row labels.
    combined = features.copy()
    combined[target_col] = targets
    return combined.set_index(target_col)

Comparing two sets of data and adjusting them to the same length

In [None]:
def material_group_select(df,group_list):
    """Return the rows of ``df`` whose index labels are listed in ``group_list``.

    The selection is taken from a copy of ``df``, so the input frame is left
    unchanged. The row-count difference between the input and the selection is
    printed via ``compare_rows``.

    Args:
        df: Dataframe to restrict.
        group_list: Index labels to select; passed directly to ``.loc``.

    Returns:
        The restricted dataframe.
    """
    selected = df.copy().loc[group_list]

    print('By restricting dataframe to selected material list:')
    compare_rows(df, selected)
    return selected

def compare_rows(df_before,df_after):
    """Print how many rows ``df_after`` has fewer than ``df_before``.

    Only ``shape[0]`` of both arguments is read. Nothing is returned.
    """
    rows_before, rows_after = df_before.shape[0], df_after.shape[0]
    print(f'\t you deleted {rows_before - rows_after} entries')
