# Library Import
This section imports all the necessary libraries required for data processing, feature extraction, and cloud interaction.

In [12]:
import pandas as pd
import numpy as np
from utils.feature_extraction import *
import copy
from keras.models import load_model

# Function to allow a custom prediction on our website
- This function remove any null data from the csv that we'll get from the user **(Attention point from the challenge webpage)**
- The function also verify if the user's csv have the minimum fields to generate a prediction

## Creating aditional features for the output catalog that the webpage'll show on the user's screen:
- identified_arrival_time_rel(sec): time_rel(s) in the first row considered an earthquake
- detection_duration(sec): time_rel(s) in the last row of the entire subset of a record
- selection_duration(sec): detection_duration(sec) - identified_arrival_time_rel(sec)
- features_at_detection: the features in the first row considered an earthquake
- file_original_size(kb): pandas.memory_usage() method on the complete subset of a detection
- file_selection_size(kb): pandas.memory_usage() method on the subset trimmed from the detection index to the end
- original_broadcast: file_original_size(kb) / transmission rate
- selection_broadcast: file_selection_size(kb) / transmission rate

- Data transmission rates:

    - Apollo 12: 51.2 kbps
    - Apollo 15: 85.6 kbps
    - Apollo 16: 85.6 kbps
    - InSight: 256 kbps

In [13]:
def predict(filename):
    import json
    #### Colocar aqui o recebimento do CSV e transformar ele em dataframe:
    df_data_csv = pd.read_csv('./example_data/'+filename+'.csv') # pegar da api
    
    sampling_rate = 6.625 #pegar da api, é um valor float
    
    columns_to_check = ['time_rel(sec)', 'velocity(m/s)']
    missing_columns = [col for col in columns_to_check if col not in df_data_csv.columns]
    df_data_csv.dropna(inplace=True)
    
    if len(missing_columns)>0:
        return "Error, time_rel(sec) and velocity(m/s) columns are missing on the csv"

    # Concatenando novas features
    df_data_csv['filename']=filename
    features = process_seismic_data(df_data_csv, sampling_rate)
    df_data_csv['mean_velocity']= features['mean_velocity']
    df_data_csv['std_velocity']= features['std_velocity']
    df_data_csv['max_velocity']= features['max_velocity']
    df_data_csv['min_velocity']= features['min_velocity']
    df_data_csv['total_energy']= features['total_energy']
    df_data_csv['rms_value']= features['rms_value']
    df_data_csv['peak_count']= features['peak_count']
    df_data_csv['valley_count']= features['valley_count']
    df_data_csv['fft_values']= features['fft_values']
    df_data_csv['fft_freqs']= features['fft_freqs']
    df_data_csv['autocorrelation']= features['autocorrelation']
    df_data_csv['acceleration']= features['acceleration']
    df_data_csv['jerk']= features['jerk']
    df_data_csv['cumulative_energy']= features['cumulative_energy']
    
    
    temp = copy.deepcopy(df_data_csv)
    temp.drop(['time_abs(%Y-%m-%dT%H:%M:%S.%f)','filename'],axis=1,inplace=True)
    
    model = load_model('./model/best_model_nasa.keras')
    y_pred=model.predict(temp,verbose=2)
    df_data_csv['y_pred'] = (y_pred > 0.5).astype(int)  # Convert to 0 or 1

    #Generating output features
    result = df_data_csv[df_data_csv['y_pred'] == 1].groupby('filename', as_index=False).nth(0).reset_index()
    memory_usage_per_group = df_data_csv[df_data_csv['y_pred'] == 1].groupby('filename').apply(lambda group: group.memory_usage(deep=True).sum())
    result['file_selection_size(kb)'] = result['filename'].map(memory_usage_per_group)
    
    features = result.drop(['file_selection_size(kb)'],axis=1)
    
    json_list = []
    for index, row in features.iterrows():
        row_dict = row.to_dict()
        
        # Convert complex numbers to strings
        for key, value in row_dict.items():
            if isinstance(value, complex):
                row_dict[key] = str(value)
        
        json_list.append(json.dumps(row_dict))
        
    result = result[['filename', 'time_rel(sec)','velocity(m/s)','index','file_selection_size(kb)']].rename(columns={"index":"index_predict","time_rel(sec)":"identified_arrival_time_rel(sec)"})
    
    result2 = df_data_csv.groupby('filename', as_index=False).tail(1).reset_index()[['filename','index', 'time_rel(sec)']].rename(
        columns={"index": "index_tail", "time_rel(sec)": "detection_duration(sec)"}
    )
    memory_usage_per_group = df_data_csv.groupby('filename').apply(lambda group: group.memory_usage(deep=True).sum())
    result2['file_original_size(kb)'] = result2['filename'].map(memory_usage_per_group)

    result3 = pd.concat([result, result2.drop(['filename'],axis=1)],axis=1)
    result3['selection_duration'] = result3['detection_duration(sec)'] - result3['identified_arrival_time_rel(sec)']
    result3['features']=json_list
    conditions = [
        result3['filename'].str.contains('s12'),
        result3['filename'].str.contains('s15'),
        result3['filename'].str.contains('s16')
    ]
    values = [51.2, 85.6, 85.6]
    result3['transmission_speed'] = np.select(conditions, values, default=256)
    result3['original_broadcast'] = result3['file_original_size(kb)'] / result3['transmission_speed']
    result3['selection_broadcast'] = result3['file_selection_size(kb)'] / result3['transmission_speed']
    result3.drop(['index_predict','index_tail','transmission_speed'],inplace=True,axis=1)
    return result3

In [14]:
filename = "xa.s12.00.mhz.1970-01-19HR00_evid00002" #Receber o nome do arquivo da api
result = predict(filename) #Pode passar tanto o filename quanto ajustar para já passar o dataframe direto, o que for melhor
result

  return arr.astype(dtype, copy=True)


17888/17888 - 17s - 934us/step


  memory_usage_per_group = df_data_csv[df_data_csv['y_pred'] == 1].groupby('filename').apply(lambda group: group.memory_usage(deep=True).sum())
  memory_usage_per_group = df_data_csv.groupby('filename').apply(lambda group: group.memory_usage(deep=True).sum())


Unnamed: 0,filename,identified_arrival_time_rel(sec),velocity(m/s),file_selection_size(kb),detection_duration(sec),file_original_size(kb),selection_duration,features,original_broadcast,selection_broadcast
0,xa.s12.00.mhz.1970-01-19HR00_evid00002,41071.849057,-1.346839e-10,97902364,86402.113208,186607290,45330.264151,"{""index"": 272101, ""time_abs(%Y-%m-%dT%H:%M:%S....",3644674.0,1912156.0
