In [2]:
import numpy as np
import pandas as pd
import sklearn
import keras

Using TensorFlow backend.


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import average_precision_score, f1_score, precision_recall_curve

from scipy.optimize import minimize_scalar

In [4]:
def get_week_data_filename_QD(week_number, simulated_org_number, head_folder_name = None):
    """Build the path of one organisation's CSV file for one week.

    Parameters
    ----------
    week_number : int or str
        Week identifier; used verbatim as the name of the sub-folder.
    simulated_org_number : int or str
        Organisation identifier; appears in the file name as ``org (<n>).csv``.
    head_folder_name : str, optional
        Root folder that holds one sub-folder per week. When omitted, the
        original hard-coded location is used, so existing two-argument
        calls behave exactly as before.

    Returns
    -------
    str
        ``<head_folder_name>/<week_number>/org (<simulated_org_number>).csv``
    """
    if head_folder_name is None:
        # Historical default; pass head_folder_name to run on another machine.
        head_folder_name = "C:/Users/Mimran/OneDrive - George Mason University/C4I PC Backup/SCITE/RCPs Fifth Quarter/RCP13/Dev/July 12 PopSyn"
    return "{}/{}/org ({}).csv".format(head_folder_name,
                                       week_number,
                                       simulated_org_number)

In [5]:
def read_week_data_QD(week_number, simulated_org_number):
    """Load one week's CSV for an organisation and drop incomplete rows.

    Weeks before 34 are read in full; from week 34 onwards only the columns
    at positions 1 through 66 are read. Infinite values are treated as
    missing, and every row containing a missing value is removed.

    Note: this assumes users appear in the same order in every weekly file;
    no matching of users across weeks is performed under that assumption.
    """
    csv_path = get_week_data_filename_QD(week_number, simulated_org_number)
    # usecols=None is pandas' "read every column" default, used for early weeks.
    wanted_columns = None if week_number < 34 else list(range(1, 67))
    raw_week = pd.read_csv(csv_path, usecols = wanted_columns)
    return raw_week.replace([-np.inf, np.inf], np.nan).dropna()

In [6]:
# Column names of week 34 / organisation 4, taken as the reference column layout.
feature_list = read_week_data_QD(34, 4).columns.tolist()

In [7]:
# Everything after the first column is a detector (65 names in the output below).
# NOTE(review): the skipped first column is presumably the 'Target' label - confirm.
det_names = feature_list[1:]
det_names

['X001a',
 'X001b',
 'X001c',
 'X014a',
 'X015a',
 'X021a',
 'X021d',
 'X021e',
 'X021f',
 'X021g',
 'X021h',
 'X021i',
 'X021j',
 'X022a',
 'X022d',
 'X022e',
 'X022f',
 'X022g',
 'X022h',
 'X022i',
 'X022j',
 'X027a',
 'X027d',
 'X027e',
 'X027f',
 'X027g',
 'X027h',
 'X027i',
 'X027j',
 'X028a',
 'X028d',
 'X028e',
 'X028f',
 'X028g',
 'X028h',
 'X028i',
 'X028j',
 'X029a',
 'X030a',
 'X031a',
 'X032a',
 'X033a',
 'X034a',
 'X035a',
 'X036a',
 'X037a',
 'X038a',
 'X039a',
 'X040a',
 'X041a',
 'X042a',
 'X043a',
 'X044a',
 'X045a',
 'X046a',
 'X047a',
 'X048a',
 'X049a',
 'X050a',
 'X051a',
 'X052a',
 'X053a',
 'X058a',
 'X059a',
 'X060a']

In [8]:
# Early-stopping callback shared by every fit below: watches validation loss,
# counts a change of at least 1e-5 as an improvement, and uses a patience of 2 epochs.
ES = keras.callbacks.EarlyStopping(monitor = 'val_loss', min_delta = 10 ** (-5), patience = 2)

def create_keras_model(n_features = 325, hidden_units = 325):
    """Build and compile the binary classifier used throughout the notebook.

    The network has two tanh hidden layers followed by a single sigmoid
    output, and is compiled with SGD and binary cross-entropy.

    Parameters
    ----------
    n_features : int, optional
        Width of the input vector. The default of 325 matches the
        65 detectors x 5 time periods built by this notebook.
    hidden_units : int, optional
        Width of each of the two hidden layers (325 by default, as before).

    Returns
    -------
    keras.models.Sequential
        A freshly initialised, compiled (untrained) model.
    """
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(hidden_units, input_shape = (n_features,), activation='tanh'))
    model.add(keras.layers.Dense(hidden_units, activation='tanh'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer = 'sgd', loss = 'binary_crossentropy')
    return model

def get_model(S1_x, S1_y, S2_x, S2_y, test_class_weight):
    """Train a fresh network on S1 and return the fitted model.

    S2 is used as validation data for the shared early-stopping callback.
    Class 0 is weighted 1 and class 1 is weighted ``test_class_weight``.
    Training is capped at 100 epochs and runs silently.
    """
    fit_options = dict(callbacks = [ES],
                       validation_data = (S2_x, S2_y),
                       epochs = 100,
                       class_weight = {0: 1, 1: test_class_weight},
                       verbose = 0)
    network = create_keras_model()
    network.fit(S1_x, S1_y, **fit_options)
    return network

def get_model_predictions(S1_x, S1_y, S2_x, S2_y, test_class_weight, S3_x):
    """Train a fresh network on S1 and return its raw predictions for S3_x.

    Same training setup as ``get_model`` (S2 as validation data, early
    stopping, at most 100 epochs, class 1 weighted by ``test_class_weight``),
    except that Keras' default progress output is left on.
    """
    fit_options = dict(callbacks = [ES],
                       validation_data = (S2_x, S2_y),
                       epochs = 100,
                       class_weight = {0: 1, 1: test_class_weight})
    network = create_keras_model()
    network.fit(S1_x, S1_y, **fit_options)
    return network.predict(S3_x)

In [9]:
# Feature-column order fed to the model: detector-major, so the five time
# periods (t-2 ... t+2) of one detector sit next to each other.
det_names_in_correct_order = [detector + time_period for detector in det_names for time_period in ["_t-2", "_t-1", "_t", "_t+1", "_t+2"] ]
det_names_in_correct_order

['X001a_t-2',
 'X001a_t-1',
 'X001a_t',
 'X001a_t+1',
 'X001a_t+2',
 'X001b_t-2',
 'X001b_t-1',
 'X001b_t',
 'X001b_t+1',
 'X001b_t+2',
 'X001c_t-2',
 'X001c_t-1',
 'X001c_t',
 'X001c_t+1',
 'X001c_t+2',
 'X014a_t-2',
 'X014a_t-1',
 'X014a_t',
 'X014a_t+1',
 'X014a_t+2',
 'X015a_t-2',
 'X015a_t-1',
 'X015a_t',
 'X015a_t+1',
 'X015a_t+2',
 'X021a_t-2',
 'X021a_t-1',
 'X021a_t',
 'X021a_t+1',
 'X021a_t+2',
 'X021d_t-2',
 'X021d_t-1',
 'X021d_t',
 'X021d_t+1',
 'X021d_t+2',
 'X021e_t-2',
 'X021e_t-1',
 'X021e_t',
 'X021e_t+1',
 'X021e_t+2',
 'X021f_t-2',
 'X021f_t-1',
 'X021f_t',
 'X021f_t+1',
 'X021f_t+2',
 'X021g_t-2',
 'X021g_t-1',
 'X021g_t',
 'X021g_t+1',
 'X021g_t+2',
 'X021h_t-2',
 'X021h_t-1',
 'X021h_t',
 'X021h_t+1',
 'X021h_t+2',
 'X021i_t-2',
 'X021i_t-1',
 'X021i_t',
 'X021i_t+1',
 'X021i_t+2',
 'X021j_t-2',
 'X021j_t-1',
 'X021j_t',
 'X021j_t+1',
 'X021j_t+2',
 'X022a_t-2',
 'X022a_t-1',
 'X022a_t',
 'X022a_t+1',
 'X022a_t+2',
 'X022d_t-2',
 'X022d_t-1',
 'X022d_t',
 'X022d_

In [10]:
# Names of the generated trait columns: trait_4, trait_6, ..., trait_20.
all_trait_names = ["trait_{}".format(trait_num) for trait_num in range(4, 21, 2)]

In [11]:
def read_org_data(simulated_org_number, first_full_week, last_full_week):
    """Assemble five-week sliding-window feature rows for one organisation.

    For every week ``t`` from ``first_full_week`` to ``last_full_week``
    (inclusive), the weekly frames for t-2 ... t+2 are placed side by side,
    each column suffixed with its period ("_t-2", "_t-1", "_t", "_t+1",
    "_t+2"). Rows with any missing value are dropped, and all weeks are then
    stacked vertically.

    Returns a DataFrame whose columns are ``det_names_in_correct_order``
    followed by the five suffixed Target columns.
    """
    period_suffixes = ["_t-2", "_t-1", "_t", "_t+1", "_t+2"]
    target_columns = ["Target" + suffix for suffix in period_suffixes]

    # Prime the window with weeks t-2 ... t+1 of the first full week; the
    # loop then appends week t+2 and discards the oldest week each step.
    window = [read_week_data_QD(week, simulated_org_number)
              for week in range(first_full_week - 2, first_full_week + 2)]
    stacked_weeks = []
    for centre_week in range(first_full_week, last_full_week + 1):
        window.append(read_week_data_QD(centre_week + 2, simulated_org_number))
        wide_week = pd.concat([week_df.add_suffix(suffix)
                               for week_df, suffix in zip(window, period_suffixes)],
                              axis = 1)
        stacked_weeks.append(wide_week.dropna())
        window.pop(0)
    return pd.concat(stacked_weeks)[det_names_in_correct_order + target_columns]

In [12]:
def split_training_data(sample_training_data_df, random_state = None):
    """Split the training frame into S1 (fit), S2 (validation) and S3 (threshold tuning).

    S3 is a stratified 20% hold-out. The remaining 80% is standardised and
    then split again, stratified, with 12.5% going to S2 - i.e. roughly
    70% / 10% / 20% of the rows for S1 / S2 / S3. The scaler is fitted on
    S1+S2 only and applied to S3.

    Parameters
    ----------
    sample_training_data_df : pandas.DataFrame
        Must contain the five suffixed ``Target_*`` columns; every other
        column is used as a feature. ``Target_t`` is the label.
    random_state : int, RandomState instance or None, optional
        Forwarded to both ``train_test_split`` calls. The default ``None``
        keeps the original unseeded behaviour; pass an int for a
        reproducible split.

    Returns
    -------
    tuple
        ``(S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, scaler)`` with the feature
        matrices standardised and the labels returned as NumPy arrays.
    """
    target_columns = ['Target_t-2', 'Target_t-1', 'Target_t', 'Target_t+1', 'Target_t+2']
    # axis must be passed by keyword: pandas >= 2.0 rejects the positional form.
    features = sample_training_data_df.drop(target_columns, axis = 1)
    labels = sample_training_data_df['Target_t']
    S12_x, S3_x, S12_y, S3_y = train_test_split(features, labels,
                                                test_size = 0.2, stratify = labels,
                                                random_state = random_state)
    scaler = StandardScaler().fit(S12_x)
    S12_x = scaler.transform(S12_x)
    S3_x = scaler.transform(S3_x)
    S1_x, S2_x, S1_y, S2_y = train_test_split(S12_x, S12_y,
                                              test_size = 0.125, stratify = S12_y,
                                              random_state = random_state)
    return S1_x, S2_x, S3_x, S1_y.values, S2_y.values, S3_y.values, scaler

In [13]:
def read_org_test_data(simulated_org_number, first_full_week, last_full_week):
    """Assemble sliding-window test rows plus generated trait columns.

    The feature rows are built as in ``read_org_data``: for each week ``t``
    the frames for t-2 ... t+2 are joined side by side with period suffixes
    and rows with missing values are dropped. In addition, for each week's
    joined frame:

    * trait_4 ... trait_18 are True when the associated detector exceeds
      that column's own 90th percentile (computed within this week's frame)
      in at least one of the five periods;
    * trait_20 is the bitwise OR of the integer-cast X058a values over the
      five periods.

    Returns the stacked weeks with columns ``det_names_in_correct_order``,
    the five suffixed Target columns, then ``all_trait_names``.
    """
    period_suffixes = ["_t-2", "_t-1", "_t", "_t+1", "_t+2"]
    target_columns = ["Target" + suffix for suffix in period_suffixes]
    # (trait column, detector) pairs for the percentile-based traits.
    percentile_traits = [('trait_4', 'X021f'), ('trait_6', 'X021h'),
                         ('trait_8', 'X022f'), ('trait_10', 'X022h'),
                         ('trait_12', 'X027f'), ('trait_14', 'X027h'),
                         ('trait_16', 'X028f'), ('trait_18', 'X028h')]

    def or_together(per_period_series):
        # Left-to-right element-wise OR of the given Series.
        combined = per_period_series[0]
        for later in per_period_series[1:]:
            combined = combined | later
        return combined

    window = [read_week_data_QD(week, simulated_org_number)
              for week in range(first_full_week - 2, first_full_week + 2)]
    stacked_weeks = []
    for centre_week in range(first_full_week, last_full_week + 1):
        window.append(read_week_data_QD(centre_week + 2, simulated_org_number))
        wide_week = pd.concat([week_df.rename(columns = lambda name, suffix = suffix: name + suffix)
                               for week_df, suffix in zip(window, period_suffixes)],
                              axis = 1)
        wide_week.dropna(inplace=True)

        for trait_name, detector in percentile_traits:
            readings = [wide_week[detector + suffix] for suffix in period_suffixes]
            wide_week[trait_name] = or_together([column > np.percentile(column, 90)
                                                 for column in readings])

        # NOTE(review): this is a bitwise OR of integers, so it only yields a
        # 0/1 flag if X058a itself is 0/1 - confirm against the data.
        wide_week['trait_20'] = or_together([wide_week['X058a' + suffix].astype(int)
                                             for suffix in period_suffixes])

        stacked_weeks.append(wide_week)
        window.pop(0)
    return pd.concat(stacked_weeks)[det_names_in_correct_order + target_columns + all_trait_names]

In [14]:
# Training windows for organisation 4, centre weeks 7 through 33.
training_data = read_org_data(4, 7, 33)

In [15]:
# Display the assembled training frame (detector columns followed by the five Target columns).
training_data

Unnamed: 0,X001a_t-2,X001a_t-1,X001a_t,X001a_t+1,X001a_t+2,X001b_t-2,X001b_t-1,X001b_t,X001b_t+1,X001b_t+2,...,X060a_t-2,X060a_t-1,X060a_t,X060a_t+1,X060a_t+2,Target_t-2,Target_t-1,Target_t,Target_t+1,Target_t+2
0,51319.0,32561.0,40373.0,53542.0,38087.0,27662,20315,12886,39223,6954,...,0,21,3,7,2,0,0,0,0,0
1,47672.0,50258.0,78576.0,15041.0,93388.0,15000,29285,34765,27478,836339,...,1,8,6,31,3,0,0,0,0,0
2,54557.0,46921.0,25039.0,204117.0,99591.0,25086,16658,353,22488,73414,...,28,10,25,31,1,0,0,0,0,0
3,34137.0,11335.0,105917.0,6597.0,31816.0,31113,3309,19155,0,10195,...,11,4,9,15,37,0,0,0,0,0
4,32677.0,16279.0,54113.0,46927.0,31032.0,28054,27607,23826,43680,18786,...,0,7,0,2,3,0,0,0,0,0
5,35068.0,6295.0,65612.0,36111.0,29759.0,23150,10846,354629,43801,15232,...,1,6,6,4,0,0,0,0,0,0
6,30295.0,41583.0,21932.0,49545.0,90340.0,16233,6346,149621,18163,591134,...,0,11,10,30,27,0,0,0,0,0
7,25082.0,37911.0,79251.0,41831.0,3729.0,3437,25452,24362,42934,0,...,35,3,1,4,29,0,0,0,0,0
8,37294.0,43064.0,24066.0,10784.0,7542.0,5587,32727,5600,47150,1613,...,23,14,1,5,9,0,0,0,0,0
9,29365.0,20729.0,14712.0,37131.0,59901.0,21431,27115,50160,32090,3492,...,0,21,4,12,18,0,0,0,0,0


In [16]:
# Test windows (with generated trait columns) for organisation 4, centre weeks 34 through 49.
some_test_data = read_org_test_data(4, 34, 49)

In [17]:
# Display the test frame; the trait_* columns appear after the Target columns.
some_test_data

Unnamed: 0,X001a_t-2,X001a_t-1,X001a_t,X001a_t+1,X001a_t+2,X001b_t-2,X001b_t-1,X001b_t,X001b_t+1,X001b_t+2,...,Target_t+2,trait_4,trait_6,trait_8,trait_10,trait_12,trait_14,trait_16,trait_18,trait_20
0,20566,3987,36640,56993,61717,6072,178,24258,10777,12068,...,0,True,False,True,False,True,False,False,False,0
1,16964,15335,13515,13617,14708,3152,1587,18126,14823,1519,...,0,True,True,True,True,True,True,True,True,0
2,67375,42045,6520,6679,7183,8711,14760,3219,2054,14781,...,0,True,False,True,False,True,False,True,False,0
3,6562,11434,25268,25327,27388,364,7955,2291,1620,33071,...,0,True,False,False,False,False,False,False,False,0
4,15269,15450,680,1185,1132,2022,8034,15638,0,6068,...,0,False,True,False,True,False,True,False,False,0
5,536,25084,1881,2381,2428,781,3092,4533,18554,13485,...,0,True,False,True,False,False,False,True,False,0
6,14609,14540,36569,56848,61541,981,3085,3394,11952,2540,...,0,True,True,True,True,True,True,True,True,0
7,43633,15059,22829,22886,24755,7256,6613,7508,4367,9861,...,0,False,False,False,True,False,False,False,False,0
8,8647,23966,9453,9593,10344,5748,3511,6018,4462,16378,...,0,True,True,True,True,True,True,True,True,0
9,8898,21596,34768,52664,56476,4492,0,12522,15944,19342,...,0,True,True,True,True,True,True,True,True,0


In [18]:
def split_test_data(sample_test_data_df, scaler_from_training_data):
    """Separate a test frame into scaled features, labels and trait columns.

    The detector columns (in ``det_names_in_correct_order``) are transformed
    with the scaler fitted on the training data; ``Target_t`` is returned as
    a NumPy array; the ``all_trait_names`` columns are returned unchanged as
    a DataFrame.
    """
    raw_features = sample_test_data_df[det_names_in_correct_order]
    labels = sample_test_data_df['Target_t']
    trait_frame = sample_test_data_df[all_trait_names]
    scaled_features = scaler_from_training_data.transform(raw_features)
    return scaled_features, labels.values, trait_frame

In [19]:
def find_threshold(true_y_values, pred_results):
    """Pick the score cutoff that maximises F1 on the given labelled scores.

    Candidate cutoffs are the thresholds returned by
    ``precision_recall_curve``. A sample is predicted positive when its
    score is strictly greater than the cutoff.

    Returns
    -------
    tuple
        ``(best_cutoff, best_f1)``; on ties the first (lowest-index) cutoff wins.
    """
    candidate_cutoffs = precision_recall_curve(true_y_values, pred_results)[2]
    f1_per_cutoff = np.array([f1_score(true_y_values, (pred_results > cutoff).astype('int'))
                              for cutoff in candidate_cutoffs], dtype = float)
    best_index = np.argmax(f1_per_cutoff)
    return candidate_cutoffs[best_index], f1_per_cutoff.max()

In [20]:
def RCP14_Algorithm_2(S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, T_x, T_y,
                      T_generated_attributes, chosen_weight, num_iterations = 1):
    """Train, tune the alert cutoff on S3, and score the alerts on the test set.

    Each iteration trains a new model on S1 (validated on S2) with
    ``chosen_weight`` as the positive-class weight, picks the F1-optimal
    cutoff on S3, and labels a test row as an alert when its predicted
    score is strictly above that cutoff.

    Recorded per iteration (``TP`` = rows that are both true positives in
    ``T_y`` and alerts):

    * Answer_1: TP / number of positives in T_y
    * Answer_2: TP / number of alerts
    * Answer_3: TP / number of negatives in T_y
    * For each trait k in 4, 6, ..., 20:
      Answer_k   = mean(trait_k AND alert) / mean(alert)
      Answer_k+1 = mean(trait_k AND alert) / mean(trait_k)

    NOTE(review): Answer_3 divides true alerts by the negative count; if a
    false-alarm rate was intended the numerator would need to count alerts
    on negatives - confirm against the question being answered.

    Returns a dict mapping "Answer_1" ... "Answer_21" to arrays of length
    ``num_iterations``.
    """
    answer_dict = {"Answer_{}".format(answer_num): np.zeros(num_iterations)
                   for answer_num in range(1, 22)}
    for run in range(num_iterations):
        trained = get_model(S1_x, S1_y, S2_x, S2_y, chosen_weight)
        s3_scores = trained.predict(S3_x)[:, 0]
        chosen_tau, best_score = find_threshold(true_y_values = S3_y, pred_results = s3_scores)
        print("optimized cutoff is {}".format(chosen_tau))
        print("The F1 score for this choice is {}.".format(best_score))

        test_scores = trained.predict(T_x)[:, 0]
        T_labels = (test_scores > chosen_tau).astype(int)
        print("Number of alerts is {}.".format(T_labels.sum()))

        true_alerts = (T_y & T_labels).sum()
        answer_dict["Answer_1"][run] = true_alerts / T_y.sum()
        answer_dict["Answer_2"][run] = true_alerts / T_labels.sum()
        answer_dict["Answer_3"][run] = true_alerts / (T_y ^ 1).sum()

        alert_rate = T_labels.mean()
        # trait_k feeds Answer_k and Answer_(k+1).
        for trait_num in range(4, 21, 2):
            trait_column = T_generated_attributes["trait_{}".format(trait_num)]
            joint_rate = (trait_column.values & T_labels).mean()
            answer_dict["Answer_{}".format(trait_num)][run] = joint_rate / alert_rate
            answer_dict["Answer_{}".format(trait_num + 1)][run] = joint_rate / np.mean(trait_column)
    return answer_dict

In [21]:
def get_all_answers_for_org(simulated_org_number, iter_per_weight = 25, answer_iterations = 1):
    """Run the full train / tune / score pipeline for one organisation.

    Training windows use centre weeks 7-33 and test windows use centre
    weeks 34-49. Infinite values are converted to NaN and incomplete rows
    dropped before splitting. The positive-class weight is fixed at 180.

    ``iter_per_weight`` is currently unused: it only fed the weight-search
    step (``RCP14_Algorithm_1``), which is disabled here.

    Returns the answer dict produced by ``RCP14_Algorithm_2`` with
    ``answer_iterations`` iterations.
    """
    infinities = [-np.inf, np.inf]

    training_data = read_org_data(simulated_org_number, 7, 33).replace(infinities, np.nan).dropna()
    (S1_x, S2_x, S3_x,
     S1_y, S2_y, S3_y,
     scaler_from_training_data) = split_training_data(training_data)

    # Fixed class weight used in place of the disabled weight search.
    weight_to_use = 180

    test_data = read_org_test_data(simulated_org_number, 34, 49).replace(infinities, np.nan).dropna()
    T_x, T_y, T_generated_attributes = split_test_data(test_data, scaler_from_training_data)

    return RCP14_Algorithm_2(S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, T_x, T_y,
                             T_generated_attributes, weight_to_use,
                             num_iterations = answer_iterations)

In [23]:
# Run the pipeline for organisations 6 through 10 and write one CSV of answers per organisation.
answers_csv_template = "C:/Users/Mimran/OneDrive - George Mason University/C4I PC Backup/SCITE/RCPs Fifth Quarter/RCP14/Dev/Answers_Zhengyang_{}.csv"
Answers_for_each_org_dict = {}
for org_num in range(6, 11):
    org_answers = get_all_answers_for_org(org_num)
    Answers_for_each_org_dict[org_num] = org_answers
    pd.DataFrame(org_answers).to_csv(answers_csv_template.format(org_num), index=False)

  'precision', 'predicted', average, warn_for)


optimized cutoff is 0.9675455093383789
The F1 score for this choice is 0.7567567567567568.
Number of alerts is 17.
optimized cutoff is 0.9845302104949951
The F1 score for this choice is 0.6111111111111112.
Number of alerts is 68.
optimized cutoff is 0.41465070843696594
The F1 score for this choice is 0.761904761904762.
Number of alerts is 52.
optimized cutoff is 0.48525843024253845
The F1 score for this choice is 0.5517241379310345.
Number of alerts is 83.
optimized cutoff is 0.9882998466491699
The F1 score for this choice is 0.6046511627906977.
Number of alerts is 54.


In [56]:
# Scratch: a 10x1 column of small scores used to try out column indexing.
# Presumably pasted from a model's predict() output - confirm.
some_arr = np.array([[2.62939807e-07],
 [  9.73050701e-05],
 [  3.06557581e-07],
 [  2.80383620e-06],
 [  2.15645372e-07],
 [  4.46738454e-07],
 [  5.90954005e-06],
 [  9.25215500e-06],
 [  3.62556960e-07],
 [  1.25229633e-06]])

In [62]:
# Taking column 0 flattens the (10, 1) array to 1-D, the same indexing applied to predict() output.
some_arr[:,0]

array([  2.62939807e-07,   9.73050701e-05,   3.06557581e-07,
         2.80383620e-06,   2.15645372e-07,   4.46738454e-07,
         5.90954005e-06,   9.25215500e-06,   3.62556960e-07,
         1.25229633e-06])

In [24]:
# org_num is the leftover loop variable from the driver loop (its last value, 10).
print(Answers_for_each_org_dict[org_num])

{'Answer_12': array([ 0.24074074]), 'Answer_14': array([ 0.14814815]), 'Answer_20': array([ 1.]), 'Answer_15': array([ 0.00040766]), 'Answer_1': array([ 0.75675676]), 'Answer_3': array([ 0.00056549]), 'Answer_5': array([ 0.00070746]), 'Answer_11': array([ 0.00050226]), 'Answer_19': array([ 0.00067066]), 'Answer_7': array([ 0.00120458]), 'Answer_8': array([ 0.18518519]), 'Answer_21': array([ 0.04791482]), 'Answer_10': array([ 0.18518519]), 'Answer_18': array([ 0.24074074]), 'Answer_6': array([ 0.44444444]), 'Answer_16': array([ 0.16666667]), 'Answer_17': array([ 0.00048646]), 'Answer_2': array([ 0.51851852]), 'Answer_4': array([ 0.25925926]), 'Answer_13': array([ 0.00071764]), 'Answer_9': array([ 0.00050816])}


In [28]:
# One row per organisation, one column per answer.
# NOTE(review): the driver loop covers range(6, 11), i.e. organisations 6-10, while the
# file name says "thru_11"; index = False also drops the organisation numbers from the file.
pd.DataFrame(Answers_for_each_org_dict).T.astype(float).to_csv("Zhengyang_Answers_for_6_thru_11.csv", index = False)

In [49]:
# NOTE(review): the saved output (array([ 52.])) comes from execution In [49] and does not
# match the Answer_1 ratio printed by In [24]; presumably a stale result from another run.
Answers_for_each_org_dict[org_num]["Answer_1"]

array([ 52.])

In [101]:
# Scratch: rebuild the training data for organisation 3 and split it, to inspect class balance.
training_data = read_org_data(3, 7, 33)
S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, scaler_from_training_data = split_training_data(training_data)

In [102]:
# Number of positive labels in S1.
S1_y.sum()

79

In [103]:
# Number of positive labels in S2.
S2_y.sum()

11

In [104]:
# Number of positive labels in S3.
S3_y.sum()

22

In [76]:
# Sanity check: positives across S1 + S2 + S3 should equal the total (112).
# These counts presumably come from an earlier random split than the 79 / 11 / 22 shown above.
76+13+23

112

In [79]:
# Total number of positive labels in the training data.
training_data['Target_t'].sum()

112

In [86]:
# Share of positives outside S3 (presumably 76 + 13 = 89 of 112); about 0.8 is expected from test_size = 0.2.
89/112

0.7946428571428571

In [90]:
# Presumably the share of S1+S2 positives that landed in S2; about 0.125 is expected from the second split.
11/89

0.12359550561797752

In [82]:
# Presumably the share of positives held out in S3; about 0.2 is expected from test_size = 0.2.
23/112

0.20535714285714285

In [107]:
# Scratch: test windows for organisation 4, used below to compare trait_20 with its source detector.
test_data = read_org_test_data(4, 34, 49)

In [108]:
# Total of trait_20, which ORs X058a over the five periods t-2 ... t+2.
test_data['trait_20'].sum()

1192

In [111]:
# Total of X058a in the centre period only, for comparison with the trait_20 total above.
test_data['X058a_t'].sum()

500