In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

In [2]:
import numpy as np
import pandas as pd
import sklearn
import keras
import csv

Using TensorFlow backend.


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import average_precision_score, f1_score, precision_recall_curve
from numpy.random import choice
from scipy.optimize import minimize_scalar

In [4]:
def get_week_data_filename_QD(week_number, simulated_org_number):
    head_folder_name = "C:/Users/Mimran/OneDrive - George Mason University/C4I PC Backup/SCITE/RCPs Fifth Quarter/RCP14/Dev/Mohanad July 23 Pop"
    full_filename = "{}/Week{:0>3}/Week{}_{:0>3}.csv".format(head_folder_name, 
                                                     week_number, week_number,
                                                     simulated_org_number)
    return full_filename

In [5]:
get_week_data_filename_QD(5, 20)

'C:/Users/Mimran/OneDrive - George Mason University/C4I PC Backup/SCITE/RCPs Fifth Quarter/RCP14/Dev/Mohanad July 23 Pop/Week005/Week5_020.csv'

In [6]:
def read_week_data_QD(week_number, simulated_org_number):
    full_filename = get_week_data_filename_QD(week_number, simulated_org_number)
    week_df = pd.read_csv(full_filename, usecols = list(range(1,67))) # Note this assumes similar order of users everywhere
    week_df.replace([-np.inf,np.inf], np.nan, inplace=True) #(no matching of users is necessary *under this assumption*)
    return week_df.dropna()

In [7]:
feature_list = read_week_data_QD(34, 4).columns.tolist()

In [8]:
det_names = feature_list[1:]
det_names

['001a',
 '001b',
 '001c',
 '014a',
 '015a',
 '021a',
 '021d',
 '021e',
 '021f',
 '021g',
 '021h',
 '021i',
 '021j',
 '022a',
 '022d',
 '022e',
 '022f',
 '022g',
 '022h',
 '022i',
 '022j',
 '027a',
 '027d',
 '027e',
 '027f',
 '027g',
 '027h',
 '027i',
 '027j',
 '028a',
 '028d',
 '028e',
 '028f',
 '028g',
 '028h',
 '028i',
 '028j',
 '029a',
 '030a',
 '031a',
 '032a',
 '033a',
 '034a',
 '035a',
 '036a',
 '037a',
 '038a',
 '039a',
 '040a',
 '041a',
 '042a',
 '043a',
 '044a',
 '045a',
 '046a',
 '047a',
 '048a',
 '049a',
 '050a',
 '051a',
 '052a',
 '053a',
 '058a',
 '059a',
 '060a']

In [9]:
ES = keras.callbacks.EarlyStopping(monitor = 'val_loss', min_delta = 10 ** (-5), patience = 2)

def create_keras_model():
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(325, input_shape = (325,), activation='tanh'))
    model.add(keras.layers.Dense(325, activation='tanh'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer = 'sgd', loss = 'binary_crossentropy')
    return model    

def get_model(S1_x, S1_y, S2_x, S2_y, test_class_weight):
    current_model = create_keras_model()
    current_model.fit(S1_x, S1_y, 
                      callbacks = [ES], 
                      validation_data = (S2_x, S2_y), 
                      epochs = 100, 
                      class_weight = {0:1, 1:test_class_weight}, verbose = 0)
    return current_model

def get_model_predictions(S1_x, S1_y, S2_x, S2_y, test_class_weight, S3_x):
    current_model = create_keras_model()
    current_model.fit(S1_x, S1_y, 
                      callbacks = [ES], 
                      validation_data = (S2_x, S2_y), 
                      epochs = 100, 
                      class_weight = {0:1, 1:test_class_weight}, verbose = 0)
    return current_model.predict(S3_x)

In [10]:
det_names_in_correct_order = [detector + time_period for detector in det_names for time_period in ["_t-2", "_t-1", "_t", "_t+1", "_t+2"] ]
det_names_in_correct_order

['001a_t-2',
 '001a_t-1',
 '001a_t',
 '001a_t+1',
 '001a_t+2',
 '001b_t-2',
 '001b_t-1',
 '001b_t',
 '001b_t+1',
 '001b_t+2',
 '001c_t-2',
 '001c_t-1',
 '001c_t',
 '001c_t+1',
 '001c_t+2',
 '014a_t-2',
 '014a_t-1',
 '014a_t',
 '014a_t+1',
 '014a_t+2',
 '015a_t-2',
 '015a_t-1',
 '015a_t',
 '015a_t+1',
 '015a_t+2',
 '021a_t-2',
 '021a_t-1',
 '021a_t',
 '021a_t+1',
 '021a_t+2',
 '021d_t-2',
 '021d_t-1',
 '021d_t',
 '021d_t+1',
 '021d_t+2',
 '021e_t-2',
 '021e_t-1',
 '021e_t',
 '021e_t+1',
 '021e_t+2',
 '021f_t-2',
 '021f_t-1',
 '021f_t',
 '021f_t+1',
 '021f_t+2',
 '021g_t-2',
 '021g_t-1',
 '021g_t',
 '021g_t+1',
 '021g_t+2',
 '021h_t-2',
 '021h_t-1',
 '021h_t',
 '021h_t+1',
 '021h_t+2',
 '021i_t-2',
 '021i_t-1',
 '021i_t',
 '021i_t+1',
 '021i_t+2',
 '021j_t-2',
 '021j_t-1',
 '021j_t',
 '021j_t+1',
 '021j_t+2',
 '022a_t-2',
 '022a_t-1',
 '022a_t',
 '022a_t+1',
 '022a_t+2',
 '022d_t-2',
 '022d_t-1',
 '022d_t',
 '022d_t+1',
 '022d_t+2',
 '022e_t-2',
 '022e_t-1',
 '022e_t',
 '022e_t+1',
 '022

In [11]:
all_trait_names = ["trait_" + str(trait_num) for trait_num in range(4, 21, 2)]

In [12]:
def read_org_data(simulated_org_number, first_full_week, last_full_week):
    all_full_week_dfs = []
    list_of_dfs_for_feature_vectors = [read_week_data_QD(week_number, simulated_org_number) for week_number in range(first_full_week - 2, first_full_week + 2)]
    for current_week in range(first_full_week, last_full_week + 1):
        list_of_dfs_for_feature_vectors.append(read_week_data_QD(current_week + 2, simulated_org_number))
        current_week_df = pd.concat([list_of_dfs_for_feature_vectors[0].rename(columns = lambda some_str: some_str + "_t-2"), 
                                     list_of_dfs_for_feature_vectors[1].rename(columns = lambda some_str: some_str + "_t-1"), 
                                     list_of_dfs_for_feature_vectors[2].rename(columns = lambda some_str: some_str + "_t"), 
                                     list_of_dfs_for_feature_vectors[3].rename(columns = lambda some_str: some_str + "_t+1"), 
                                     list_of_dfs_for_feature_vectors[4].rename(columns = lambda some_str: some_str + "_t+2")], 
                                    axis = 1)
        current_week_df.dropna(inplace=True)
        all_full_week_dfs.append(current_week_df)
        del list_of_dfs_for_feature_vectors[0]
    return pd.concat(all_full_week_dfs)[det_names_in_correct_order + ['Target_t-2',
                                                                              'Target_t-1',
                                                                              'Target_t',
                                                                              'Target_t+1',
                                                                              'Target_t+2']]

In [13]:
def split_training_data(sample_training_data_df):
    S12_x, S3_x, S12_y, S3_y = train_test_split(sample_training_data_df.drop(['Target_t-2',
                                                                              'Target_t-1',
                                                                              'Target_t',
                                                                              'Target_t+1',
                                                                              'Target_t+2'], 1),
                                                sample_training_data_df['Target_t'],
                                                test_size = 0.2, stratify = sample_training_data_df['Target_t'])
    scaler = StandardScaler().fit(S12_x)
    S12_x = scaler.transform(S12_x)
    S3_x = scaler.transform(S3_x)
    S1_x, S2_x, S1_y, S2_y = train_test_split(S12_x, S12_y, test_size = 0.125, stratify = S12_y)
    return S1_x, S2_x, S3_x, S1_y.values, S2_y.values, S3_y.values, scaler

In [14]:
def read_org_test_data(simulated_org_number, first_full_week, last_full_week):
    all_full_week_dfs = []
    list_of_dfs_for_feature_vectors = [read_week_data_QD(week_number, simulated_org_number) for week_number in range(first_full_week - 2, first_full_week + 2)]
    for current_week in range(first_full_week, last_full_week + 1):
        list_of_dfs_for_feature_vectors.append(read_week_data_QD(current_week + 2, simulated_org_number))
        current_week_df = pd.concat([list_of_dfs_for_feature_vectors[0].rename(columns = lambda some_str: some_str + "_t-2"), 
                                     list_of_dfs_for_feature_vectors[1].rename(columns = lambda some_str: some_str + "_t-1"), 
                                     list_of_dfs_for_feature_vectors[2].rename(columns = lambda some_str: some_str + "_t"), 
                                     list_of_dfs_for_feature_vectors[3].rename(columns = lambda some_str: some_str + "_t+1"), 
                                     list_of_dfs_for_feature_vectors[4].rename(columns = lambda some_str: some_str + "_t+2")], 
                                    axis = 1)
        current_week_df.dropna(inplace=True)
        detector_string = '021f'
        current_week_df['trait_4'] = ((current_week_df['{}_t-2'.format(detector_string)] > np.percentile(current_week_df['{}_t-2'.format(detector_string)], 90)) | 
                                      (current_week_df['{}_t-1'.format(detector_string)] > np.percentile(current_week_df['{}_t-1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t'.format(detector_string)] > np.percentile(current_week_df['{}_t'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+1'.format(detector_string)] > np.percentile(current_week_df['{}_t+1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+2'.format(detector_string)] > np.percentile(current_week_df['{}_t+2'.format(detector_string)], 90)))
        detector_string = '021h'
        current_week_df['trait_6'] = ((current_week_df['{}_t-2'.format(detector_string)] > np.percentile(current_week_df['{}_t-2'.format(detector_string)], 90)) | 
                                      (current_week_df['{}_t-1'.format(detector_string)] > np.percentile(current_week_df['{}_t-1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t'.format(detector_string)] > np.percentile(current_week_df['{}_t'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+1'.format(detector_string)] > np.percentile(current_week_df['{}_t+1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+2'.format(detector_string)] > np.percentile(current_week_df['{}_t+2'.format(detector_string)], 90)))
        detector_string = '022f'
        current_week_df['trait_8'] = ((current_week_df['{}_t-2'.format(detector_string)] > np.percentile(current_week_df['{}_t-2'.format(detector_string)], 90)) | 
                                      (current_week_df['{}_t-1'.format(detector_string)] > np.percentile(current_week_df['{}_t-1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t'.format(detector_string)] > np.percentile(current_week_df['{}_t'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+1'.format(detector_string)] > np.percentile(current_week_df['{}_t+1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+2'.format(detector_string)] > np.percentile(current_week_df['{}_t+2'.format(detector_string)], 90)))
        detector_string = '022h'
        current_week_df['trait_10'] = ((current_week_df['{}_t-2'.format(detector_string)] > np.percentile(current_week_df['{}_t-2'.format(detector_string)], 90)) | 
                                      (current_week_df['{}_t-1'.format(detector_string)] > np.percentile(current_week_df['{}_t-1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t'.format(detector_string)] > np.percentile(current_week_df['{}_t'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+1'.format(detector_string)] > np.percentile(current_week_df['{}_t+1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+2'.format(detector_string)] > np.percentile(current_week_df['{}_t+2'.format(detector_string)], 90)))
        detector_string = '027f'
        current_week_df['trait_12'] = ((current_week_df['{}_t-2'.format(detector_string)] > np.percentile(current_week_df['{}_t-2'.format(detector_string)], 90)) | 
                                      (current_week_df['{}_t-1'.format(detector_string)] > np.percentile(current_week_df['{}_t-1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t'.format(detector_string)] > np.percentile(current_week_df['{}_t'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+1'.format(detector_string)] > np.percentile(current_week_df['{}_t+1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+2'.format(detector_string)] > np.percentile(current_week_df['{}_t+2'.format(detector_string)], 90)))
        detector_string = '027h'
        current_week_df['trait_14'] = ((current_week_df['{}_t-2'.format(detector_string)] > np.percentile(current_week_df['{}_t-2'.format(detector_string)], 90)) | 
                                      (current_week_df['{}_t-1'.format(detector_string)] > np.percentile(current_week_df['{}_t-1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t'.format(detector_string)] > np.percentile(current_week_df['{}_t'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+1'.format(detector_string)] > np.percentile(current_week_df['{}_t+1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+2'.format(detector_string)] > np.percentile(current_week_df['{}_t+2'.format(detector_string)], 90)))
        detector_string = '028f'
        current_week_df['trait_16'] = ((current_week_df['{}_t-2'.format(detector_string)] > np.percentile(current_week_df['{}_t-2'.format(detector_string)], 90)) | 
                                      (current_week_df['{}_t-1'.format(detector_string)] > np.percentile(current_week_df['{}_t-1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t'.format(detector_string)] > np.percentile(current_week_df['{}_t'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+1'.format(detector_string)] > np.percentile(current_week_df['{}_t+1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+2'.format(detector_string)] > np.percentile(current_week_df['{}_t+2'.format(detector_string)], 90)))
        detector_string = '028h'
        current_week_df['trait_18'] = ((current_week_df['{}_t-2'.format(detector_string)] > np.percentile(current_week_df['{}_t-2'.format(detector_string)], 90)) | 
                                      (current_week_df['{}_t-1'.format(detector_string)] > np.percentile(current_week_df['{}_t-1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t'.format(detector_string)] > np.percentile(current_week_df['{}_t'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+1'.format(detector_string)] > np.percentile(current_week_df['{}_t+1'.format(detector_string)], 90)) |
                                      (current_week_df['{}_t+2'.format(detector_string)] > np.percentile(current_week_df['{}_t+2'.format(detector_string)], 90)))
        detector_string = '058a'
        current_week_df['trait_20'] = ((current_week_df['{}_t-2'.format(detector_string)]).astype(int) | 
                                      (current_week_df['{}_t-1'.format(detector_string)]).astype(int) |
                                      (current_week_df['{}_t'.format(detector_string)]).astype(int) |
                                      (current_week_df['{}_t+1'.format(detector_string)]).astype(int) |
                                      (current_week_df['{}_t+2'.format(detector_string)]).astype(int))
        all_full_week_dfs.append(current_week_df)
        del list_of_dfs_for_feature_vectors[0]
    return pd.concat(all_full_week_dfs)[det_names_in_correct_order + ['Target_t-2',
                                                                              'Target_t-1',
                                                                              'Target_t',
                                                                              'Target_t+1',
                                                                              'Target_t+2'] + all_trait_names]

In [15]:
training_data = read_org_data(4, 7, 33)

In [16]:
training_data

Unnamed: 0,001a_t-2,001a_t-1,001a_t,001a_t+1,001a_t+2,001b_t-2,001b_t-1,001b_t,001b_t+1,001b_t+2,...,060a_t-2,060a_t-1,060a_t,060a_t+1,060a_t+2,Target_t-2,Target_t-1,Target_t,Target_t+1,Target_t+2
0,91647.0,27684.0,0.0,0.0,99559.0,0.0,0.0,0.0,0.0,13990.0,...,29,6,5,4,20,0,0,0,0,0
1,27571.0,33708.0,94522.0,62116.0,1783.0,0.0,0.0,0.0,0.0,0.0,...,3,26,29,9,2,0,0,0,0,0
2,0.0,61715.0,21997.0,0.0,9995.0,0.0,51320.0,45063.0,0.0,7292.0,...,2,6,11,26,9,0,0,0,0,0
3,26547.0,0.0,66372.0,25894.0,51525.0,2889.0,0.0,1825.0,13571.0,49611.0,...,13,23,1,3,5,0,0,0,0,0
4,33367.0,37137.0,73056.0,0.0,171270.0,0.0,0.0,26857.0,0.0,3037.0,...,16,10,4,25,26,0,0,0,0,0
5,0.0,0.0,0.0,163290.0,0.0,7204.0,0.0,0.0,30155.0,0.0,...,17,2,24,1,13,0,0,0,0,0
6,0.0,64394.0,81503.0,69709.0,101300.0,0.0,0.0,0.0,65600.0,0.0,...,29,1,25,29,6,0,0,0,0,0
7,194200.0,0.0,14739.0,0.0,58333.0,28951.0,0.0,14922.0,0.0,57559.0,...,1,12,10,1,17,0,0,0,0,0
8,0.0,49526.0,0.0,133950.0,44800.0,0.0,0.0,0.0,0.0,3428.0,...,23,10,25,4,3,0,0,0,0,0
9,299030.0,0.0,0.0,0.0,189050.0,244870.0,0.0,0.0,0.0,28193.0,...,30,8,3,27,5,0,0,0,0,0


In [17]:
some_test_data = read_org_test_data(4, 34, 49)

In [18]:
some_test_data

Unnamed: 0,001a_t-2,001a_t-1,001a_t,001a_t+1,001a_t+2,001b_t-2,001b_t-1,001b_t,001b_t+1,001b_t+2,...,Target_t+2,trait_4,trait_6,trait_8,trait_10,trait_12,trait_14,trait_16,trait_18,trait_20
0,0.0,36155.0,50680.0,34752.0,0.0,6362.0,0.0,0.0,0.0,0.0,...,0,True,False,False,False,True,False,False,False,0
1,0.0,94800.0,34704.0,7322.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,False,False,False,False,False,False,False,False,0
2,0.0,44961.0,43250.0,19081.0,0.0,0.0,46872.0,43345.0,6818.0,0.0,...,0,False,True,False,True,False,True,True,True,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,True,True,False,True,False,False,False,False,0
4,0.0,82054.0,59264.0,123750.0,40571.0,0.0,0.0,0.0,0.0,0.0,...,0,False,False,False,False,False,False,False,False,0
5,14976.0,123110.0,43542.0,14490.0,15043.0,0.0,7764.0,3464.0,2506.0,0.0,...,0,False,False,False,False,False,False,False,False,0
6,9023.0,36821.0,30637.0,28443.0,26175.0,0.0,1017.0,5417.0,29407.0,24584.0,...,0,True,False,False,False,True,False,False,False,0
7,27558.0,72947.0,45228.0,36956.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,False,False,False,False,False,False,False,False,0
8,128660.0,13090.0,0.0,0.0,0.0,4599.0,0.0,0.0,0.0,0.0,...,0,False,False,False,False,False,False,False,False,0
9,96713.0,14020.0,9382.0,7810.0,34125.0,0.0,2256.0,4013.0,9524.0,0.0,...,0,True,False,True,False,True,False,False,False,0


In [19]:
def split_test_data(sample_test_data_df, scaler_from_training_data):
    T_x, T_y, T_generated_attributes = (sample_test_data_df[det_names_in_correct_order], 
                                        sample_test_data_df['Target_t'], 
                                        sample_test_data_df[all_trait_names])
    T_x = scaler_from_training_data.transform(T_x)
    return T_x, T_y.values, T_generated_attributes

In [20]:
def find_threshold(true_y_values, pred_results):
    prec, rec, thres = precision_recall_curve(true_y_values, pred_results)
    score = np.zeros(len(thres))

    # Score Thresholds
    for i in range(0,len(thres)):
        score[i] = f1_score(true_y_values, (pred_results > thres[i]).astype('int'))
    return thres[np.argmax(score)], score.max()

In [21]:
def RCP14_Algorithm_1_Ted_enhanced(S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, iter_per_weight_per_round):
    weights_to_test = list(range(10, 210, 10))
    weight_scores = {trial_weight : [] for trial_weight in range(10, 210, 10)}
    for elimination_round in range(3):
        for trial_weight in weights_to_test:
            for iteration_num in range(iter_per_weight_per_round):
                model_prediction_output = get_model_predictions(S1_x, S1_y, S2_x, S2_y, trial_weight, S3_x)
                weight_scores[trial_weight].append(average_precision_score(S3_y, model_prediction_output))
                print("current area is {} for weight {} in elimination round {} for iteration number {}".format(weight_scores[trial_weight][-1], trial_weight, elimination_round, iteration_num))
            #np.savetxt(fname = "weight_scores_weight_{}_round_{}_{}.csv".format(trial_weight, elimination_round, added_name_string), 
            #           X = weight_scores[trial_weight], delimiter = ",")
        weight_avg_scores = {trial_weight : np.mean(weight_scores[trial_weight]) for trial_weight in range(10, 210, 10)}
        print("mean score is {} for trial weight {}".format(weight_avg_scores, trial_weight))
        worst_5_weights = sorted(weights_to_test, key=lambda k: weight_avg_scores[k])[0:5]
        weights_to_test = [some_weight for some_weight in weights_to_test if some_weight not in worst_5_weights]
    return weights_to_test, weight_scores

In [22]:
def choose_weights(possible_weights, number_of_results, relative_probability_vector):
    if len(possible_weights) != len(relative_probability_vector):
        raise Exception("Weights vector must be the same size as the probability vector")
    relative_probability_vector = np.array(relative_probability_vector)
    abs_prob_vector = relative_probability_vector/relative_probability_vector.sum()
    return choice(a = possible_weights, size = number_of_results, p = abs_prob_vector)

In [23]:
def get_weights_for_org(simulated_org_number, iter_per_weight_per_round = 5):
    training_data = read_org_data(simulated_org_number, 7, 33)
    training_data.replace([-np.inf,np.inf], np.nan, inplace=True)
    training_data.dropna(inplace=True)
    S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, scaler_from_training_data = split_training_data(training_data)
    return RCP14_Algorithm_1_Ted_enhanced(S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, iter_per_weight_per_round)

In [24]:
# weights_to_test = {}
# weight_scores = {}
# for org_num in range(5):
#     weights_to_test, weight_scores = get_weights_for_org(org_num, iter_per_weight_per_round= 5)
#     np.savetxt("C:/Users/Mimran/Google Drive/GMU SCITE/RCPs Fifth Quarter/RCP14/Weights Mohanad July 23 Data 5 org 5 iter per weight/Weights_to_test_{}.csv".format(org_num), weights_to_test, fmt='%d', delimiter=',')
#     with open('C:/Users/Mimran/Google Drive/GMU SCITE/RCPs Fifth Quarter/RCP14/Weights Weights Mohanad July 23 Data 5 org 5 iter per weight/Weights_Scores_{}.csv'.format(org_num), 'w') as f:
#         [f.write('{0},{1}\n'.format(key, value)) for key, value in weight_scores.items()]

In [38]:
# weights_to_test

[120, 130, 140, 150, 160, 170, 180, 200]

In [45]:
# pd.DataFrame(weight_scores[10])

Unnamed: 0,0
0,0.061082
1,0.007515
2,0.01298
3,0.013239
4,0.013134
5,0.011031
6,0.018007
7,0.020807
8,0.033022
9,0.045257


In [46]:
# weight_scores

{10: [0.061082167814673877,
  0.007514857962241041,
  0.012979761729968372,
  0.013238761158291125,
  0.013133776874725753,
  0.011030789630907875,
  0.018007440086793756,
  0.020807336688939512,
  0.033022376048929158,
  0.045256797976872855,
  0.066212365544360385,
  0.0076917838948872812,
  0.017321870241851715,
  0.021923362162220201,
  0.023264824864840165],
 20: [0.018984626265678248,
  0.016869852339701844,
  0.033491664378541695,
  0.031456180385196771,
  0.011055513133428027,
  0.013960145886283835,
  0.046956039035560081,
  0.023490136751551503,
  0.010890527254517621,
  0.023683942332320598],
 30: [0.0052505685399517471,
  0.029998791479341674,
  0.028053660159869862,
  0.021318578566349528,
  0.011403225693389515],
 40: [0.027769569964377072,
  0.030147640905485006,
  0.0085582899584106323,
  0.013407382796418248,
  0.011571114115683866],
 50: [0.014856144493694039,
  0.017364583176357404,
  0.024921692574342583,
  0.0093654942990161229,
  0.024209537136816176],
 60: [0.016

In [52]:
# pd.DataFrame(weights_to_test)

Unnamed: 0,0
0,120
1,130
2,140
3,150
4,160
5,170
6,180
7,200


In [50]:
# weight_scores_df = pd.concat([pd.DataFrame(weight_scores[trial_weight]) for trial_weight in range(10, 210, 10)], axis = 1)
# weight_scores_df.columns = list(range(10, 210, 10))
# weight_scores_df.to_csv("C:/Users/Mimran/Google Drive/GMU SCITE/RCPs Fifth Quarter/RCP14/Weights Mohanad July 23 Data 5 org 5 iter per weight/Weight_scores_sample.csv")

In [27]:
weights_to_test = {}
weight_scores = {}
for org_num in range(7, 11):
    weights_to_test[org_num], weight_scores[org_num] = get_weights_for_org(org_num, iter_per_weight_per_round= 5)
    #np.savetxt("C:/Users/Mimran/Google Drive/GMU SCITE/RCPs Fifth Quarter/RCP14/Weights Mohanad July 23 Data 5 org 5 iter per weight/Weights_to_test_{}.csv".format(org_num), weights_to_test, fmt='%d', delimiter=',')
    pd.DataFrame(weights_to_test[org_num]).to_csv("C:/Users/Mimran/Google Drive/GMU SCITE/RCPs Fifth Quarter/RCP14/Weights Mohanad July 23 Data 5 org 5 iter per weight/Weights_to_test_pd_{}.csv".format(org_num))
    weight_scores_df = pd.concat([pd.DataFrame(weight_scores[org_num][trial_weight]) for trial_weight in range(10, 210, 10)], axis = 1)
    weight_scores_df.columns = list(range(10, 210, 10))
    weight_scores_df.to_csv("C:/Users/Mimran/Google Drive/GMU SCITE/RCPs Fifth Quarter/RCP14/Weights Mohanad July 23 Data 5 org 5 iter per weight/Weight_scores_pd_{}.csv".format(org_num))

current area is 0.016612887734727 for weight 10 in elimination round 0 for iteration number 0
current area is 0.008665230405834569 for weight 10 in elimination round 0 for iteration number 1
current area is 0.00847474809277708 for weight 10 in elimination round 0 for iteration number 2
current area is 0.014578627362434116 for weight 10 in elimination round 0 for iteration number 3
current area is 0.013341233860005993 for weight 10 in elimination round 0 for iteration number 4
current area is 0.01301910618329903 for weight 20 in elimination round 0 for iteration number 0
current area is 0.011633046740431991 for weight 20 in elimination round 0 for iteration number 1
current area is 0.01349502676393164 for weight 20 in elimination round 0 for iteration number 2
current area is 0.01516013800476934 for weight 20 in elimination round 0 for iteration number 3
current area is 0.013852842607918312 for weight 20 in elimination round 0 for iteration number 4
current area is 0.011354235023090645 

KeyboardInterrupt: 

In [26]:
pd.DataFrame(weights_to_test[org_num]).to_csv("C:/Users/Mimran/Google Drive/GMU SCITE/RCPs Fifth Quarter/RCP14/Weights Mohanad July 23 Data 5 org 5 iter per weight/Weights_to_test_pd_{}.csv".format(org_num))
weight_scores_df = pd.concat([pd.DataFrame(weight_scores[org_num][trial_weight]) for trial_weight in range(10, 210, 10)], axis = 1)
weight_scores_df.columns = list(range(10, 210, 10))
weight_scores_df.to_csv("C:/Users/Mimran/Google Drive/GMU SCITE/RCPs Fifth Quarter/RCP14/Weights Mohanad July 23 Data 5 org 5 iter per weight/Weight_scores_pd_{}.csv".format(org_num))

In [45]:
np.savetxt("/Users/MImran/Google Drive/GMU SCITE/RCPs Fifth Quarter/RCP14/Weights Mohanad July 15 Data 1 org 1 iter per weight/Weights_to_test_{}.csv".format(org_num), weights_to_test, fmt='%d', delimiter=',')

In [50]:
with open('/Users/MImran/Google Drive/GMU SCITE/RCPs Fifth Quarter/RCP14/Weights Mohanad July 15 Data 1 org 1 iter per weight/Weights_Scores_{}.csv'.format(org_num), 'w') as f:
    [f.write('{0},{1}\n'.format(key, value)) for key, value in weight_scores.items()]

In [34]:
weights_to_test

[30, 50, 60, 70, 90, 140, 160, 170]

In [32]:
weight_scores

{10: [0.058039417874286243],
 20: [0.067525396393766174],
 30: [0.10025576619213784, 0.080912224862505294, 0.10467214450142019],
 40: [0.078930014922692526, 0.048523329444326573],
 50: [0.11439317048893004, 0.11193340517110428, 0.10023429911384485],
 60: [0.10808264935492361, 0.1191104006677256, 0.073948390686569623],
 70: [0.11832274456690846, 0.12585452383735235, 0.069601219665661282],
 80: [0.082628704116853607, 0.091688458640399206],
 90: [0.10562039558522815, 0.079274481601603095, 0.10505202298325028],
 100: [0.088737142960236465, 0.073646109646157532],
 110: [0.082043792683616334, 0.10507903773977208, 0.073961426933138313],
 120: [0.092209971282326691, 0.087685121566188559, 0.080375393731227443],
 130: [0.079666179592155134, 0.10350052017237781, 0.081330225860471964],
 140: [0.11261391266078653, 0.10456438909776479, 0.11280571992016052],
 150: [0.10188485577479107, 0.092302479009531577, 0.072830885205577803],
 160: [0.090260061343646045, 0.097981704328078442, 0.081331088540706767

In [47]:
def RCP14_Algorithm_2(S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, T_x, T_y, 
                      T_generated_attributes, best_weights, num_iterations = 1):
    answer_dict = {"Answer_" + str(answer_num) : np.zeros(num_iterations) for answer_num in range(1, 22)}
    chosen_weights = choose_weights(possible_weights= best_weights, number_of_results= num_iterations,
                                    relative_probability_vector= (#later on))
    for iteration_num in range(num_iterations):
        current_model = get_model(S1_x, S1_y, S2_x, S2_y, chosen_weights[iteration_num])
        model_prediction_output = current_model.predict(S3_x)[:,0]
        #print("Model prediction output looks like")
        #print(model_prediction_output[0:10])
        chosen_tau, best_score = find_threshold(true_y_values=S3_y, pred_results= model_prediction_output)
        print("optimized cutoff is {}".format(chosen_tau))
        print("The F1 score for this choice is {}.".format(best_score))
        prediction_output_for_test_data = current_model.predict(T_x)[:,0]
        T_labels = (prediction_output_for_test_data > chosen_tau).astype(int)
        print("Number of alerts is {}.".format(T_labels.sum()))
        answer_dict["Answer_1"][iteration_num] = (T_y & T_labels).sum() / T_y.sum()
        answer_dict["Answer_2"][iteration_num] = (T_y & T_labels).sum() / T_labels.sum()
        answer_dict["Answer_3"][iteration_num] = (T_y & T_labels).sum() / (T_y ^ 1).sum()
        answer_dict["Answer_4"][iteration_num] = (T_generated_attributes['trait_4'].values & T_labels).mean() / (T_labels).mean()
        answer_dict["Answer_5"][iteration_num] = (T_generated_attributes['trait_4'].values & T_labels).mean() / np.mean(T_generated_attributes['trait_4'])
        answer_dict["Answer_6"][iteration_num] = (T_generated_attributes['trait_6'].values & T_labels).mean() / (T_labels).mean()
        answer_dict["Answer_7"][iteration_num] = (T_generated_attributes['trait_6'].values & T_labels).mean() / np.mean(T_generated_attributes['trait_6'])
        answer_dict["Answer_8"][iteration_num] = (T_generated_attributes['trait_8'].values & T_labels).mean() / (T_labels).mean()
        answer_dict["Answer_9"][iteration_num] = (T_generated_attributes['trait_8'].values & T_labels).mean() / np.mean(T_generated_attributes['trait_8'])
        answer_dict["Answer_10"][iteration_num] = (T_generated_attributes['trait_10'].values & T_labels).mean() / (T_labels).mean()
        answer_dict["Answer_11"][iteration_num] = (T_generated_attributes['trait_10'].values & T_labels).mean() / np.mean(T_generated_attributes['trait_10'])
        answer_dict["Answer_12"][iteration_num] = (T_generated_attributes['trait_12'].values & T_labels).mean() / (T_labels).mean()
        answer_dict["Answer_13"][iteration_num] = (T_generated_attributes['trait_12'].values & T_labels).mean() / np.mean(T_generated_attributes['trait_12'])
        answer_dict["Answer_14"][iteration_num] = (T_generated_attributes['trait_14'].values & T_labels).mean() / (T_labels).mean()
        answer_dict["Answer_15"][iteration_num] = (T_generated_attributes['trait_14'].values & T_labels).mean() / np.mean(T_generated_attributes['trait_14'])
        answer_dict["Answer_16"][iteration_num] = (T_generated_attributes['trait_16'].values & T_labels).mean() / (T_labels).mean()
        answer_dict["Answer_17"][iteration_num] = (T_generated_attributes['trait_16'].values & T_labels).mean() / np.mean(T_generated_attributes['trait_16'])
        answer_dict["Answer_18"][iteration_num] = (T_generated_attributes['trait_18'].values & T_labels).mean() / (T_labels).mean()
        answer_dict["Answer_19"][iteration_num] = (T_generated_attributes['trait_18'].values & T_labels).mean() / np.mean(T_generated_attributes['trait_18'])
        answer_dict["Answer_20"][iteration_num] = (T_generated_attributes['trait_20'].values & T_labels).mean() / (T_labels).mean()
        answer_dict["Answer_21"][iteration_num] = (T_generated_attributes['trait_20'].values & T_labels).mean() / np.mean(T_generated_attributes['trait_20'])
    return answer_dict

SyntaxError: invalid syntax (<ipython-input-47-a33eef50a6fa>, line 6)

In [51]:
def get_all_answers_for_org(simulated_org_number, iter_per_weight = 25, answer_iterations = 1):
    training_data = read_org_data(simulated_org_number, 7, 33)
    training_data.replace([-np.inf,np.inf], np.nan, inplace=True)
    training_data.dropna(inplace=True)
    S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, scaler_from_training_data = split_training_data(training_data)
    #weight_to_use = RCP14_Algorithm_1(S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, 
    #                                  iter_per_weight = iter_per_weight)
    weight_to_use = 180
    test_data = read_org_test_data(simulated_org_number, 34, 49)
    test_data.replace([-np.inf,np.inf], np.nan, inplace=True)
    test_data.dropna(inplace=True)
    T_x, T_y, T_generated_attributes = split_test_data(test_data, scaler_from_training_data)
    return RCP14_Algorithm_2(S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, T_x, T_y, 
                      T_generated_attributes, weight_to_use, num_iterations = answer_iterations)

In [53]:
Answers_for_each_org_dict = {}
for org_num in range(5):
    Answers_for_each_org_dict[org_num] = get_all_answers_for_org(org_num)
    pd.DataFrame(Answers_for_each_org_dict[org_num]).to_csv("C:/Users/Mimran/OneDrive - George Mason University/C4I PC Backup/SCITE/RCPs Fifth Quarter/RCP14/Dev/Answers_Mohanad_{}.csv".format(org_num), index=False)

  'precision', 'predicted', average, warn_for)


optimized cutoff is 0.294933944940567
The F1 score for this choice is 0.05405405405405406.
Number of alerts is 2578.
optimized cutoff is 0.17648626863956451
The F1 score for this choice is 0.0425531914893617.
Number of alerts is 2431.
optimized cutoff is 0.2771953046321869
The F1 score for this choice is 0.09523809523809525.
Number of alerts is 2091.
optimized cutoff is 0.9988564252853394
The F1 score for this choice is 0.03680981595092024.
Number of alerts is 447.
optimized cutoff is 0.9968794584274292
The F1 score for this choice is 0.032.
Number of alerts is 142.


In [60]:
pd.DataFrame(Answers_for_each_org_dict).T.astype(float).to_csv("Mohanad_prelim_answers.csv", index = False)

In [68]:
def get_all_answers_for_org_alt_Mohanad(simulated_org_number, iter_per_weight = 25, answer_iterations = 1):
    training_data = read_org_data(simulated_org_number, 7, 33)
    training_data.replace([-np.inf,np.inf], np.nan, inplace=True)
    training_data.dropna(inplace=True)
    S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, scaler_from_training_data = split_training_data(training_data)
    #weight_to_use = RCP14_Algorithm_1(S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, 
    #                                  iter_per_weight = iter_per_weight)
    weight_to_use = 180
    test_data = read_org_test_data(simulated_org_number, 36, 49)
    test_data.replace([-np.inf,np.inf], np.nan, inplace=True)
    test_data.dropna(inplace=True)
    T_x, T_y, T_generated_attributes = split_test_data(test_data, scaler_from_training_data)
    return RCP14_Algorithm_2(S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, T_x, T_y, 
                      T_generated_attributes, weight_to_use, num_iterations = answer_iterations)

In [None]:
Answers_for_each_org_dict = {}
for org_num in range(20):
    Answers_for_each_org_dict[org_num] = get_all_answers_for_org_alt_Mohanad(org_num, answer_iterations=10)
    pd.DataFrame(Answers_for_each_org_dict[org_num]).to_csv("C:/Users/Mimran/OneDrive - George Mason University/C4I PC Backup/SCITE/RCPs Fifth Quarter/RCP14/Dev/20Org10Iter_Answers_Mohanad_{}.csv".format(org_num), index=False)

  'precision', 'predicted', average, warn_for)


optimized cutoff is 0.8155121803283691
The F1 score for this choice is 0.08695652173913045.
Number of alerts is 665.
optimized cutoff is 0.9124085903167725
The F1 score for this choice is 0.07692307692307693.
Number of alerts is 625.
optimized cutoff is 0.9997060894966125
The F1 score for this choice is 0.07692307692307693.
Number of alerts is 62.
optimized cutoff is 0.9179666638374329
The F1 score for this choice is 0.08695652173913045.
Number of alerts is 607.
optimized cutoff is 0.9999988079071045
The F1 score for this choice is 0.04878048780487805.
Number of alerts is 147.
optimized cutoff is 0.8741991519927979
The F1 score for this choice is 0.08333333333333334.
Number of alerts is 676.
optimized cutoff is 0.9641646146774292
The F1 score for this choice is 0.08695652173913045.
Number of alerts is 386.
optimized cutoff is 0.8126572370529175
The F1 score for this choice is 0.08333333333333334.
Number of alerts is 668.
optimized cutoff is 0.9999444484710693
The F1 score for this choi

In [79]:
pd.concat(pd.DataFrame(Answers_for_each_org_dict[org_num]) for org_num in range(20)).to_csv("MultiIter10Answers_for_first_20_orgs.csv", index = False)

In [63]:
pd.DataFrame(Answers_for_each_org_dict).T.astype(float).to_csv("Mohanad_alt__prelim_answers.csv", index = False)

In [64]:
Answers_for_each_org_dict = {}
for org_num in range(5):
    Answers_for_each_org_dict[org_num] = get_all_answers_for_org(org_num)
    #pd.DataFrame(Answers_for_each_org_dict[org_num]).to_csv("C:/Users/Mimran/OneDrive - George Mason University/C4I PC Backup/SCITE/RCPs Fifth Quarter/RCP14/Dev/Answers_Mohanad_{}.csv".format(org_num), index=False)

  'precision', 'predicted', average, warn_for)


optimized cutoff is 0.8509586453437805
The F1 score for this choice is 0.24000000000000002.
Number of alerts is 939.
optimized cutoff is 0.20511609315872192
The F1 score for this choice is 0.09302325581395349.
Number of alerts is 2077.
optimized cutoff is 0.14999797940254211
The F1 score for this choice is 0.14035087719298245.
Number of alerts is 3227.
optimized cutoff is 0.10508623719215393
The F1 score for this choice is 0.1038961038961039.
Number of alerts is 4539.
optimized cutoff is 0.06337732821702957
The F1 score for this choice is 0.0449438202247191.
Number of alerts is 3427.


In [65]:
pd.DataFrame(Answers_for_each_org_dict).T.astype(float).to_csv("Mohanad_more_prelim_answers.csv", index = False)

In [56]:
some_arr = np.array([[2.62939807e-07],
 [  9.73050701e-05],
 [  3.06557581e-07],
 [  2.80383620e-06],
 [  2.15645372e-07],
 [  4.46738454e-07],
 [  5.90954005e-06],
 [  9.25215500e-06],
 [  3.62556960e-07],
 [  1.25229633e-06]])

In [62]:
some_arr[:,0]

array([  2.62939807e-07,   9.73050701e-05,   3.06557581e-07,
         2.80383620e-06,   2.15645372e-07,   4.46738454e-07,
         5.90954005e-06,   9.25215500e-06,   3.62556960e-07,
         1.25229633e-06])

In [66]:
Answers_for_each_org_dict = {}
for org_num in range(50):
    Answers_for_each_org_dict[org_num] = get_all_answers_for_org(org_num)
    #pd.DataFrame(Answers_for_each_org_dict[org_num]).to_csv("C:/Users/Mimran/OneDrive - George Mason University/C4I PC Backup/SCITE/RCPs Fifth Quarter/RCP14/Dev/Answers_Mohanad_{}.csv".format(org_num), index=False)

  'precision', 'predicted', average, warn_for)


optimized cutoff is 0.7352255582809448
The F1 score for this choice is 0.20689655172413793.
Number of alerts is 1511.
optimized cutoff is 0.08171864598989487
The F1 score for this choice is 0.02702702702702703.
Number of alerts is 3580.
optimized cutoff is 0.5472603440284729
The F1 score for this choice is 0.13793103448275862.
Number of alerts is 1624.
optimized cutoff is 0.9986234903335571
The F1 score for this choice is 0.06896551724137931.
Number of alerts is 14.
optimized cutoff is 0.7334756851196289
The F1 score for this choice is 0.08333333333333334.
Number of alerts is 1346.
optimized cutoff is 0.20547465980052948
The F1 score for this choice is 0.14285714285714282.
Number of alerts is 1907.
optimized cutoff is 0.9969298243522644
The F1 score for this choice is 0.11940298507462685.
Number of alerts is 399.
optimized cutoff is 0.9954180717468262
The F1 score for this choice is 0.02898550724637681.
Number of alerts is 322.
optimized cutoff is 0.04548053443431854
The F1 score for t

In [67]:
pd.DataFrame(Answers_for_each_org_dict).T.astype(float).to_csv("Mohanad_even_more_prelim_answers.csv", index = False)

In [66]:
print(Answers_for_each_org_dict[org_num])

{'Answer_20': array([ 1.]), 'Answer_13': array([ 0.00214158]), 'Answer_11': array([ 0.00178117]), 'Answer_1': array([ 0.85]), 'Answer_10': array([ 0.2734375]), 'Answer_16': array([ 0.21875]), 'Answer_21': array([ 0.10847458]), 'Answer_7': array([ 0.00219421]), 'Answer_6': array([ 0.3359375]), 'Answer_18': array([ 0.3125]), 'Answer_5': array([ 0.00263484]), 'Answer_2': array([ 0.265625]), 'Answer_9': array([ 0.00201384]), 'Answer_3': array([ 0.0006867]), 'Answer_19': array([ 0.00210393]), 'Answer_4': array([ 0.3984375]), 'Answer_8': array([ 0.3046875]), 'Answer_12': array([ 0.2890625]), 'Answer_15': array([ 0.00225722]), 'Answer_14': array([ 0.3359375]), 'Answer_17': array([ 0.00166468])}


In [49]:
Answers_for_each_org_dict[org_num]["Answer_1"]

array([ 52.])

In [101]:
training_data = read_org_data(3, 7, 33)
S1_x, S2_x, S3_x, S1_y, S2_y, S3_y, scaler_from_training_data = split_training_data(training_data)

In [102]:
S1_y.sum()

79

In [103]:
S2_y.sum()

11

In [104]:
S3_y.sum()

22

In [76]:
76+13+23

112

In [79]:
training_data['Target_t'].sum()

112

In [86]:
89/112

0.7946428571428571

In [90]:
11/89

0.12359550561797752

In [82]:
23/112

0.20535714285714285

In [107]:
test_data = read_org_test_data(4, 34, 49)

In [108]:
test_data['trait_20'].sum()

1192

In [111]:
test_data['X058a_t'].sum()

500