In [1]:
import numpy as np
import pandas as pd
from pyod.models import hbos, loda, lof, qmcd
from pyod.models.ecod import ECOD
from pyod.models.inne import INNE
from pyod.models.lmdd import LMDD
from pyod.models.alad import ALAD
from pyod.models.cblof import CBLOF
from pyod.models.cof import COF
from pyod.models.deep_svdd import DeepSVDD
from pyod.models.hbos import HBOS
from pyod.models.loda import LODA
from pyod.models.lof import LOF
from pyod.models.qmcd import QMCD
from pyod.models.sampling import Sampling
from pyod.models.lunar import LUNAR
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler


# Initialize individual models
models = {
    #"ALAD": ALAD(),
    "CBLOF": CBLOF(),
    "COF": COF(),
    #"DeepSVDD": DeepSVDD(),
    #"HBOS": HBOS(),
    #"LODA": LODA(),
    "LOF": LOF(),
    #"QMCD": QMCD(),
    #"ECOD": ECOD(),
    #"INNE": INNE(),
    "LMDD": LMDD(),
    #"Sampling": Sampling()
    "LUNAR": LUNAR()
}

# Function for creating and training ensemble models
def create_and_train_ensemble(models, X_train):
    ensemble = {}
    for model_name, model in models.items():
        model.fit(X_train)
        ensemble[model_name] = model
    return ensemble

# Function for ensemble prediction
def ensemble_predict(ensemble, X):
    predictions = np.zeros(len(X))
    for model in ensemble.values():
        predictions += model.predict(X)  # Or use decision_function for some models
    predictions /= len(ensemble)
    return predictions

# Example usage
# Assuming X_train, X_test are defined and preprocessed accordingly
#ensemble = create_and_train_ensemble(models, X_train)
#predictions = ensemble_predict(ensemble, X_test)


In [3]:
odtable = pd.read_csv(r"updated_final_best_scores_table.csv")
odtable

Unnamed: 0,Outlier,Best F1 Score,Best F1 Columns,Best F1 Score Type,Best Accuracy Score,Best Accuracy Columns,Best Accuracy Score Type
0,China 1992 - 1995,0.16,LMDD_minmax_scale,Linear,1.0,LMDD_minmax_scale,Linear
1,Tunisia 2011 - 2020,0.588235,QMCD,Probabilistic,0.555556,QMCD,Probabilistic
2,Costa Rica 1949 - 2015,0.21374,INNE_robust_scale,Ensemble,0.297872,INNE_robust_scale,Ensemble
3,Botswana 1990 - 2006,0.084112,LOF_robust_scale,Proximity,0.529412,LOF_robust_scale,Proximity
4,United States 2004 - 2004,0.166667,HBOS_minmax_scale,Proximity,1.0,ECOD,Probabilistic
5,United States 2004 - 2011,0.158416,ALAD,Neural Network,1.0,ALAD,Neural Network
6,IND 2000 - 2015,0.171123,INNE_scale,Ensemble,1.0,QMCD,Probabilistic
7,BRA 1995 - 2006,0.066225,COF,Proximity,0.416667,COF,Proximity
8,MEX 1948 - 2008,0.077626,HBOS_scale,Proximity,0.435897,HBOS_scale,Proximity
9,HUN 2010 - 2023,0.6,LODA,Ensemble,0.9,CBLOF,Proximity


In [4]:
f1type = ['Ensemble', 'Proximity', 'Probabilistic', 'Neural Network', 'Linear']
acctype = ['Probabilistic', 'Proximity', 'Ensemble', 'Neural Network', 'Linear']
f1weight = [.3571, .2857, .1429, .1429, .0714]
accweight = [.2857, .2857, .2143, .1429, .0714]

In [5]:
from itertools import product
import pandas as pd

# Define the models and their corresponding types
model_types = {
    #"ALAD": "Neural Network",
    "CBLOF": "Proximity",
    "COF": "Proximity",
    #"DeepSVDD": "Neural Network",
    #"HBOS": "Linear",
    #"LODA": "Ensemble",
    "LOF": "Proximity",
    #"QMCD": "Probabilistic",
    #"ECOD": "Probabilistic",
    #"INNE": "Ensemble",
    "LMDD": "Linear",
    #"Sampling": "Probabilistic"
    "LUNAR": 'Graph-Based'
}

# Grouping models by their types
models_by_type = {}
for model, type_ in model_types.items():
    if type_ not in models_by_type:
        models_by_type[type_] = []
    models_by_type[type_].append(model)

# Generate all combinations, one from each type
all_combinations = product(*models_by_type.values())

# Creating a DataFrame to store these combinations
combinations_df = pd.DataFrame(all_combinations, columns=models_by_type.keys())

combinations_df  # Displaying the first few rows of the DataFrame



Unnamed: 0,Proximity,Linear,Graph-Based
0,CBLOF,LMDD,LUNAR
1,COF,LMDD,LUNAR
2,LOF,LMDD,LUNAR


In [6]:
from itertools import product
import pandas as pd
from pyod.models.alad import ALAD
from pyod.models.cblof import CBLOF  # Uncomment and import other models as needed
from pyod.models.cof import COF
from pyod.models.deep_svdd import DeepSVDD
from pyod.models.hbos import HBOS
from pyod.models.loda import LODA
from pyod.models.lof import LOF
from pyod.models.qmcd import QMCD
from pyod.models.ecod import ECOD
from pyod.models.inne import INNE
from pyod.models.lmdd import LMDD
from pyod.models.sampling import Sampling

# Instantiate the models
models = {
    #"ALAD": ALAD(),
    "CBLOF": CBLOF(),
    "COF": COF(),
    #"DeepSVDD": DeepSVDD(),
    #"HBOS": HBOS(),
    #"LODA": LODA(),
    "LOF": LOF(),
    #"QMCD": QMCD(),
    #"ECOD": ECOD(),
    #"INNE": INNE(),
    "LMDD": LMDD(),
    #"Sampling": Sampling()
    "LUNAR": LUNAR()
}

# Define the types for each model
model_types = {
    #ALAD: "Neural Network",
    CBLOF: "Proximity",
    COF: "Proximity",
    #DeepSVDD: "Neural Network",
    #HBOS: "Linear",
    #LODA: "Ensemble",
    LOF: "Proximity",
    #QMCD: "Probabilistic",
    #ECOD: "Probabilistic",
    #INNE: "Ensemble",
    LMDD: "Linear",
    #Sampling: "Probabilistic"
    LUNAR: 'Graph-Based'
}

# Grouping instantiated models by their types
models_by_type = {}
for model_name, model_instance in models.items():
    model_type = model_types[type(model_instance)]
    if model_type not in models_by_type:
        models_by_type[model_type] = []
    models_by_type[model_type].append(model_instance)

# Generate all combinations, one from each type
all_combinations = product(*models_by_type.values())

# Creating a DataFrame to store these combinations with constructor names
combinations_df_instances = pd.DataFrame(
    [[c[0], c[1], c[2]] for c in all_combinations],
    columns=models_by_type.keys()
)

#combinations_df_instances  # Displaying the first few rows of the DataFrame

combinations_df_constructors = combinations_df_instances
combinations_df_constructors


Unnamed: 0,Proximity,Linear,Graph-Based
0,"CBLOF(alpha=0.9, beta=5, check_estimator=False...","LMDD(contamination=0.1, dis_measure='aad', n_i...","LUNAR(epsilon=0.1, lr=0.001, model_type='WEIGH..."
1,"COF(contamination=0.1, method='fast', n_neighb...","LMDD(contamination=0.1, dis_measure='aad', n_i...","LUNAR(epsilon=0.1, lr=0.001, model_type='WEIGH..."
2,"LOF(algorithm='auto', contamination=0.1, leaf_...","LMDD(contamination=0.1, dis_measure='aad', n_i...","LUNAR(epsilon=0.1, lr=0.001, model_type='WEIGH..."


In [8]:
#fill missing w/ median by country
master = pd.read_csv(r"./data/testing/Master_Data_V6.csv")
master

Unnamed: 0,Country Name,Year,healthcare_spending_of_gdp,healthcare_spending_pc,fdi,natural_resources,gdp,gdp_growth,gdp_pc,gdp_ppp_pc,...,population,union_strength,peace,left_representation,education,log_gdp_pc,fdi_pc,migration_surplus,liberal_immigration_policy,cpi
0,Albania,1970,,,,,,,,,...,2135479.0,-2.114,,-3.8090,,,,11102.0,0.005199,
1,Albania,1971,,,,,,,,,...,2187853.0,-2.114,,,,,,10007.0,0.004574,
2,Albania,1972,,,,,,,,,...,2243126.0,-2.114,,,,,,8796.0,0.003921,
3,Albania,1973,,,,,,,,,...,2296752.0,-2.114,,,,,,7346.0,0.003198,
4,Albania,1974,,,,,,,,,...,2350124.0,-2.114,,-3.8090,,,,5834.0,0.002482,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5780,United States,2018,16.640944,10284.55469,2.150000e+11,0.595679,2.050000e+13,2.945385,62823.30944,62823.30944,...,326838199.0,0.237,4.0,0.5165,,11.048081,657.817846,1200796.0,0.003674,
5781,United States,2019,16.676474,10661.02832,3.160000e+11,0.557098,2.140000e+13,2.294439,65120.39466,65120.39466,...,328329953.0,0.237,,,,11.083993,962.446457,1158444.0,0.003528,
5782,United States,2020,18.815826,11702.40918,1.380000e+11,0.329506,2.110000e+13,-2.767803,63528.63430,63528.63430,...,331501080.0,0.237,,,,11.059246,416.288236,675560.0,0.002038,
5783,United States,2021,,,4.930000e+11,1.279944,2.330000e+13,5.945485,70219.47245,70219.47245,...,331893745.0,0.237,,,,11.159381,1485.415159,561580.0,0.001692,


In [227]:
def correct_nans_and_drop_country_code(X):
  # For each variable: use median of country, if all values empty then use median of dataset
  for country_code in X['Country Code'].unique():
    country_subset_mask = X['Country Code'] == country_code

    for column in X.columns:
      if column != 'Country Code':
        if (len(X[country_subset_mask][column].dropna()) > 0):
          country_subset_column_median = X.loc[country_subset_mask, column].drop(columns = ['Country Code']).median()
          X.loc[country_subset_mask, column] = X.loc[country_subset_mask, column].fillna(country_subset_column_median)
  
  X = X.drop(columns = ['Country Code'])
  return X.fillna(X.median())

# columns_with_na = master.columns[master.isna().any()].tolist()
# for column in columns_with_na:
#     master[column] = master.groupby('Country Name')[column].transform(lambda x: x.fillna(x.median()))

master

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Unnamed: 0,Country Name,Year,healthcare_spending_of_gdp,healthcare_spending_pc,fdi,natural_resources,gdp,gdp_growth,gdp_pc,gdp_ppp_pc,...,population,union_strength,peace,left_representation,education,log_gdp_pc,fdi_pc,migration_surplus,liberal_immigration_policy,cpi
0,Albania,1970,6.092626,240.443390,-3.211598e+08,1.722989,4.135085e+09,3.802599,1353.392023,5646.529242,...,2135479.0,-2.114,4.5,-3.809000,0.91927,7.208963,-106.697810,11102.0,0.005199,2.5
1,Albania,1971,6.092626,240.443390,-3.211598e+08,1.722989,4.135085e+09,3.802599,1353.392023,5646.529242,...,2187853.0,-2.114,4.5,-0.155083,0.91927,7.208963,-106.697810,10007.0,0.004574,2.5
2,Albania,1972,6.092626,240.443390,-3.211598e+08,1.722989,4.135085e+09,3.802599,1353.392023,5646.529242,...,2243126.0,-2.114,4.5,-0.155083,0.91927,7.208963,-106.697810,8796.0,0.003921,2.5
3,Albania,1973,6.092626,240.443390,-3.211598e+08,1.722989,4.135085e+09,3.802599,1353.392023,5646.529242,...,2296752.0,-2.114,4.5,-0.155083,0.91927,7.208963,-106.697810,7346.0,0.003198,2.5
4,Albania,1974,6.092626,240.443390,-3.211598e+08,1.722989,4.135085e+09,3.802599,1353.392023,5646.529242,...,2350124.0,-2.114,4.5,-3.809000,0.91927,7.208963,-106.697810,5834.0,0.002482,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5780,United States,2018,16.640944,10284.554690,2.150000e+11,0.595679,2.050000e+13,2.945385,62823.309440,62823.309440,...,326838199.0,0.237,4.0,0.516500,,11.048081,657.817846,1200796.0,0.003674,7.6
5781,United States,2019,16.676474,10661.028320,3.160000e+11,0.557098,2.140000e+13,2.294439,65120.394660,65120.394660,...,328329953.0,0.237,2.0,0.516500,,11.083993,962.446457,1158444.0,0.003528,7.6
5782,United States,2020,18.815826,11702.409180,1.380000e+11,0.329506,2.110000e+13,-2.767803,63528.634300,63528.634300,...,331501080.0,0.237,2.0,0.516500,,11.059246,416.288236,675560.0,0.002038,7.6
5783,United States,2021,16.116940,7832.266602,4.930000e+11,1.279944,2.330000e+13,5.945485,70219.472450,70219.472450,...,331893745.0,0.237,2.0,0.516500,,11.159381,1485.415159,561580.0,0.001692,7.6


In [9]:
outliers = pd.read_csv(r"./data/outliers_filtered.csv")
outliers

Unnamed: 0,country,country_code,independent,dependent,outlying_start_year,outlying_end_year
0,China,CHN,democracy,fdi_pc,1992,1995
1,Tunisia,TUN,is_mena,democracy,2011,2020
2,Costa Rica,CRI,is_latam,log_gdp_pc; fdi_pc,1949,2015
3,Botswana,BWA,cpi; natural_resources,gdp_growth,1990,2006
4,United States of America,USA,healthcare_spending_pc,disability_adjusted_life_years,2004,2004
5,United States of America,USA,log_gdp_pc,union_strength; left_representation,2004,2011
6,India,IND,female_literacy; is_mena,female_workforce_participation,2000,2015
7,Brazil,BRA,democracy,military_spending_of_gdp,1995,2006
8,Mexico,MEX,democracy,human_rights,1948,2008
9,Hungary,HUN,is_eu,democracy,2010,2023


In [229]:
rows_as_arrays = []
for index, row in combinations_df_constructors.iterrows():
    row_array = row.values
    rows_as_arrays.append(row_array)

In [230]:
def no_scaler(X):
    return X

In [231]:
scalers = [
    no_scaler,
    MinMaxScaler(),
    StandardScaler(),
    MaxAbsScaler(),
    RobustScaler()
]


In [232]:
weights = [.34, .33, .33]

for h in range(0, len(outliers['country'])):
    
    print(outliers['independent'][h])
    print(outliers['dependent'][h])
    if(';' in outliers['independent'][h]):
        ind = outliers['independent'][h].split('; ') 
    else:
        ind = [outliers['independent'][h]]
    if(';' in outliers['dependent'][h]):
        dep = outliers['dependent'][h].split('; ') 
    else:
        dep = [outliers['dependent'][h]]

    #print(ind)
    #print(dep)
    columns = ['Country Name', 'Year', 'Country Code'] + ind + dep
    X_original = master[columns]
    print(columns)
    X = master[columns]

    # Need to do country-wise medians
    X = correct_nans_and_drop_country_code(X)

    for scaler in scalers:
        accuracylist = []
        additionalo = []
        print(scaler)
        # Apply the current scaler
        X = X_original.copy()

        if ind[0] == 'is_mena':
            X = X[X['is_mena'] == 1]
            # Need to drop geographic vars after subsetting on them
            X = X.drop(columns = ['is_mena'])
        if ind[0] == 'is_latam':
            X = X[X['is_latam'] == 1]
            X = X.drop(columns = ['is_latam'])
        if ind[0] == 'is_eu':
            X = X[X['is_eu'] == 1]
            X = X.drop(columns = ['is_eu'])

        X = X.dropna()
        if scaler != no_scaler:
            X[dep] = scaler.fit_transform(X[dep])

        #print(len(rows_as_arrays))
        for i in range(0, len(rows_as_arrays)):
            score = []
            result = 0
            for j in range(0, len(rows_as_arrays[i])):
                model = rows_as_arrays[i][j]
                print("Model {0}: {1}".format(str(j + 1), type(model)))

                features = X[dep].to_numpy()
                if features.ndim == 1:
                    features = features.reshape(-1, 1)

                features = features.astype(np.float64)
                #print(features)
                scores = model.fit_predict(features)
                score.append(weights[j] * scores)
            final = []
            #print(score)
            df = pd.DataFrame(score)
            df_transposed = df.T    
            #print(df_transposed)
            row_sums = df_transposed.sum(axis=1)
            row_sums_list = row_sums.tolist()
            #print(row_sums_list)
            for k in row_sums_list:
                if k >= 0.5:
                    result = 1
                final.append(result)
            #print(final)
            X['Predicted'] = final
            start_year = outliers['outlying_start_year'][h]
            end_year = outliers['outlying_end_year'][h]
            country = outliers['country_code'][h]

            years_list = list(range(start_year, end_year + 1))
            actual = []
            for index, row in X.iterrows():
                if row['Year'] in years_list and row['Country Code'] == outliers['country_code'][h]:
                    actual.append(1)
                else:
                    actual.append(0)
            X['Actual'] = actual
            #X.to_csv('Xtestoutput.csv')
            #print(X['Actual'].value_counts())
            #print(X['Predicted'].value_counts())

            filtered_X = X[(X['Country Code'] == country) & (X['Year'] >= start_year) & (X['Year'] <= end_year)]

            # Calculate accuracy
            correct_predictions = (filtered_X['Actual'] == filtered_X['Predicted']).sum()
            total_predictions = len(filtered_X)
            accuracy = correct_predictions / total_predictions if total_predictions else 0
            print(accuracy)

            noutliers = X[(X['Country Code'] == country) & ((X['Year'] < start_year) | (X['Year'] > end_year)) & (X['Predicted'] == 1)].shape[0]
            print(noutliers)

            accuracylist.append(accuracy)
            additionalo.append(noutliers)
        combinations_df_constructors['{0} {1}-{2} {3}: Accuracy'.format(country, start_year, end_year, str(scaler))] = accuracylist
        combinations_df_constructors['{0} {1}-{2} {3}: Additional Outliers'.format(country, start_year, end_year, str(scaler))] = additionalo



        #print(result)
    
        #APPEND result TO X, and CALC SCORES
    

#print(final)
#add all to df/calc f1 and accuracy
#scores = (scores == -1).astype(int)  # Convert to 0 (inlier) and 1 (outlier)





democracy
fdi_pc
['Country Name', 'Year', 'Country Code', 'democracy', 'fdi_pc']
<function no_scaler at 0x000001CCE40340E0>
Model 1: <class 'pyod.models.cblof.CBLOF'>
Model 2: <class 'pyod.models.lmdd.LMDD'>


  super()._check_params_vs_input(X, default_n_init=10)


Model 3: <class 'pyod.models.lunar.LUNAR'>




1.0
46
Model 1: <class 'pyod.models.cof.COF'>


  cof_.append((ac_dist[_g] * self.n_neighbors_) /
  cof_.append((ac_dist[_g] * self.n_neighbors_) /
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)


Model 2: <class 'pyod.models.lmdd.LMDD'>
Model 3: <class 'pyod.models.lunar.LUNAR'>




1.0
46
Model 1: <class 'pyod.models.lof.LOF'>
Model 2: <class 'pyod.models.lmdd.LMDD'>




Model 3: <class 'pyod.models.lunar.LUNAR'>




1.0
46
MinMaxScaler()
Model 1: <class 'pyod.models.cblof.CBLOF'>
Model 2: <class 'pyod.models.lmdd.LMDD'>


  super()._check_params_vs_input(X, default_n_init=10)


Model 3: <class 'pyod.models.lunar.LUNAR'>




In [None]:
combinations_df_constructors

Unnamed: 0,Proximity,Linear,Graph-Based,CHN 1992-1995: Accuracy,CHN 1992-1995: Additional Outliers,TUN 2011-2020: Accuracy,TUN 2011-2020: Additional Outliers,CRI 1949-2015: Accuracy,CRI 1949-2015: Additional Outliers,BWA 1990-2006: Accuracy,...,HUN 2010-2023: Accuracy,HUN 2010-2023: Additional Outliers,SGP 1965-2005: Accuracy,SGP 1965-2005: Additional Outliers,TZA 1992-2020: Accuracy,TZA 1992-2020: Additional Outliers,COL 2000-2016: Accuracy,COL 2000-2016: Additional Outliers,NOR 1970-2013: Accuracy,NOR 1970-2013: Additional Outliers
0,"CBLOF(alpha=0.9, beta=5, check_estimator=False...","LMDD(contamination=0.1, dis_measure='aad', n_i...","LUNAR(contamination=0.1, epsilon=0.1, lr=0.001...",1.0,46,1.0,41,1.0,4,1.0,...,1.0,40,1.0,14,1.0,22,1.0,33,1.0,6
1,"COF(contamination=0.1, method='fast', n_neighb...","LMDD(contamination=0.1, dis_measure='aad', n_i...","LUNAR(contamination=0.1, epsilon=0.1, lr=0.001...",1.0,46,1.0,41,1.0,4,1.0,...,1.0,40,1.0,14,1.0,22,1.0,33,1.0,6
2,"LOF(algorithm='auto', contamination=0.1, leaf_...","LMDD(contamination=0.1, dis_measure='aad', n_i...","LUNAR(contamination=0.1, epsilon=0.1, lr=0.001...",1.0,46,1.0,41,1.0,4,1.0,...,1.0,40,1.0,14,1.0,22,1.0,33,1.0,6
