A collection of helper functions for the processing steps I run routinely.

In [None]:
# test function
def add_a_b(a,b):
    """Return ``a + b``; a smoke-test helper to confirm this file loaded."""
    total = a + b
    return total

In [None]:
# a function for transforming positive skew data to normal distribution, this is done before t-tests. https://machinelearningmastery.com/skewness-be-gone-transformative-tricks-for-data-scientists/

def transform_to_normal(df,name_of_data_column_to_transform,n_quantiles=500): # distances df, column name with distance measure
    """Quantile-transform one column of ``df`` towards a normal distribution.

    The transformed values are written to a new column named ``'qt_' + <column>``
    and then shifted upwards by the absolute value of their minimum so that,
    for the usual case of a negative minimum, the smallest value becomes 0.

    Note that a quantile transformation reduces the influence of outliers,
    which may not always be desirable.

    Args:
        df: DataFrame holding the data; it is modified in place (a column is added).
        name_of_data_column_to_transform: Name of the column to transform.
        n_quantiles: Number of quantiles handed to ``QuantileTransformer``.
            Previously this argument was accepted but ignored (500 was always used).

    Returns:
        The same ``df`` object, with the extra ``qt_<column>`` column.
    """
    from sklearn.preprocessing import QuantileTransformer

    name = name_of_data_column_to_transform
    new_name = 'qt_' + name

    # random_state is fixed so repeated runs give identical results
    qt = QuantileTransformer(output_distribution='normal', n_quantiles=n_quantiles, random_state=0)

    # sklearn expects a 2-D (n_samples, 1) input; flatten the result back to 1-D
    X = df[[name]].to_numpy()
    df[new_name] = qt.fit_transform(X).ravel()

    # shift so that there are no negative values; Series.min() skips NaN,
    # unlike the builtin min() whose result depends on where NaNs sit
    min_val = abs(df[new_name].min())
    df[new_name] = df[new_name] + min_val

    return df

In [None]:
# a function for min-max normalizing the data, this is done before t-tests

def min_max_normalize(df,name_of_data_column_to_normalize,publisher=None): # distances df, column name with distance measure
    """Min-max normalize a column separately within each discipline.

    Rows with a missing value in the data column are dropped, discipline
    information (``main_concept``) is merged on via ``id`` == ``journal_id``,
    rows without a discipline are dropped, and the data column is rescaled to
    ``(value - min) / (max - min)`` using the min/max of the row's discipline.
    Values that are not strictly positive are assigned 0, as are all values of
    a discipline whose min equals its max (the ratio is undefined there).

    Args:
        df: DataFrame with an ``id`` column and the data column. It is NOT
            modified; all work happens on a copy.
        name_of_data_column_to_normalize: Name of the column to normalize.
        publisher: Optional DataFrame with ``journal_id`` and ``main_concept``
            columns. When omitted, the project's ``df_publisher.csv`` is read.

    Returns:
        A new DataFrame (fresh 0..n-1 index) containing the merged columns plus
        ``normalized_dst``.
    """
    import pandas as pd

    name = name_of_data_column_to_normalize

    # read in discipline information
    if publisher is None:
        publisher = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/citation_stacking/data/df_publisher.csv')

    # drop rows with no value in the data column (on a copy, so the caller's df is untouched)
    n_missing = int(df[name].isna().sum())
    print(str(n_missing)+' nan values in your data column were dropped')
    distances = df.dropna(axis=0, subset=[name]).copy()

    # merge discipline info onto distances df
    check1 = len(distances)
    distances['id'] = distances['id'].astype(int)
    distances = distances.merge(publisher[['journal_id','main_concept']], left_on='id', right_on='journal_id', how='inner')
    check2 = len(distances)
    if check1 != check2:
        print('error: the length of your df changed')

    # also check for na discipline values and drop these
    n_missing_concept = int(distances['main_concept'].isna().sum())
    print(str(n_missing_concept)+' nan discipline values were dropped')
    distances = distances.dropna(axis=0, subset=['main_concept']).reset_index(drop=True)

    # normalize by discipline, vectorised: broadcast each discipline's min/max to its rows
    values = distances[name]
    per_discipline = distances.groupby('main_concept')[name]
    lowest = per_discipline.transform('min')
    span = per_discipline.transform('max') - lowest

    # span == 0 would divide by zero; mask it out and fall back to 0 below
    scaled = (values - lowest) / span.where(span != 0)
    usable = (values > 0) & (span != 0)
    distances['normalized_dst'] = scaled.where(usable, 0.0)

    print('your normalized data is stored in the normalized_dst column')

    return distances

In [None]:
# function for comparing distance distributions by njr ranking

def compare_njr(distances,dst_name):
    """Compare ``normalized_dst`` distributions across Norwegian Journal Ranking levels.

    Reads the NJR spreadsheet, joins it to ``distances`` on the OpenAlex journal
    id, draws overlaid histograms for levels 2, 1 and 0 (level 1 is randomly
    down-sampled to the size of level 2), saves the figure as
    ``njr_dst_distributions.png`` and shows it, then runs pairwise independent
    t-tests between all levels.

    Args:
        distances: DataFrame with at least ``normalized_dst`` and ``journal_id`` columns.
        dst_name: Name of the distance measure; only used in the title and x-label.

    Returns:
        A pandas ``Styler`` over the table of p-values; cells with p <= 0.05 are
        green, everything else (including undefined/NaN p-values) is red.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from scipy.stats import ttest_ind

    level_col = 'Level 2022'

    # norwegian journal ranking: https://kanalregister.hkdir.no/publiseringskanaler/Om
    njr = pd.read_excel('/content/drive/MyDrive/CitationsProject/validation_data/Validation_data_with_openalex/openalex_NorwegianJournalRank.xlsx')
    njr = njr[[level_col,'journal_openalex_id']]
    distances_merge = distances[['normalized_dst','journal_id']]
    ttest_df = njr.merge(distances_merge,left_on='journal_openalex_id',right_on='journal_id')

    def _values_for(level):
        # normalized distances of every journal at the given level
        return ttest_df.loc[ttest_df[level_col]==level,'normalized_dst']

    # level 2 journals are top; level 1 is down-sampled to the same size for plotting.
    # The sample size is capped at the group size so a small level-1 group cannot raise.
    data1 = _values_for(2).to_numpy()
    level_one = _values_for(1)
    data2 = level_one.sample(n=min(len(data1),len(level_one)),random_state=1).to_numpy()
    data3 = _values_for(0).to_numpy()

    plotted = [('level 2',data1,[1,0,0,0.5]),
               ('level 1',data2,[0,0,1,0.5]),
               ('level 0',data3,[0,1,0,0.5])]
    for _, values, rgba in plotted:
        plt.hist(values,bins=100,color=rgba)
    plt.legend([label+', mean='+str(round(np.mean(values),2)) for label, values, _ in plotted])

    plt.title('Distributions of '+dst_name+' Distance by NJR Rank Categories')
    plt.ylabel('number of journals')
    plt.xlabel('mean '+dst_name+' incoming citation length')
    plt.savefig('njr_dst_distributions.png')

    plt.show()

    # function for coloring cells differently for significant results;
    # NaN compares False, so an undefined p-value is never shown as significant
    def _color_red_or_green(val):
        color = 'green' if val <= 0.05 else 'red'
        return 'color: %s' % color

    # table of p-values, pre-filled with NaN so every cell holds a float
    labels = ['Level 2','Level 1','Level 0']
    display_df = pd.DataFrame(np.nan,index=labels,columns=labels)

    levels = [0,1,2]
    groups = {level: _values_for(level).dropna() for level in levels}

    for i in levels:

        # n counts all merged journals at this level, including those with a missing distance
        print('Level '+str(i)+', n = '+str(len(_values_for(i))))

        for j in levels:
            stat, p = ttest_ind(groups[i], groups[j])
            display_df.at['Level '+str(i),'Level '+str(j)] = p

    print('p values:')

    # Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever exists
    styler = display_df.style
    apply_elementwise = getattr(styler,'map',None) or styler.applymap
    return apply_elementwise(_color_red_or_green)

In [None]:
# function for comparing distance distributions by jufo ranking

def compare_jufo(distances,dst_name):
    """Compare ``normalized_dst`` distributions across JUFO ranking levels.

    Reads the JUFO spreadsheet, joins it to ``distances`` on the OpenAlex journal
    id, draws overlaid histograms for levels 3, 2, 1 and 0 (level 1 is randomly
    down-sampled to the size of level 2), saves the figure as
    ``jufo_dst_distributions.png`` and shows it, then runs pairwise independent
    t-tests between all levels.

    Args:
        distances: DataFrame with at least ``normalized_dst`` and ``journal_id`` columns.
        dst_name: Name of the distance measure; only used in the title and x-label.

    Returns:
        A pandas ``Styler`` over the table of p-values; cells with p <= 0.05 are
        green, everything else (including undefined/NaN p-values) is red.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from scipy.stats import ttest_ind

    level_col = 'TASO/LEVEL/NIVÅ'

    # Finnish journal ranking (JUFO):
    jufo = pd.read_excel('/content/drive/MyDrive/CitationsProject/validation_data/Validation_data_with_openalex/JUFO_Journal_Rank.xlsx')
    jufo = jufo[[level_col,'journal_openalex_id']]
    distances_merge = distances[['normalized_dst','journal_id']]
    ttest_df = jufo.merge(distances_merge,left_on='journal_openalex_id',right_on='journal_id')

    def _values_for(level):
        # normalized distances of every journal at the given level
        return ttest_df.loc[ttest_df[level_col]==level,'normalized_dst']

    # level 1 is down-sampled to the size of level 2 for plotting; the sample
    # size is capped at the group size so a small level-1 group cannot raise
    data2 = _values_for(2).to_numpy()
    level_one = _values_for(1)
    data1 = level_one.sample(n=min(len(data2),len(level_one)),random_state=1).to_numpy()
    data0 = _values_for(0).to_numpy()
    data3 = _values_for(3).to_numpy()

    plotted = [('level 3',data3,[0,0,0,0.5]),
               ('level 2',data2,[1,0,0,0.5]),
               ('level 1',data1,[0,0,1,0.5]),
               ('level 0',data0,[0,1,0,0.5])]
    for _, values, rgba in plotted:
        plt.hist(values,bins=100,color=rgba)
    plt.legend([label+', mean='+str(round(np.mean(values),2)) for label, values, _ in plotted])

    plt.title('Distributions of '+dst_name+' Distance by JUFO Rank Categories')
    plt.ylabel('number of journals')
    plt.xlabel('mean '+dst_name+' incoming citation length')
    plt.savefig('jufo_dst_distributions.png')

    plt.show()

    # function for coloring cells differently for significant results;
    # NaN compares False, so an undefined p-value is never shown as significant
    def _color_red_or_green(val):
        color = 'green' if val <= 0.05 else 'red'
        return 'color: %s' % color

    # table of p-values, pre-filled with NaN so every cell holds a float
    labels = ['Level 3','Level 2','Level 1','Level 0']
    display_df = pd.DataFrame(np.nan,index=labels,columns=labels)

    levels = [0,1,2,3]
    groups = {level: _values_for(level).dropna() for level in levels}

    for i in levels:

        # n counts all merged journals at this level, including those with a missing distance
        print('Level '+str(i)+', n = '+str(len(_values_for(i))))

        for j in levels:
            stat, p = ttest_ind(groups[i], groups[j])
            display_df.at['Level '+str(i),'Level '+str(j)] = p

    print('p values:')

    # Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever exists
    styler = display_df.style
    apply_elementwise = getattr(styler,'map',None) or styler.applymap
    return apply_elementwise(_color_red_or_green)

In [None]:
# function for comparing distance distributions by scimago ranking

def compare_scimago(distances,dst_name):
    """Compare ``normalized_dst`` distributions across Scimago (SJR) quartiles.

    Reads the Scimago spreadsheet, keeps one row per ``journal_openalex_id``,
    joins it to ``distances``, draws overlaid histograms for Q1-Q4 (Q1 and Q2
    are randomly down-sampled to the size of Q3), saves the figure as
    ``sjr_dst_distributions.png`` and shows it, then runs pairwise independent
    t-tests between all quartiles.

    Args:
        distances: DataFrame with at least ``normalized_dst`` and ``journal_id`` columns.
        dst_name: Name of the distance measure; only used in the title and x-label.

    Returns:
        A pandas ``Styler`` over the table of p-values; cells with p <= 0.05 are
        green, everything else (including undefined/NaN p-values) is red.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from scipy.stats import ttest_ind

    quartile_col = 'SJR Best Quartile'
    quartiles = ['Q1','Q2','Q3','Q4']

    # scimago journal ranking:
    scimago = pd.read_excel('/content/drive/MyDrive/CitationsProject/validation_data/Validation_data_with_openalex/openalex_scimagojr_2022_RAW.xlsx')
    # TODO: the source data should be cleaned so that this de-duplication is not needed
    scimago = scimago.drop_duplicates(subset='journal_openalex_id')

    scimago = scimago[[quartile_col,'journal_openalex_id']]
    distances_merge = distances[['normalized_dst','journal_id']]
    ttest_df = scimago.merge(distances_merge,left_on='journal_openalex_id',right_on='journal_id')

    def _values_for(quartile):
        # normalized distances of every journal in the given quartile
        return ttest_df.loc[ttest_df[quartile_col]==quartile,'normalized_dst']

    def _sample_up_to(values, n):
        # reproducible random sample, capped at the group size so it cannot raise
        return values.sample(n=min(n,len(values)),random_state=1)

    # Q1 and Q2 are down-sampled to the size of Q3 for plotting
    data3 = _values_for('Q3').to_numpy()
    data1 = _sample_up_to(_values_for('Q1'),len(data3)).to_numpy()
    data2 = _sample_up_to(_values_for('Q2'),len(data3)).to_numpy()
    data4 = _values_for('Q4').to_numpy()

    plotted = [('Q1',data1,[1,0,0,0.5]),
               ('Q2',data2,[0,0,1,0.5]),
               ('Q3',data3,[0,1,0,0.5]),
               ('Q4',data4,[0,0,0,0.5])]
    for _, values, rgba in plotted:
        plt.hist(values,bins=100,color=rgba)
    plt.legend([label+', mean='+str(round(np.mean(values),2)) for label, values, _ in plotted])

    plt.title('Distributions of '+dst_name+' Distance by SJR Quartiles')
    plt.ylabel('number of journals')
    plt.xlabel('mean '+dst_name+' incoming citation length')
    plt.savefig('sjr_dst_distributions.png')

    plt.show()

    # function for coloring cells differently for significant results;
    # NaN compares False, so an undefined p-value is never shown as significant
    def _color_red_or_green(val):
        color = 'green' if val <= 0.05 else 'red'
        return 'color: %s' % color

    # table of p-values, pre-filled with NaN so every cell holds a float
    display_df = pd.DataFrame(np.nan,index=quartiles,columns=quartiles)

    groups = {q: _values_for(q).dropna() for q in quartiles}

    for i in quartiles:

        # n counts all merged journals in this quartile, including those with a missing distance
        print(i+', n = '+str(len(_values_for(i))))

        for j in quartiles:
            stat, p = ttest_ind(groups[i], groups[j])
            display_df.at[i,j] = p

    print('p values:')

    # Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever exists
    styler = display_df.style
    apply_elementwise = getattr(styler,'map',None) or styler.applymap
    return apply_elementwise(_color_red_or_green)

In [None]:
# function for comparing distance distributions by jcr_suppressed journals

def compare_jcr_suppressed(distances,dst_name):
    """Compare ``normalized_dst`` of JCR-suppressed journals against all other journals.

    Reads the list of JCR-suppressed journals, right-merges it onto ``distances``
    with a merge indicator, draws overlaid histograms for suppressed vs. normal
    journals (normal journals are randomly down-sampled to the number of
    suppressed ones), saves the figure as ``jcr_suppressed_dst_distributions.png``
    and shows it, then runs pairwise independent t-tests.

    Args:
        distances: DataFrame with at least ``normalized_dst`` and ``journal_id`` columns.
        dst_name: Name of the distance measure; only used in the title and x-label.

    Returns:
        A pandas ``Styler`` over the 2x2 table of p-values (suppressed / normal);
        cells with p <= 0.05 are green, everything else (including NaN) is red.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from scipy.stats import ttest_ind

    # jcr_suppressed journal list:
    jcr_suppressed = pd.read_json('/content/drive/MyDrive/CitationsProject/validation_data/Validation_data_with_openalex/JCR_suppressed_journal_openalex.json')
    jcr_suppressed['suppressed'] = 1

    # one row per journal, otherwise the merge below would repeat that journal's distance
    jcr_suppressed = jcr_suppressed[['suppressed','journal_id']].drop_duplicates(subset='journal_id')
    distances_merge = distances[['normalized_dst','journal_id']]
    ttest_df = jcr_suppressed.merge(distances_merge,on='journal_id',how='right',indicator=True)
    print(len(ttest_df))

    # suppressed journals are merged with the 'both' indicator while normal ones
    # only exist in the right-hand (distances) frame and get 'right_only'
    group_names = {'both':'suppressed','right_only':'normal'}

    def _values_for(indicator):
        # normalized distances of every journal carrying the given merge indicator
        return ttest_df.loc[ttest_df['_merge']==indicator,'normalized_dst']

    data1 = _values_for('both').to_numpy()
    normal = _values_for('right_only')
    # sample size is capped at the group size so it cannot raise
    data0 = normal.sample(n=min(len(data1),len(normal)),random_state=1).to_numpy()

    plt.hist(data1,bins=50,color=[1,0,0,0.5])
    plt.hist(data0,bins=50,color=[0,0,1,0.5])
    plt.legend(['suppressed, mean='+str(round(np.mean(data1),2)), 'normal, mean='+str(round(np.mean(data0),2))])

    plt.title('Distributions of '+dst_name+' for JCR-Suppressed and normal journals')
    plt.ylabel('number of journals')
    plt.xlabel('mean '+dst_name+' incoming citation length')
    plt.savefig('jcr_suppressed_dst_distributions.png')
    plt.show()

    # function for coloring cells differently for significant results;
    # NaN compares False, so an undefined p-value is never shown as significant
    def _color_red_or_green(val):
        color = 'green' if val <= 0.05 else 'red'
        return 'color: %s' % color

    # table of p-values, labelled directly with the human-readable group names
    labels = list(group_names.values())
    display_df = pd.DataFrame(np.nan,index=labels,columns=labels)

    groups = {indicator: _values_for(indicator).dropna() for indicator in group_names}

    for i in group_names:

        # print n, number of items in t-test comparison groups (before dropping missing distances)
        print(group_names[i]+', n = '+str(len(_values_for(i))))

        for j in group_names:
            stat, p = ttest_ind(groups[i], groups[j])
            display_df.at[group_names[i],group_names[j]] = p

    print('p values:')

    # Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever exists
    styler = display_df.style
    apply_elementwise = getattr(styler,'map',None) or styler.applymap
    return apply_elementwise(_color_red_or_green)

In [None]:
# function for comparing distance distributions by jcr_suppressed journals, supplemented with our own calculated list of suppressed journals

def compare_jcr_suppressed_supplemental(distances,dst_name):
    """Like ``compare_jcr_suppressed`` but with the supplemental suppressed list added.

    The official JCR-suppressed journal ids are unioned with the ids stored in
    ``JCR_suppressed_supplemental.pkl``; each journal id appears exactly once in
    the combined list. The combined list is right-merged onto ``distances``,
    histograms of suppressed vs. normal journals are drawn (normal journals are
    randomly down-sampled to the number of suppressed ones), the figure is saved
    as ``jcr_suppressed_dst_distributions.png`` and shown, and pairwise
    independent t-tests are run.

    NOTE(review): the figure file name is the same one ``compare_jcr_suppressed``
    writes, so running both overwrites the earlier image.

    Args:
        distances: DataFrame with at least ``normalized_dst`` and ``journal_id`` columns.
        dst_name: Name of the distance measure; only used in the title and x-label.

    Returns:
        A pandas ``Styler`` over the 2x2 table of p-values (suppressed / normal);
        cells with p <= 0.05 are green, everything else (including NaN) is red.
    """
    import pickle

    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from scipy.stats import ttest_ind

    # jcr_suppressed journal list:
    jcr_suppressed = pd.read_json('/content/drive/MyDrive/CitationsProject/validation_data/Validation_data_with_openalex/JCR_suppressed_journal_openalex.json')

    # ADD in our calculated JCR suppressed too
    # NOTE: pickle.load executes arbitrary code; only ever point this at a trusted project file
    with open('/content/drive/MyDrive/CitationsProject/validation_data/Validation_data_with_openalex/JCR_suppressed_supplemental.pkl', 'rb') as f:
        jcr_suplemental = pickle.load(f)

    # union of both id lists, duplicates removed
    all_jcr = set(jcr_suppressed['journal_id']) | set(jcr_suplemental)
    print(len(all_jcr))

    # The combined set already contains every official id, so it replaces the
    # official frame outright. Concatenating the two (as was done before) listed
    # each official journal twice and double-counted it after the merge below.
    jcr_suppressed = pd.DataFrame({'journal_id':list(all_jcr)})
    jcr_suppressed['suppressed'] = 1

    # DONE adding

    distances_merge = distances[['normalized_dst','journal_id']]
    ttest_df = jcr_suppressed.merge(distances_merge,on='journal_id',how='right',indicator=True)
    print(len(ttest_df))

    # suppressed journals are merged with the 'both' indicator while normal ones
    # only exist in the right-hand (distances) frame and get 'right_only'
    group_names = {'both':'suppressed','right_only':'normal'}

    def _values_for(indicator):
        # normalized distances of every journal carrying the given merge indicator
        return ttest_df.loc[ttest_df['_merge']==indicator,'normalized_dst']

    data1 = _values_for('both').to_numpy()
    normal = _values_for('right_only')
    # sample size is capped at the group size so it cannot raise
    data0 = normal.sample(n=min(len(data1),len(normal)),random_state=1).to_numpy()

    plt.hist(data1,bins=50,color=[1,0,0,0.5])
    plt.hist(data0,bins=50,color=[0,0,1,0.5])
    plt.legend(['suppressed, mean='+str(round(np.mean(data1),2)), 'normal, mean='+str(round(np.mean(data0),2))])

    plt.title('Distributions of '+dst_name+' for JCR-Suppressed and normal journals')
    plt.ylabel('number of journals')
    plt.xlabel('mean '+dst_name+' incoming citation length')
    plt.savefig('jcr_suppressed_dst_distributions.png')
    plt.show()

    # function for coloring cells differently for significant results;
    # NaN compares False, so an undefined p-value is never shown as significant
    def _color_red_or_green(val):
        color = 'green' if val <= 0.05 else 'red'
        return 'color: %s' % color

    # table of p-values, labelled directly with the human-readable group names
    labels = list(group_names.values())
    display_df = pd.DataFrame(np.nan,index=labels,columns=labels)

    groups = {indicator: _values_for(indicator).dropna() for indicator in group_names}

    for i in group_names:

        # print n, number of items in t-test comparison groups (before dropping missing distances)
        print(group_names[i]+', n = '+str(len(_values_for(i))))

        for j in group_names:
            stat, p = ttest_ind(groups[i], groups[j])
            display_df.at[group_names[i],group_names[j]] = p

    print('p values:')

    # Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever exists
    styler = display_df.style
    apply_elementwise = getattr(styler,'map',None) or styler.applymap
    return apply_elementwise(_color_red_or_green)

In [None]:
# function for comparing distance distributions by cidre anomalous

def compare_cidre(distances,dst_name):
    """Compare ``normalized_dst`` of CIDRE-anomalous journals against CIDRE-normal ones.

    Reads the CIDRE results, right-merges them onto ``distances``, draws overlaid
    histograms for journals flagged ``anomalous == 1`` vs. ``anomalous == 0``
    (the latter randomly down-sampled to the number of anomalous journals),
    saves the figure as ``cidre_anomalous_dst_distributions.png`` and shows it,
    then runs pairwise independent t-tests.

    Journals in ``distances`` that do not appear in the CIDRE file get a missing
    ``anomalous`` value from the right merge and therefore fall in neither group.

    Args:
        distances: DataFrame with at least ``normalized_dst`` and ``journal_id`` columns.
        dst_name: Name of the distance measure; only used in the title and x-label.

    Returns:
        A pandas ``Styler`` over the 2x2 table of p-values (anomalous / normal);
        cells with p <= 0.05 are green, everything else (including NaN) is red.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from scipy.stats import ttest_ind

    # CIDRE anomalous-journal classification:
    cidre = pd.read_json('/content/drive/MyDrive/CitationsProject/validation_data/Validation_data_with_openalex/cidre_anomalous_009.json') # 1275

    cidre = cidre[['anomalous','journal_id']]
    distances_merge = distances[['normalized_dst','journal_id']]
    ttest_df = cidre.merge(distances_merge,on='journal_id',how='right',indicator=True)
    print(len(ttest_df))

    # groups are defined by the 'anomalous' flag itself (1 = anomalous, 0 = normal)
    group_names = {1:'anomalous',0:'normal'}

    def _values_for(flag):
        # normalized distances of every journal with the given anomalous flag
        return ttest_df.loc[ttest_df['anomalous']==flag,'normalized_dst']

    data1 = _values_for(1).to_numpy()
    normal = _values_for(0)
    # sample size is capped at the group size so it cannot raise
    data0 = normal.sample(n=min(len(data1),len(normal)),random_state=1).to_numpy()

    plt.hist(data1,bins=50,color=[1,0,0,0.5])
    plt.hist(data0,bins=50,color=[0,0,1,0.5])
    plt.legend(['anomalous, mean='+str(round(np.mean(data1),2)), 'normal, mean='+str(round(np.mean(data0),2))])

    plt.title('Distributions of '+dst_name+' for CIDRE-identified anomalous journals')
    plt.ylabel('number of journals')
    plt.xlabel('mean '+dst_name+' incoming citation length')
    plt.savefig('cidre_anomalous_dst_distributions.png')
    plt.show()

    # function for coloring cells differently for significant results;
    # NaN compares False, so an undefined p-value is never shown as significant
    def _color_red_or_green(val):
        color = 'green' if val <= 0.05 else 'red'
        return 'color: %s' % color

    # table of p-values, labelled directly with the human-readable group names
    labels = list(group_names.values())
    display_df = pd.DataFrame(np.nan,index=labels,columns=labels)

    groups = {flag: _values_for(flag).dropna() for flag in group_names}

    for i in [0,1]:

        # print n, number of items in t-test comparison groups (before dropping missing distances)
        print(group_names[i]+', n = '+str(len(_values_for(i))))

        for j in [0,1]:
            stat, p = ttest_ind(groups[i], groups[j])
            display_df.at[group_names[i],group_names[j]] = p

    print('p values:')

    # Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever exists
    styler = display_df.style
    apply_elementwise = getattr(styler,'map',None) or styler.applymap
    return apply_elementwise(_color_red_or_green)

In [None]:
# jif comparison function

def jif_comparison(distances):
    """Plot ROC curves comparing plain JIF and weighted JIF on three validation sets.

    Loads the calculated JIF table and the NJR, JUFO and Scimago rankings,
    inner-joins all of them with ``distances`` on the journal id (so only
    journals present in every table remain), drops rows with any missing value,
    and for each validation set treats the lowest category as the negative class
    and the highest as the positive class. For each set one figure with both ROC
    curves is saved as ``<name>_roc_comparisons.png`` and shown.

    Args:
        distances: DataFrame with at least ``weighted_JIF`` and ``id`` columns.

    Returns:
        None. The function only prints progress and produces figures.
    """
    from sklearn import metrics
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np

    validation_dir = '/content/drive/MyDrive/CitationsProject/validation_data/Validation_data_with_openalex/'

    # load jif data
    print('loading jif data')
    jif = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/citation_stacking/data/validation_data/our_calculated_jif.csv')
    # the saved index column is only present if the csv was written with index=True
    jif = jif.drop(columns='Unnamed: 0',errors='ignore')

    # load validation data
    print('loading njr data')
    njr = pd.read_excel(validation_dir+'openalex_NorwegianJournalRank.xlsx')
    njr = njr[['Level 2022','journal_openalex_id']]

    print('loading jufo data')
    jufo = pd.read_excel(validation_dir+'JUFO_Journal_Rank.xlsx')
    jufo = jufo[['TASO/LEVEL/NIVÅ','journal_openalex_id']]

    print('loading scimago data')
    scimago = pd.read_excel(validation_dir+'openalex_scimagojr_2022_RAW.xlsx')
    scimago = scimago.drop_duplicates(subset='journal_openalex_id')
    scimago = scimago[['SJR Best Quartile','journal_openalex_id']]

    # merge validation data with jif and distances data
    distances_merge = distances[['weighted_JIF','id']]

    roc_df = njr.merge(distances_merge,left_on='journal_openalex_id',right_on='id')
    roc_df = jufo.merge(roc_df,left_on='journal_openalex_id',right_on='id')
    roc_df = scimago.merge(roc_df,left_on='journal_openalex_id',right_on='id')
    roc_df = jif.merge(roc_df,left_on='journal_id',right_on='id')
    roc_df = roc_df.dropna()

    def _roc(score_col, is_lower, is_upper):
        # ROC curve and AUC for separating the upper (positive) from the lower (negative) class
        negatives = roc_df.loc[is_lower,score_col]
        positives = roc_df.loc[is_upper,score_col]
        label = np.concatenate([np.zeros(len(negatives)),np.ones(len(positives))]).astype(bool)
        y_pred = np.concatenate([negatives,positives])
        fper, tper, _ = metrics.roc_curve(label,y_pred)
        return fper, tper, metrics.auc(fper,tper)

    # function for plotting roc curve
    def plot_roc_cur(fper, tper, ax, color):
        ax.plot(fper, tper, color=color, label='ROC')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')

    # compare AUC for identifying normal vs excellent journals
    validation_sets = [('Scimago','SJR Best Quartile','Q1','Q4'),
                       ('JUFO','TASO/LEVEL/NIVÅ',3,0),
                       ('NJR','Level 2022',2,0)]

    for v_name,col_name,upper,lower in validation_sets:
        is_lower = roc_df[col_name]==lower
        is_upper = roc_df[col_name]==upper

        # a ROC curve needs samples from both classes; skip instead of failing midway
        if not is_lower.any() or not is_upper.any():
            print('skipping '+v_name+': need journals in both the '+str(lower)+' and '+str(upper)+' groups')
            continue

        # simple JIF
        fpr, tpr, chance_to_beat = _roc('jif',is_lower,is_upper)

        # weighted jif
        fpr_weighted, tpr_weighted, chance_to_beat_weighted = _roc('weighted_JIF',is_lower,is_upper)

        fig, ax = plt.subplots()
        plot_roc_cur(fpr, tpr, ax, color='red')
        plot_roc_cur(fpr_weighted, tpr_weighted, ax, color='green')

        plt.title('ROC - '+v_name+' validation set')
        plt.tight_layout()
        # explicit legend entries replace the generic 'ROC' line labels
        plt.legend(['JIF, AUC: '+str(round(chance_to_beat,3)),'weighted, AUC: '+str(round(chance_to_beat_weighted,3))])
        plt.savefig(v_name+'_roc_comparisons.png')
        plt.show()
