<a href="https://colab.research.google.com/github/Kasaligan/Personal_methods/blob/main/autocorrelation_filter().ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#The following function removes dataframe attributes (columns) based on a Pearson correlation (R) threshold, which can be positive or negative.
#If the threshold is positive, the function removes attributes whose correlations are at or above the threshold. If it is negative, it removes attributes whose correlations are at or below the threshold.
#Attributes must be in COLUMNS.
#Requires the pandas library, and the numpy library imported as np.

def autocorrelation_filter(df, threshold=1, inplace=False, show_correlations=False, show_erased_atributes=False):
    """Remove columns of ``df`` that correlate too strongly with other columns.

    The Pearson correlation matrix of ``df`` is computed and columns are
    removed greedily, one at a time, until no pair of remaining columns has a
    correlation that reaches the threshold. At each step the column taking
    part in the largest number of offending pairs is removed; ties are broken
    by the largest sum of that column's correlations with all remaining
    columns (first column wins if the sums are equal too).

    Parameters
    ----------
    df : pandas.DataFrame
        Base dataframe; attributes must be in columns.
    threshold : float, default 1
        Pearson R threshold, in the range -1 <= threshold <= 1.
        If >= 0, a pair is offending when R >= threshold.
        If < 0, a pair is offending when R <= threshold.
    inplace : bool, default False
        False returns a filtered copy; True modifies ``df`` itself and
        returns None.
    show_correlations : bool, default False
        Print the correlation matrix (as computed, before any sign flip).
    show_erased_atributes : bool, default False
        Print the labels of the columns removed by the correlation filter.

    Returns
    -------
    pandas.DataFrame or None
        The filtered copy when ``inplace`` is False, otherwise None.
        None is also returned (after printing a message) when ``inplace`` is
        not a bool or ``threshold`` is outside [-1, 1].

    Notes
    -----
    The comparison is exact (no tolerance), so with the default threshold of
    1 a pair whose computed R is marginally below 1 due to floating-point
    rounding is not treated as offending.
    """
    # Argument validation keeps the original print-and-return-None contract.
    if not isinstance(inplace, bool):
        print('Inplace parameter must be boolean.')
        return None
    if abs(threshold) > 1:
        print('Threshold value must be between -1 and 1.')
        return None

    # Other correlation methods could be plugged in here; only Pearson is used.
    correlation_method = 'pearson'
    correlation_matrix = df.corr(method=correlation_method)

    if show_correlations:
        print('Correlation matrix: \n', correlation_matrix)

    # Work on an explicit, writable float copy so that the pandas object is
    # never mutated through ``.values`` (which may be read-only).
    labels = list(correlation_matrix.columns)
    corr = correlation_matrix.to_numpy(dtype=float, copy=True)

    # A negative threshold is handled by flipping both signs, so the rest of
    # the code only deals with the "R >= threshold" case.
    if threshold < 0:
        corr = -corr
        threshold = -threshold

    # Self-correlations must not count: zero them for the tie-break sums and
    # exclude them explicitly from the mask (0 >= threshold holds when the
    # threshold is 0, so zeroing alone is not enough).
    np.fill_diagonal(corr, 0)
    with np.errstate(invalid='ignore'):
        exceeds = corr >= threshold  # NaN correlations compare as False
    np.fill_diagonal(exceeds, False)

    remaining = list(range(len(labels)))  # positions still in the matrix
    erased_parameters = []                # labels removed, in removal order

    while remaining:
        sub_exceeds = exceeds[np.ix_(remaining, remaining)]
        # Number of offending pairs per column; it ignores the actual
        # correlation values.
        counts = sub_exceeds.sum(axis=0)
        if not counts.any():
            break

        candidates = np.flatnonzero(counts == counts.max())
        chosen = candidates[0]
        if candidates.size > 1:
            # Tie-break: sum of ALL correlations of each candidate with the
            # remaining columns (NaN skipped), first maximum wins.
            sub_corr = corr[np.ix_(remaining, remaining)]
            strength = np.nansum(sub_corr[:, candidates], axis=0)
            chosen = candidates[int(np.argmax(strength))]

        chosen_pos = remaining[chosen]
        erased_label = labels[chosen_pos]
        erased_parameters.append(erased_label)

        # Dropping is done by label, so every column sharing that label
        # leaves the matrix together with the chosen one.
        remaining = [
            pos for pos in remaining
            if pos != chosen_pos and labels[pos] != erased_label
        ]

    # Labels come from the correlation matrix, so they stay aligned with the
    # dataframe even if ``df.corr`` left some columns out.
    df_aux = df if inplace else df.copy()
    if erased_parameters:
        df_aux.drop(columns=erased_parameters, inplace=True)

    if show_erased_atributes:
        print('The following parameters have been erased: ', erased_parameters)

    if not inplace:
        return df_aux
    return None
  