In [None]:
"""
This notebook preprocesses the data and gets it ready to be clustered. This is done by exploring the data, getting rid of outliers, 
and converting the data to binary so that only the presence of a feature is checked. I chose to use a variance threshold for some feature reduction, 
getting rid only of features where all values were identical. If a feature is exactly the same (present or not present) for every sample of data, it 
contributes nothing but noise. I chose not to use Principal Component Analysis (PCA) because it assumes the data is continuous, not binary, which could 
cause issues when that assumption does not hold. Multiple Correspondence Analysis (MCA) would also work for dimensionality reduction, but would likely 
cause a loss of information.
"""

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Load the raw data; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/raw_data.csv")

In [None]:
def val_count(data):
    """Print the value counts of every column, one column at a time.

    Reading the output of pandas' built-in ``value_counts`` for a whole frame
    is hard, so this helper prints each column separately: first the column's
    positional number, then its value counts, then a separator line. The shape
    of the frame is printed once at the very end.

    Args:
        data (pandas.DataFrame): the dataframe whose columns are summarised
    """
    separator = "=========================="*5
    for position, column_name in enumerate(data.columns):
        print(position)
        print(data[column_name].value_counts())
        print(separator)
    print(data.shape)

In [None]:
# Inspect the value counts of the raw data before any cleaning is applied.
val_count(df)

In [None]:
# Getting rid of outliers and duplicates, then tidying a handful of
# inconsistently encoded values.

df = (
    df.drop(columns=df.columns[153:].tolist())  # dropped because these features were unnamed and VERY sparse
    .drop_duplicates()
    .drop([46341, 40941])  # outliers, removed by index label
)

# (column, value to replace, replacement) -- applied one at a time, in this order
value_fixes = [
    ("Complete", "TRUE", True),
    ("Complete", "FALSE", False),
    ("properties_trace_trace_feature", "TRUE", True),
    ("properties_inferences_outcrop_in_place", "5 - definitely in place", 5),
    ("properties_symbology_lineWidth", 2.0, "2"),
    ("properties_symbology_lineWidth", 4.0, "4"),
]
for column, old_value, new_value in value_fixes:
    df[column] = df[column].replace(old_value, new_value)

# Columns that contain no data at all carry no information.
df = df.dropna(axis="columns", how="all")
df.shape

In [None]:
# Dropped to prevent data leakage: the values of these features are grades of
# the spots, which indicates they were added at a later point in time.

grading_columns = [
    "contacts drawing",
    "contacts drawing quality",
    "classification of spots",
    "correctness of spots",
    "completeness of map",
    "unit labels",
    "images",
    "overall impression",
    "Complete",
]
df = df.drop(columns=grading_columns)

In [None]:
# Re-inspect the value counts after cleaning and after dropping the grading columns.
val_count(df)

In [None]:
def binary_simplification(df, verbose=True):
    """Convert a pandas dataframe to binary based on the presence of data.

    A cell becomes 1 when it holds a value and 0 when it is missing (NaN/None)
    or an empty string. The input dataframe is not modified.

    Args:
        df (pandas.DataFrame): a pandas dataframe of the data that needs to be
            converted to binary
        verbose (bool): when True (the default), print one progress line per
            column; pass False to keep the output quiet

    Returns:
        pandas.DataFrame: a new dataframe with the same index and columns as
        ``df`` whose values are all 0 or 1
    """
    if verbose:
        for col in df.columns:
            print(f"Converting {col} to binary")

    # Treat empty strings as missing, then flag every remaining value as
    # present. Operating on the whole frame at once avoids deep-copying the
    # input only to drop every column and rebuild the frame with concat.
    return df.replace('', np.nan).notna().astype(int)

In [None]:
# Replace every value with a 0/1 presence flag, then inspect the resulting value counts.
df = binary_simplification(df)
val_count(df)

In [None]:
# Uses a variance threshold for feature reduction: with threshold=0 only the
# features whose value is identical in 100% of the samples are removed.
# (For a binary feature where a fraction p of the samples is 1, the variance is
# p * (1 - p), which is 0 only when every sample has the same value.)

selector = VarianceThreshold(threshold=0)
selector.fit(df)  # only the fitted support mask is needed, not the transformed array

cols_idxs = selector.get_support(indices=True)  # positions of the features that are kept
df = df.iloc[:,cols_idxs]
val_count(df)


In [None]:
# Save the processed data; index=False keeps the row index out of the CSV.
df.to_csv("../data/processed_data.csv", index = False)
