In [2]:
df = pd.read_csv('fairjob.csv')

In [6]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors."""
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

def merge_correlated_features(data, threshold=0.9):
    """Merge columns with correlation above the threshold by averaging their values."""
    corr_matrix = data.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    merged_features = {}
    for col in upper_tri.columns:
        highly_corr = [col]
        for row in upper_tri.index:
            if upper_tri.loc[row, col] > threshold:
                highly_corr.append(row)

        # Check if the columns exist before accessing them
        highly_corr = [col for col in highly_corr if col in data.columns]

        if len(highly_corr) > 1:
            new_col = "_".join(sorted(highly_corr))
            data[new_col] = data[highly_corr].mean(axis=1)
            merged_features[new_col] = highly_corr
            data.drop(columns=highly_corr, inplace=True, errors='ignore')

    return data, merged_features


def pca(data, dimention):
    """Perform PCA and return transformed data and directions."""
    standardized_data = (data - data.mean(axis=0)) / data.std(axis=0)
    covariance_matrix = np.cov(standardized_data, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]
    top_eigenvectors = eigenvectors[:, :dimention]
    principal_components = np.dot(standardized_data, top_eigenvectors)
    return pd.DataFrame(principal_components), top_eigenvectors

def findNumPCA(data):
    """Determine the optimal number of PCA dimensions based on explained variance."""
    standardized_data = (data - data.mean(axis=0)) / data.std(axis=0)
    covariance_matrix = np.cov(standardized_data, rowvar=False)
    eigenvalues, _ = np.linalg.eigh(covariance_matrix)
    eigenvalues = np.sort(eigenvalues)[::-1]
    explained_variance = eigenvalues / eigenvalues.sum()
    cumulative_variance = np.cumsum(explained_variance)
    optimal_dims = np.argmax(cumulative_variance >= 0.95) + 1
    return optimal_dims

def calculate_explained_variance(data):
    """Calculate explained variance for PCA dimensions."""
    standardized_data = (data - data.mean(axis=0)) / data.std(axis=0)
    covariance_matrix = np.cov(standardized_data, rowvar=False)
    eigenvalues, _ = np.linalg.eigh(covariance_matrix)
    eigenvalues = np.sort(eigenvalues)[::-1]
    explained_variance = eigenvalues / eigenvalues.sum()
    return explained_variance, np.cumsum(explained_variance)

def visualize_explained_variance(explained_variance):
    """Plot the explained variance and cumulative explained variance."""
    cumulative_variance = np.cumsum(explained_variance)
    plt.figure(figsize=(8, 6))
    plt.plot(cumulative_variance, marker='o', label="Cumulative Explained Variance")
    plt.axhline(y=0.95, color='r', linestyle='--', label="95% Variance Threshold")
    plt.xlabel("Number of Components")
    plt.ylabel("Cumulative Explained Variance")
    plt.legend()
    plt.show()

def perform_pca(df, feature_groups, dimensions):
    """Apply PCA on given feature groups."""
    results = {}
    directions = {}

    for feature_type, features in feature_groups.items():
        dim = dimensions[feature_type]
        pca_result, directions[feature_type] = pca(df[features], dimention=dim)
        results[feature_type] = pca_result.add_prefix(f'{feature_type}_PC')

    combined_pca = pd.concat(results.values(), axis=1)
    df_result = pd.concat([df.reset_index(drop=True), combined_pca], axis=1)
    df_result.drop(columns=sum(feature_groups.values(), []), inplace=True, errors='ignore')
    return df_result, directions

def split_data(df, stratify_col, test_size=0.1, random_state=42):
    """Perform stratified splits."""
    np.random.seed(random_state)
    stratify_values = df[stratify_col].unique()
    stratified_indices = {value: df[df[stratify_col] == value].index.tolist() for value in stratify_values}

    train_indices, dev_indices, test_indices = [], [], []

    for value, indices in stratified_indices.items():
        np.random.shuffle(indices)
        n_test = int(len(indices) * test_size)
        n_dev = n_test
        test_indices.extend(indices[:n_test])
        dev_indices.extend(indices[n_test: n_test + n_dev])
        train_indices.extend(indices[n_test + n_dev:])

    train = df.loc[train_indices]
    dev = df.loc[dev_indices]
    test = df.loc[test_indices]

    return train, dev, test

def process_and_save_splits(df, split_key, stratify_col, feature_groups, output_dir, directions_dict, y_columns):
    """Process splits, merge correlated features, apply PCA, and save results."""
    train, dev, test = split_data(df, stratify_col=stratify_col)

    # Select numeric columns for correlation merging
    numeric_cols = train.select_dtypes(include=np.number).columns.tolist()
    train_numeric = train[numeric_cols]

    train_numeric, merged_features = merge_correlated_features(train_numeric)
    train.update(train_numeric)  # Update original train DataFrame with merged numeric features

    dimensions = {key: findNumPCA(train[features]) for key, features in feature_groups.items()}
    train_pca, directions = perform_pca(train, feature_groups, dimensions)

    directions_dict[split_key] = directions

    for dataset, name in zip([train_pca, dev, test], ['train', 'dev', 'test']):
        # Merge y_columns back
        y_data = dataset[y_columns]
        dataset = dataset.drop(columns=y_columns, errors='ignore')
        dataset = pd.concat([dataset, y_data], axis=1)

        dataset.to_csv(f"{output_dir}/{name}_{split_key}.csv", index=False)


def stratified_split_and_save_and_pca(df, output_dir="output", y_columns=['click']):
    """Main function to perform stratified splits, merge correlated features, PCA, and save results."""
    np.random.seed(42)
    directions_dict = {}

    feature_groups = {
        'user_cat': [f'cat{i}' for i in range(0, 6)],
        'product_cat': [f'cat{i}' for i in range(6, 13)],
        'num': [f'num{i}' for i in range(16, 51)],
    }

    df['gender'] = df['protected_attribute'].apply(lambda x: 'male' if x > 0 else 'female')
    df['rank_category'] = df['rank'].apply(lambda x: 'above_10' if x > 10 else 'below_10')

    for gender in ['male', 'female']:
        gender_df = df[df['gender'] == gender]
        process_and_save_splits(gender_df, f"{gender}", 'click', feature_groups, output_dir, directions_dict, y_columns)

    for click_status, click_df in {'clicked': df[df['click'] == 1], 'not_clicked': df[df['click'] == 0]}.items():
        process_and_save_splits(click_df, f"{click_status}", 'gender', feature_groups, output_dir, directions_dict, y_columns)

    for rank_category in ['above_10', 'below_10']:
        rank_df = df[df['rank_category'] == rank_category]
        process_and_save_splits(rank_df, f"rank_{rank_category}", 'click', feature_groups, output_dir, directions_dict, y_columns)

    for split1_key, split2_key in [
        ('male', 'female'),
        ('clicked', 'not_clicked'),
        ('above_10', 'below_10')
    ]:
        print(f"\nCalculating cosine similarities between {split1_key} and {split2_key} splits:")
        calculate_similarity_between_splits(directions_dict, split1_key, split2_key)

    print("All datasets processed and saved.")
    return directions_dict


def calculate_similarity_between_splits(directions_dict, split1_key, split2_key):
    """Calculate cosine similarity between the PCA directions of two splits."""
    similarities = {}

    if split1_key not in directions_dict or split2_key not in directions_dict:
        print(f"Missing data for splits {split1_key} or {split2_key}")
        return

    directions_split1 = directions_dict[split1_key]
    directions_split2 = directions_dict[split2_key]

    for feature_type in directions_split1.keys():
        pca1 = directions_split1[feature_type]
        pca2 = directions_split2[feature_type]

        min_len = min(pca1.shape[0], pca2.shape[0])

        if pca1.ndim > 1:
            pca1 = pca1[:min_len, 0]
        if pca2.ndim > 1:
            pca2 = pca2[:min_len, 0]

        pca1 = pca1.flatten()
        pca2 = pca2.flatten()

        similarity = cosine_similarity(pca1, pca2)
        similarities[feature_type] = similarity

    for feature_type, similarity in similarities.items():
        print(f"Cosine similarity for {feature_type}: {similarity}")

    return similarities

def print_dataset_sizes(output_dir="output"):
    """Print sizes of datasets in the output directory."""
    size_summary = []
    files = [f for f in os.listdir(output_dir) if f.endswith(".csv")]

    for file in files:
        file_path = os.path.join(output_dir, file)
        data = pd.read_csv(file_path)
        size_summary.append((file, len(data), data.shape[1]))

    print("Dataset Sizes:")
    for name, rows, cols in size_summary:
        print(f"{name}: {rows} rows, {cols} columns")

output_directory = "splits"
os.makedirs(output_directory, exist_ok=True)

# Assuming df is already defined or loaded
stratified_split_and_save_and_pca(df, output_directory)
print_dataset_sizes(output_dir=output_directory)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=highly_corr, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: h

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=highly_corr, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: h

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=highly_corr, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: h

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=highly_corr, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=highly_corr, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d


Calculating cosine similarities between male and female splits:
Cosine similarity for user_cat: 0.9996361842878647
Cosine similarity for product_cat: 0.9999587693361679
Cosine similarity for num: 0.9921272609241362

Calculating cosine similarities between clicked and not_clicked splits:
Cosine similarity for user_cat: -0.9984991269863779
Cosine similarity for product_cat: -0.999055426973865
Cosine similarity for num: -0.9180649346735784

Calculating cosine similarities between above_10 and below_10 splits:
Missing data for splits above_10 or below_10
All datasets processed and saved.
Dataset Sizes:
train_not_clicked.csv: 851791 rows, 41 columns
test_male.csv: 53610 rows, 58 columns
test_rank_below_10.csv: 87307 rows, 58 columns
train_clicked.csv: 5993 rows, 42 columns
train_male.csv: 428893 rows, 42 columns
dev_not_clicked.csv: 106473 rows, 58 columns
train_female.csv: 428891 rows, 41 columns
train_rank_below_10.csv: 698468 rows, 41 columns
test_female.csv: 53611 rows, 58 columns
dev_

In [7]:
def print_dataset_sizes_and_similarities(output_dir="output", directions_dict=None):
    """Print sizes of datasets in the output directory and the similarities between the splits."""
    size_summary = []
    files = [f for f in os.listdir(output_dir) if f.endswith(".csv")]

    for file in files:
        file_path = os.path.join(output_dir, file)
        data = pd.read_csv(file_path)
        size_summary.append((file, len(data), data.shape[1]))

    # Print dataset sizes
    print("Dataset Sizes:")
    for name, rows, cols in size_summary:
        print(f"{name}: {rows} rows, {cols} columns")

    # If directions_dict is provided, calculate and print cosine similarities between splits
    if directions_dict:
        print("\nCalculating cosine similarities between splits:")
        for split1_key, split2_key in [
            ('male', 'female'),
            ('clicked', 'not_clicked'),
            ('above_10', 'below_10')
        ]:
            calculate_similarity_between_splits(directions_dict, split1_key, split2_key)

output_directory = "splits"
os.makedirs(output_directory, exist_ok=True)

# Assuming df is already defined or loaded
directions_dict = stratified_split_and_save_and_pca(df, output_directory)

# Now, call the new function to print both dataset sizes and similarities
print_dataset_sizes_and_similarities(output_dir=output_directory, directions_dict=directions_dict)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=highly_corr, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: h

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=highly_corr, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=highly_corr, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=highly_corr, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: h

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=highly_corr, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[new_col] = data[highly_corr].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: h


Calculating cosine similarities between male and female splits:
Cosine similarity for user_cat: 0.9996361842878647
Cosine similarity for product_cat: 0.9999587693361679
Cosine similarity for num: 0.9921272609241362

Calculating cosine similarities between clicked and not_clicked splits:
Cosine similarity for user_cat: -0.9984991269863779
Cosine similarity for product_cat: -0.999055426973865
Cosine similarity for num: -0.9180649346735784

Calculating cosine similarities between above_10 and below_10 splits:
Missing data for splits above_10 or below_10
All datasets processed and saved.
Dataset Sizes:
train_not_clicked.csv: 851791 rows, 41 columns
test_male.csv: 53610 rows, 58 columns
test_rank_below_10.csv: 87307 rows, 58 columns
train_clicked.csv: 5993 rows, 42 columns
train_male.csv: 428893 rows, 42 columns
dev_not_clicked.csv: 106473 rows, 58 columns
train_female.csv: 428891 rows, 41 columns
train_rank_below_10.csv: 698468 rows, 41 columns
test_female.csv: 53611 rows, 58 columns
dev_