# This notebook creates custom features from rule-based (conditional) logic.

# Import the cleaned, feature-selected data as 'df'

#df = pd.read_csv('path/data/processed/millionrows_Top500Destinations.csv')

### Basic Relevance Calculation
- **Conditions:**
    - `condition_1`: More than 0 clicks and no transaction (`is_trans == 0`).
    - `condition_2`: More than 0 clicks and at least one transaction (`is_trans > 0`).
- **Relevance:**
    - `1` if `condition_1` is true (click but no transaction).
    - `2` if `condition_2` is true (click and transaction).
    - `0` otherwise (no clicks).


### Price Bucket Category
- **Categories:**
    - `1` if `price_bucket` is less than or equal to 3.
    - `2` otherwise.


### Combined Click Price Bucket Relevance
- **Combination Logic:**
    - For relevance values `1` and `2`, the new feature is `3 * relevance + price_bucket_category`.
    - Keeps the original `relevance` if it’s not `1` or `2`.


### Review Rating Category
- **Categories:**
    - `0` if `review_rating` is less than or equal to 3.
    - `1` if `review_rating` is 4.
    - `2` otherwise (intended for a `review_rating` of 5; the code assigns `2` to every value that is neither `<= 3` nor `4`).


### Combined Click Review Rating Relevance
- **Combination Logic:**
    - For relevance values `1` and `2`, the new feature is `3 * relevance + review_rating_category`.
    - Keeps the original `relevance` if it’s not `1` or `2`.


### Combined Click Review Rating Review Count Relevance
- **Combination Logic:**
    - Adjusts `combined_click_reviewrating_relevance` based on review count and rating conditions:
        - Adds `2` if `review_count` > 200 and `review_rating` is 4.
        - Adds `3` if `review_count` > 200 and `review_rating` is 5.
        - Subtracts `1` if `review_count` > 200 and `review_rating` <= 3.
        - Adds `1` if `review_count` < 200 and `review_rating` is 4 or 5.
    - Keeps the combined relevance unchanged if none of the conditions are met (for example, when `review_count` is exactly 200).
    - Sets to `0` if `relevance` is `0`.

In [None]:
# Build the engineered relevance targets on `df`. Every column written here
# is derived only from num_clicks, is_trans, price_bucket, review_rating and
# review_count.

# --- Base relevance -------------------------------------------------------
# 1 = clicked without a transaction, 2 = clicked with a transaction,
# 0 = neither condition holds.
has_click = df['num_clicks'] > 0
condition_1 = has_click & (df['is_trans'] == 0)
condition_2 = has_click & (df['is_trans'] > 0)
df['relevance'] = np.select([condition_1, condition_2], [1, 2], default=0)

# Rows with relevance 1 or 2 get refined below; all other rows keep their
# relevance value as-is.
has_engagement = df['relevance'].isin([1, 2])

# --- Price bucket ---------------------------------------------------------
# Two-level price category: 1 for buckets up to 3, 2 for everything else.
df['price_bucket_category'] = np.where(df['price_bucket'] <= 3, 1, 2)
df['comb_click_pricebucket_relevance'] = np.where(
    has_engagement,
    3 * df['relevance'] + df['price_bucket_category'],
    df['relevance'],
)
# Distribution check; the result is not stored.
df['comb_click_pricebucket_relevance'].value_counts()

# --- Review rating --------------------------------------------------------
# 0 for ratings <= 3, 1 for a rating of 4, 2 for any remaining value.
df['review_rating_category'] = np.select(
    [df['review_rating'] <= 3, df['review_rating'] == 4],
    [0, 1],
    default=2,
)
df['combined_click_reviewrating_relevance'] = np.where(
    has_engagement,
    3 * df['relevance'] + df['review_rating_category'],
    df['relevance'],
)
# Distribution check; the result is not stored.
df['combined_click_reviewrating_relevance'].value_counts()

# --- Review rating adjusted by review count -------------------------------
# The four rules are mutually exclusive, so at most one offset applies per
# row; rows matching none of them get an offset of 0.
# NOTE(review): review_count == 200 matches neither `> 200` nor `< 200` and
# is therefore left unadjusted - confirm this is intended.
review_rating = df['review_rating']
has_many_reviews = df['review_count'] > 200
has_few_reviews = df['review_count'] < 200

review_count_offset = np.select(
    [
        has_many_reviews & (review_rating == 4),
        has_many_reviews & (review_rating == 5),
        has_many_reviews & (review_rating <= 3),
        has_few_reviews & ((review_rating == 4) | (review_rating == 5)),
    ],
    [2, 3, -1, 1],
    default=0,
)

# Rows with relevance 0 are forced to 0 regardless of their reviews.
df['combined_click_reviewrating_review_count_relevance'] = np.where(
    df['relevance'] != 0,
    df['combined_click_reviewrating_relevance'] + review_count_offset,
    0,
)

### Target Encoding
- **Encoding Logic:**
    - Applies target encoding to specific columns (`point_of_sale`, `geo_location_country`, `destination_id`, `prop_id`) based on different relevance features.
    - Joins the encoded data back to the original dataframe with appropriate suffixes.


In [None]:
# Encode the same categorical columns once per engineered target and join
# each encoded copy back onto `df` under a target-specific suffix.
# NOTE(review): `target_encoder` is defined outside this cell; it is assumed
# to expose fit_transform(X, y) returning a DataFrame - confirm.
encoding_columns = ['point_of_sale', 'geo_location_country', 'destination_id', 'prop_id']

# (target column the encoder is fitted against, suffix for the encoded columns)
encoding_targets = [
    ('comb_click_pricebucket_relevance', '_cpbc'),
    ('review_rating_category', '_rrc'),
    ('combined_click_reviewrating_relevance', '_ccrr'),
    ('combined_click_reviewrating_review_count_relevance', '_ccrrcr'),
]

for target_column, suffix in encoding_targets:
    # The same encoder object is refitted for every target.
    encoded_data = target_encoder.fit_transform(df[encoding_columns], df[target_column])
    df = df.join(encoded_data.add_suffix(suffix))

### Feature Selection
- **Target relevance dictionary:** Maps feature names to their suffixes.
- **Selected feature:** `combined_click_reviewrating_review_count_relevance`.


### Removing Non-Selected Features
- **Non-selected keys and values:** Identifies the keys and corresponding suffixes of features to be removed.
- **Columns to remove:** Combines both lists (keys and values) to get the final columns to remove.



In [None]:
# Engineered target column -> suffix used to name the columns encoded
# against that target.
target_relevance = dict(
    relevance='_target',
    price_bucket_category='_pbc',
    comb_click_pricebucket_relevance='_cpbc',
    review_rating_category='_rrc',
    combined_click_reviewrating_relevance='_ccrr',
    combined_click_reviewrating_review_count_relevance='_ccrrcr',
)

# The single target (and its suffix) that is kept.
selected_key = 'combined_click_reviewrating_review_count_relevance'
selected_value = target_relevance[selected_key]

# Everything except the selected target, in the mapping's original order.
non_selected = {
    key: suffix
    for key, suffix in target_relevance.items()
    if key != selected_key
}
non_selected_keys = list(non_selected)
non_selected_values = list(non_selected.values())

# The non-selected target columns themselves are removed by name.
columns_to_remove_keys = non_selected_keys

# Columns derived from a non-selected target are recognised by their suffix.
# NOTE(review): this reads `train_df`, while the feature cells operate on
# `df`; `train_df` is not defined in the visible part of the notebook -
# confirm it exists and carries the suffixed columns.
suffixes_to_drop = tuple(non_selected_values)
columns_to_remove_values = [
    col for col in train_df.columns if col.endswith(suffixes_to_drop)
]

# Final removal list: target columns first, then their encoded columns.
columns_to_remove = [*columns_to_remove_keys, *columns_to_remove_values]

columns_to_remove