In [3]:
import pandas as pd
# Load the training set from train.csv (path is relative to the working directory).
train_df = pd.read_csv("train.csv")

In [4]:
# Preview the first rows of the training frame.
train_df.head()


Unnamed: 0,Age,latitude,longitude,Ethnicity,Diabetic,Religion,Height,last_name
0,46,43.469852,-109.502869,Italian,Yes,Deism,Short,Hill
1,41,26.040271,-115.130319,Japanese,Yes,Islam,Short,Hill
2,27,33.345173,-72.874953,Japanese,Yes,Druidism,Average,Williams
3,61,37.97199,-97.647942,Mexican,Yes,Paganism,Average,Wilson
4,20,28.489021,-99.225687,Mexican,Yes,Zoroastrianism,Average,Allen


In [5]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class Top80PercentEncoder(BaseEstimator, TransformerMixin):
    """Integer-encode the most frequent categories of one column and pool the rest.

    ``fit`` ranks the categories of ``column`` by frequency and keeps the longest
    prefix of that ranking whose cumulative share of the counted rows is at most
    ``top_percent``. The category that pushes the cumulative share *above* the
    threshold is not kept, so when the single most frequent category already
    exceeds ``top_percent`` no category is kept at all.

    Kept categories are encoded as ``1, 2, ...`` in rank order. Every other
    category seen during ``fit``, and every value first met in ``transform``,
    is encoded as ``other_value``.

    Parameters
    ----------
    column : label
        Name of the DataFrame column to encode.
    top_percent : int or float, default=80
        Maximum cumulative percentage of rows the kept categories may cover.
    other_value : scalar, default=0
        Code used for every category that is not kept.

    Attributes
    ----------
    encoding_map : dict
        Category -> code, filled in by ``fit``.
    decoding_map : dict
        Code -> category, with ``other_value`` mapped to the string ``'Other'``.
    """

    def __init__(self, column, top_percent=80, other_value=0):
        self.column = column
        self.top_percent = top_percent
        self.other_value = other_value
        # Populated by fit(). Starting empty means transform() on an unfitted
        # encoder maps every value to other_value instead of raising.
        self.encoding_map = {}
        self.decoding_map = {}

    def fit(self, X, y=None):
        """Learn the category -> code mapping from ``X[self.column]``.

        ``y`` is accepted for pipeline compatibility and ignored. Returns ``self``.
        """
        category_counts = X[self.column].value_counts(ascending=False)
        total_count = category_counts.sum()

        # Walk the categories from most to least frequent and stop at the first
        # one whose cumulative share exceeds the threshold. The test is written
        # as `count * 100 > top_percent * total` rather than
        # `(count / total) * 100 > top_percent`: dividing first introduces
        # rounding error (e.g. 55 / 100 * 100 == 55.00000000000001), which would
        # wrongly reject a category that lands exactly on the threshold.
        top_categories = []
        running_count = 0
        for category, count in category_counts.items():
            running_count += count
            if running_count * 100 > self.top_percent * total_count:
                break
            top_categories.append(category)

        # Kept categories get codes 1..k in frequency order.
        # NOTE(review): if other_value falls inside 1..k the two meanings share
        # one code and decoding returns 'Other' for it; pick other_value outside
        # that range.
        self.encoding_map = {
            category: code for code, category in enumerate(top_categories, start=1)
        }

        # The kept categories are a prefix of the frequency ranking, so the
        # pooled ones are simply the remainder of the index (no membership scan).
        pooled_categories = category_counts.index[len(top_categories):]
        self.encoding_map.update(
            {category: self.other_value for category in pooled_categories}
        )

        # Reverse mapping for decoding; every pooled category decodes to 'Other'.
        self.decoding_map = {
            code: category for code, category in enumerate(top_categories, start=1)
        }
        self.decoding_map[self.other_value] = 'Other'

        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.column`` replaced by its codes.

        Values missing from ``encoding_map`` (i.e. unseen during ``fit``) are
        encoded as ``other_value``.
        """
        X_copy = X.copy()
        X_copy[self.column] = X_copy[self.column].apply(
            lambda x: self.encoding_map.get(x, self.other_value)  # unseen -> other_value
        )
        return X_copy

    def inverse_transform(self, X):
        """Return a copy of ``X`` with codes in ``self.column`` mapped back to labels.

        ``other_value`` decodes to the string ``'Other'``; the original pooled
        categories cannot be recovered.
        """
        X_copy = X.copy()
        X_copy[self.column] = X_copy[self.column].map(self.decoding_map)
        return X_copy



In [6]:
# Single-step pipeline around the custom encoder for the 'last_name' column.
pipeline = Pipeline([
    (
        'top_80_encoder',
        Top80PercentEncoder(column='last_name', top_percent=80, other_value=0),
    ),
])

# Learn the surname -> code mapping from the training frame.
pipeline.fit(train_df)

# Small test frame; it includes surnames such as "Neogi" and "Ghosh" that are
# expected to be absent from the training data.
test_data = pd.DataFrame({
    'last_name': [
        "Hill", 'Srivastava', "Lee", "King", 'Kumar', 'Srivastava',
        'Sharma', 'Singh', 'Sharma', "Williams", 'Kumar', 'Patel',
        "Baker", 'Singh', 'Srivastava', 'Gupta', "martin", 'Patel',
        'Gupta', 'Agarwal', "Neogi", "Ghosh",
    ]
})

# Encode: surnames the encoder does not know fall back to other_value.
encoded_test_data = pipeline.transform(test_data)
print("Encoded Test Data:\n", encoded_test_data)

# Decode the codes back to labels using the fitted encoder step.
decoded_test_data = pipeline.named_steps['top_80_encoder'].inverse_transform(
    encoded_test_data
)
print("\nDecoded Test Data:\n", decoded_test_data)

Encoded Test Data:
     last_name
0           0
1           0
2           1
3           0
4           0
5           0
6           0
7           0
8           0
9           8
10          0
11          0
12         22
13          0
14          0
15          0
16          0
17          0
18          0
19          0
20          0
21          0

Decoded Test Data:
    last_name
0      Other
1      Other
2        Lee
3      Other
4      Other
5      Other
6      Other
7      Other
8      Other
9   Williams
10     Other
11     Other
12     Baker
13     Other
14     Other
15     Other
16     Other
17     Other
18     Other
19     Other
20     Other
21     Other


In [9]:
# Rich display of the encoded test frame produced above.
encoded_test_data

Unnamed: 0,last_name
0,0
1,0
2,1
3,0
4,0
5,0
6,0
7,0
8,0
9,8


## Data preparation part

## Last name separation transformer

In [10]:
class NameSplitter(BaseEstimator, TransformerMixin):
    """Replace a full-name column with a column holding only the last name.

    The last name is the final whitespace-separated token of the full name.
    Non-string entries (e.g. missing values) and whitespace-only strings are
    passed through unchanged.

    Parameters
    ----------
    full_name_column : label, default='Name'
        Column containing the full names; it is dropped from the output.
    last_name_column : label, default='last_name'
        Column that receives the extracted last names.
    """

    def __init__(self, full_name_column='Name', last_name_column='last_name'):
        self.full_name_column = full_name_column
        self.last_name_column = last_name_column

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the last-name column added and the full-name column removed.

        Raises
        ------
        ValueError
            If ``full_name_column`` is not a column of ``X``.
        """
        # pd.DataFrame(X) does not copy when X is already a DataFrame, so take an
        # explicit copy: the column assignment below must not leak into the
        # caller's frame (or trigger SettingWithCopy warnings on a slice).
        X = pd.DataFrame(X).copy()

        # Check if the full_name_column exists in the DataFrame
        if self.full_name_column not in X.columns:
            raise ValueError(f"Column '{self.full_name_column}' not found in DataFrame")

        # Keep only the last token of each full name, then drop the source column.
        X[self.last_name_column] = X[self.full_name_column].apply(self._last_token)
        X = X.drop(columns=[self.full_name_column])

        return X

    @staticmethod
    def _last_token(name):
        """Return the last whitespace-separated token of ``name``.

        Splitting once and always taking the final token means a single-word
        name with surrounding whitespace (" Hill ") is normalised to "Hill",
        consistent with how multi-word names are handled.
        """
        if not isinstance(name, str):
            return name
        tokens = name.split()
        return tokens[-1] if tokens else name

In [11]:
# CSV file name of the training data variant used from here on.
data_with_name_file_name = "train_data_with_names.csv"


# Read that file into a DataFrame.
data_with_names = pd.read_csv(data_with_name_file_name)



In [12]:
# Display the loaded frame (the notebook renders a truncated head/tail view).
data_with_names

Unnamed: 0,Age,latitude,longitude,Ethnicity,Diabetic,Religion,Name,Height
0,46,43.469852,-109.502869,Italian,Yes,Deism,Kenneth Hill,177
1,41,26.040271,-115.130319,Japanese,Yes,Islam,Michelle Hill,198
2,27,33.345173,-72.874953,Japanese,Yes,Druidism,Timothy Williams,193
3,61,37.971990,-97.647942,Mexican,Yes,Paganism,Susan Wilson,174
4,20,28.489021,-99.225687,Mexican,Yes,Zoroastrianism,Nathaniel Allen,179
...,...,...,...,...,...,...,...,...
12551,59,36.825366,-121.365944,Vietnamese,No,Baháʼí,Philip Lee,192
12552,29,24.715601,-96.274830,Vietnamese,No,Christianity,Ronald Smith,196
12553,51,48.788724,-105.698921,French,Yes,Islam,Justin Martin,173
12554,61,34.234611,-107.452929,Indian,No,Deism,Katie Martin,182


In [13]:
# List the distinct values present in the Religion column.
data_with_names.Religion.unique()

array(['Deism', 'Islam', 'Druidism', 'Paganism', 'Zoroastrianism',
       'Atheism', 'Hinduism', 'Baháʼí', 'Sikhism', 'Buddhism',
       'Unitarianism', 'Confucianism', 'Jainism', 'Pastafarianism',
       'Shinto', 'Christianity', 'Agnostic', 'Judaism', 'Rastafarianism',
       'Taoism'], dtype=object)

In [15]:
from sklearn.pipeline import Pipeline


from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import category_encoders as ce

# --- Column groups -----------------------------------------------------------
numerical_features = ['Age', "Height"]           # scaled numeric columns
geo_features = ['latitude', 'longitude']         # geographic coordinates
binary_categorical_features = ["Diabetic"]       # categorical column(s) for binary encoding

# --- Numeric columns: standardise ---------------------------------------------
numerical_pipeline = Pipeline([('scaler', StandardScaler())])

# --- Name: extract the last name, then frequency-encode it -------------------
last_name_pipeline = Pipeline([
    ('name_splitter', NameSplitter(full_name_column='Name', last_name_column='last_name')),
    ('last_name_encoding', Top80PercentEncoder(column='last_name', top_percent=80, other_value=0)),
])

# --- Ethnicity: frequency-based integer encoding -------------------------------
ethnicity_pipeline = Pipeline([
    ('ethnicity_encoding', Top80PercentEncoder(column='Ethnicity', top_percent=80, other_value=0)),
])

# --- Religion: frequency-based integer encoding --------------------------------
religion_pipeline = Pipeline([
    ('religion_encoding', Top80PercentEncoder(column='Religion', top_percent=80, other_value=0)),
])

# --- Latitude / longitude: standardise ----------------------------------------
# (A Haversine-distance step was sketched here but is not part of the pipeline.)
geo_pipeline = Pipeline([('scaler', StandardScaler())])

# --- Categorical column(s) listed above: binary encoding ----------------------
binary_encoding_pipeline = Pipeline([
    ('encoder', ce.BinaryEncoder(cols=binary_categorical_features)),
])


In [16]:
# Show the input frame again before it is fed to the combined transformer.
data_with_names

Unnamed: 0,Age,latitude,longitude,Ethnicity,Diabetic,Religion,Name,Height
0,46,43.469852,-109.502869,Italian,Yes,Deism,Kenneth Hill,177
1,41,26.040271,-115.130319,Japanese,Yes,Islam,Michelle Hill,198
2,27,33.345173,-72.874953,Japanese,Yes,Druidism,Timothy Williams,193
3,61,37.971990,-97.647942,Mexican,Yes,Paganism,Susan Wilson,174
4,20,28.489021,-99.225687,Mexican,Yes,Zoroastrianism,Nathaniel Allen,179
...,...,...,...,...,...,...,...,...
12551,59,36.825366,-121.365944,Vietnamese,No,Baháʼí,Philip Lee,192
12552,29,24.715601,-96.274830,Vietnamese,No,Christianity,Ronald Smith,196
12553,51,48.788724,-105.698921,French,Yes,Islam,Justin Martin,173
12554,61,34.234611,-107.452929,Indian,No,Deism,Katie Martin,182


In [17]:
# ColumnTransformer is not imported by any visible earlier cell, so import it
# here; without this the cell raises NameError on a fresh kernel.
from sklearn.compose import ColumnTransformer

# Combine all per-feature pipelines. Each entry is (name, transformer, columns);
# the outputs are concatenated column-wise in the order listed.
full_pipeline = ColumnTransformer(transformers=[
    ('numerical_pipeline', numerical_pipeline, numerical_features),  # Scale Age and Height
    ('last_name_pipeline', last_name_pipeline, ['Name']),  # Split Name into last_name, then encode
    ('ethnicity_pipeline', ethnicity_pipeline, ['Ethnicity']),  # Encode Ethnicity
    ('religion_pipeline', religion_pipeline, ['Religion']),  # Encode Religion
    ('geo_pipeline', geo_pipeline, geo_features),  # Scale latitude and longitude
    ("binary_category_pipeline", binary_encoding_pipeline, binary_categorical_features),  # Binary-encode Diabetic
])

# Fit every sub-pipeline on the named dataset and transform it in one pass.
transformed_data = full_pipeline.fit_transform(data_with_names)

print("Transformed Data:\n", transformed_data)


Transformed Data:
 [[ 0.31185193  0.16161026  0.         ... -0.80026648  0.
   1.        ]
 [ 0.01680161  1.60522934  0.         ... -1.13473298  0.
   1.        ]
 [-0.80933931  1.26151051  8.         ...  1.37670729  0.
   1.        ]
 ...
 [ 0.60690226 -0.1133648   3.         ... -0.57417949  0.
   1.        ]
 [ 1.19700291  0.50532909  3.         ... -0.67842868  1.
   0.        ]
 [-0.57329904  0.78030415  1.         ... -1.67081945  1.
   0.        ]]


In [18]:
# Check the (rows, columns) dimensions of the transformed array.
transformed_data.shape

(12556, 9)

In [19]:
# Render the fitted ColumnTransformer.
full_pipeline

In [2]:
# Render the ColumnTransformer again; the cell that defines full_pipeline must
# have been run in the current kernel session first, otherwise this raises NameError.
full_pipeline

NameError: name 'full_pipeline' is not defined