In [None]:
# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.





# Question 6: Combining Feature Scaling Techniques
# Task: Demonstrate combining Min-Max Scaling and Standardization for the same dataset and explain the results.





# Question 7: Handling Multiple Categorical Features
# Task: Handle multiple categorical features ('Sex', 'Embarked') from the Titanic dataset using One-Hot Encoding.




# Question 8: Ordinal Encoding for Ranked Categories
# Task: Ordinal encode 'Pclass' (Passenger class) from the Titanic dataset considering passenger class as a ranked feature.





# Question 9: Impact of Scaling on Different Algorithms
# Task: Investigate the impact of different scaling techniques on a decision tree model and compare it with a SVM.



# Question 10: Custom Transformations for Categorical Features
# Task: Implement a custom transformation function for encoding high cardinality categorical features efficiently.


import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Read the Titanic passenger table straight from the hosted CSV into a DataFrame.
# Every later section reads from (and adds columns to) this `titanic` frame.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic = pd.read_csv(filepath_or_buffer=url)

# Question 5: Label Encoding vs One-Hot Encoding on 'Sex'
# Label encoding: each distinct value becomes one integer code (codes follow the
# sorted order of the class labels), stored in a single new column.
le = LabelEncoder()
titanic['Sex_label'] = le.fit_transform(titanic['Sex'])

# One-hot encoding: each category becomes its own 0/1 indicator column;
# drop='if_binary' keeps only one indicator when the feature has two categories.
# The encoder is left at its default sparse output and densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and removed in 1.4, so this form works on old and new
# releases alike.
ohe = OneHotEncoder(drop='if_binary')
sex_ohe = ohe.fit_transform(titanic[['Sex']]).toarray()
df_sex_ohe = pd.DataFrame(sex_ohe, columns=ohe.get_feature_names_out(['Sex']))

print("Label Encoding (first 5):")
print(titanic[['Sex', 'Sex_label']].head())
print("\nOne-Hot Encoding (first 5):")
print(df_sex_ohe.head(), "\n")

# Question 6: Combining Min-Max Scaling and Standardization on numeric features ('Age', 'Fare')
num_cols = ['Age', 'Fare']
df_numeric = titanic[num_cols].copy()

# Fill missing Age with median for demonstration.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the selected column: that chained in-place form is deprecated in pandas 2.x
# and leaves the frame unchanged when copy-on-write is enabled.
df_numeric['Age'] = df_numeric['Age'].fillna(df_numeric['Age'].median())

# Step 1: rescale each column linearly onto [0, 1].
scaler_minmax = MinMaxScaler()
scaled_minmax = scaler_minmax.fit_transform(df_numeric)

# Step 2: standardize the min-max output to zero mean and unit variance.
# Explanation of the result: min-max scaling is a per-column linear map
# (x - min) / (max - min), and standardization is invariant to such a map, so
# the combined output equals what StandardScaler alone would give on the raw
# columns (up to floating-point rounding). Chaining the two adds no information.
scaler_standard = StandardScaler()
scaled_standard = scaler_standard.fit_transform(scaled_minmax)

df_combined_scaled = pd.DataFrame(scaled_standard, columns=num_cols)
print("Combined Min-Max Scaling then Standardization (first 5 rows):")
print(df_combined_scaled.head(), "\n")

# Question 7: One-Hot Encoding 'Sex' and 'Embarked'
# Give missing ports an explicit 'Missing' category. The result is assigned back
# rather than using fillna(inplace=True) on the selected column, which is
# deprecated in pandas 2.x and does not modify the frame under copy-on-write.
titanic['Embarked'] = titanic['Embarked'].fillna('Missing')

# sparse_threshold=0 forces a dense ndarray result. With the default threshold
# (0.3) ColumnTransformer only returns a sparse matrix when the stacked output
# density is below it; here every row has two non-zeros, so with about six
# indicator columns (presumably 2 for Sex + 4 for Embarked -- verify against
# the data) the density is ~0.33 and the result is already dense, which made
# the previous unconditional `.toarray()` call fail with AttributeError.
ct = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['Sex', 'Embarked'])
], remainder='drop', sparse_threshold=0)

encoded = ct.fit_transform(titanic)
ohe_feature_names = ct.named_transformers_['onehot'].get_feature_names_out(['Sex', 'Embarked'])
df_ohe_multi = pd.DataFrame(encoded, columns=ohe_feature_names)
print("One-Hot Encoding of 'Sex' and 'Embarked' (first 5 rows):")
print(df_ohe_multi.head(), "\n")

# Question 8: Ordinal Encoding for 'Pclass' (ranked)
# The encoder receives the class labels as strings in a single-column 2-D array.
pclass_str = np.reshape(titanic['Pclass'].astype(str).values, (-1, 1))

# Categories are listed from lowest rank to highest (assuming 1 is the highest
# class and 3 the lowest), so '3' is encoded as 0.0, '2' as 1.0 and '1' as 2.0.
ordinal_encoder = OrdinalEncoder(categories=[['3', '2', '1']])
titanic['Pclass_ordinal'] = ordinal_encoder.fit(pclass_str).transform(pclass_str)

print("Ordinal Encoding of 'Pclass' (first 5 rows):")
print(titanic.loc[:, ['Pclass', 'Pclass_ordinal']].head(), "\n")

# Question 9: Impact of scaling on Decision Tree vs SVM
# Two numeric features predict survival; the same train/test split is reused
# for every model so the accuracies are directly comparable.
X = titanic[['Age', 'Fare']].copy()
# Assign the filled column back instead of fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated in pandas 2.x and is a no-op
# under copy-on-write, which would leave NaNs in X and make the fits fail.
X['Age'] = X['Age'].fillna(X['Age'].median())
y = titanic['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Decision Tree (no scaling needed): splits are thresholds on single features,
# so rescaling a feature does not change which partitions are available.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_test, pred_dt)

# SVM without scaling: the default kernel works on distances between samples,
# so the feature with the larger numeric range dominates.
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
acc_svm_no_scale = accuracy_score(y_test, svm.predict(X_test))

# SVM with Standard Scaling: the scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm_scaled = SVC(random_state=42)
svm_scaled.fit(X_train_scaled, y_train)
acc_svm_scaled = accuracy_score(y_test, svm_scaled.predict(X_test_scaled))

print(f"Decision Tree Accuracy: {acc_dt}")
print(f"SVM Accuracy without scaling: {acc_svm_no_scale}")
print(f"SVM Accuracy with Standard Scaling: {acc_svm_scaled}\n")

# Question 10: Custom Transformer for high cardinality categorical features
class HighCardinalityEncoder(TransformerMixin, BaseEstimator):
    """One-hot encode categorical columns after collapsing rare categories.

    For each column, the ``top_n`` most frequent values seen during ``fit``
    are kept and every other value is replaced by the placeholder ``'Other'``
    before one-hot encoding. Each input column therefore contributes at most
    ``top_n + 1`` indicator columns, however many distinct values it holds.

    Inheriting from ``BaseEstimator`` supplies ``get_params``/``set_params``,
    so the encoder can be cloned and used inside pipelines and model selection.

    Parameters
    ----------
    top_n : int, default=10
        Number of most frequent categories to keep per column.

    Attributes
    ----------
    top_categories_ : dict
        Maps each fitted column name to the list of categories that were kept.
    ohe : OneHotEncoder
        Underlying encoder, fitted on the collapsed data.

    Notes
    -----
    ``X`` must be a pandas DataFrame. A genuine category literally named
    ``'Other'`` cannot be told apart from the collapsed bucket.
    """

    def __init__(self, top_n=10):
        self.top_n = top_n
        self.top_categories_ = {}
        # Left at the default sparse output and densified in transform(): the
        # `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2
        # and removed in 1.4, so passing it breaks on current releases.
        self.ohe = OneHotEncoder(handle_unknown='ignore')

    def _collapse_rare(self, X):
        """Return a copy of ``X`` with non-kept values replaced by ``'Other'``."""
        return X.apply(
            lambda col: col.where(col.isin(self.top_categories_[col.name]), other='Other')
        )

    def fit(self, X, y=None):
        """Learn the kept categories per column and fit the one-hot encoder.

        ``y`` is ignored; it is accepted for pipeline compatibility.
        Returns ``self``.
        """
        # Build a fresh mapping so refitting on different columns leaves no
        # stale entries from an earlier fit.
        self.top_categories_ = {
            col: X[col].value_counts().nlargest(self.top_n).index.tolist()
            for col in X.columns
        }
        self.ohe.fit(self._collapse_rare(X))
        return self

    def transform(self, X):
        """Collapse rare values and return the dense one-hot matrix.

        Raises ``KeyError`` if ``X`` has a column that was not seen in ``fit``.
        """
        return self.ohe.transform(self._collapse_rare(X)).toarray()

# Example usage on 'Cabin' (high cardinality, many missing)
# Missing cabins get an explicit 'Missing' label. The filled column is assigned
# back instead of calling fillna(inplace=True) on the selected column: the
# chained in-place form is deprecated in pandas 2.x and leaves the frame
# unchanged when copy-on-write is enabled.
titanic['Cabin'] = titanic['Cabin'].fillna('Missing')
high_card_feat = titanic[['Cabin']]

# Keep the 5 most frequent cabin values; every other value is bucketed together.
encoder = HighCardinalityEncoder(top_n=5)
encoder.fit(high_card_feat)
encoded_cabin = encoder.transform(high_card_feat)

print("Custom encoded high cardinality 'Cabin' feature shape:", encoded_cabin.shape)


