# Comparing_Models_KNN_Feature_Selection._Embedded_Methods

1. Fit the models  `LinearRegression`,`Lasso` and `Ridge` and compare the model performances. 
2. Define a function that takes a list of models and trains (and tests) them so we can try a lot of them without repeating code.
3. Use feature selection techniques (P-Value, RFE) to select a subset of features to train the model with (if necessary).
4. (optional) Re-fit the models with the selected features.

# Clean Data

In [71]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae
from sklearn.feature_selection import VarianceThreshold # It only works with numerical features
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression

In [72]:
url = 'marketing_customer_analysis_clean.csv'
df = pd.read_csv(url)

In [73]:
df = df.drop(['unnamed:_0', 'customer'], axis=1)

In [74]:
# Separate the columns
categoricals_df = df.select_dtypes(include='object')
numericals_df = df.select_dtypes(include='number')

## Categorical

In [75]:
# Create new Dataframes splitt in nominal and ordinal
nominal_columns = ['state', 'response', 'employmentstatus', 'gender', 
                   'location_code', 'marital_status', 'policy_type', 
                   'sales_channel', 'policy','vehicle_class', 'vehicle_type', 'renew_offer_type']
nominal_df = categoricals_df[nominal_columns]

In [76]:
# Create new Dataframes splitt in nominal and ordinal
ordinal_columns = ['coverage','education', 'vehicle_size']
ordinal_df = categoricals_df[ordinal_columns]

In [77]:
# get_dummies with all the norminal columns
con_norminals_df = pd.get_dummies(nominal_df, dtype=int)

In [78]:
# For 'coverage' column
enc = OrdinalEncoder(categories=[['Basic', 'Extended', 'Premium']])
ordinal_df['coverage'] = enc.fit_transform(ordinal_df[['coverage']])

# For 'education' column
enc = OrdinalEncoder(categories=[['High School or Below', 'Bachelor', 'College', 'Master', 'Doctor']])
ordinal_df['education'] = enc.fit_transform(ordinal_df[['education']])

# For 'vehicle_size' column
enc = OrdinalEncoder(categories=[['Small', 'Medsize', 'Large']])
ordinal_df['vehicle_size'] = enc.fit_transform(ordinal_df[['vehicle_size']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ordinal_df['coverage'] = enc.fit_transform(ordinal_df[['coverage']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ordinal_df['education'] = enc.fit_transform(ordinal_df[['education']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ordinal_df['vehicle_size'] = enc.fit_transform(ordinal_df[['vehic

In [79]:
# Concat the two categorical Dataframes

result_categorical_df = pd.concat([ordinal_df, con_norminals_df], axis=1)

## Numberical

In [80]:
# Drop NAN and duplicates
numericals_df = numericals_df.dropna()
numericals_df = numericals_df.drop_duplicates()

## Concat both

In [81]:
final_df = pd.concat([result_categorical_df, numericals_df], axis=1)
final_df = final_df.drop(columns=['month'])

In [82]:
# Drop nan of target value

final_df['total_claim_amount'].isna().sum()

2303

In [83]:
final_df = final_df.dropna(subset=['total_claim_amount'])

## Defining X, y

In [84]:
X = final_df.drop(columns=['total_claim_amount'], axis = 1)
y = np.log(final_df['total_claim_amount'])

## Data splitting

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 42)

# X_train = pd.DataFrame(X_train, columns=X.columns)
# X_test  = pd.DataFrame(X_test, columns=X.columns)

## Linear Regression

In [86]:
# MinMax X_train
mm = MinMaxScaler() # Initialize the PowerTransformer
mm.fit(X_train) # Fit to and transform X_train
X_train=mm.transform(X_train)
# MinMax X_test
X_test = mm.transform(X_test) # Transform X_test using the fitted transformer

In [87]:
#PowerTransform y_train
pt1 = PowerTransformer()
y_train = pd.DataFrame(y_train)
y_train = pt1.fit_transform(y_train)
#PowerTransform y_test
y_test = pd.DataFrame(y_test)
y_test = pt1.transform(y_test)

#### 1. Fit the models LinearRegression,Lasso and Ridge and compare the model performances.

##### Linear Regression

In [88]:
model=LinearRegression()
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

LinearRegression: Train -> 0.8428090565406501, Test -> 0.8369025898410395


##### Lasso

In [89]:
model=Lasso(alpha=0.003)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Lasso: Train -> 0.8410039929102261, Test -> 0.8387481672923525


In [97]:
# A function to find the best Alpha

def find_best_alpha_lasso(X_train, y_train, X_test, y_test, alphas_lasso):
    best_score = -np.inf
    best_alpha = None
    scores = []
    
    for alpha in alphas_lasso:
        model = Lasso(alpha=alpha)
        model.fit(X_train, y_train)
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
        scores.append((alpha, train_score, test_score))
        
        if test_score > best_score:
            best_score = test_score
            best_alpha = alpha
    
    # print(f"Best Alpha: {best_alpha} with Test Score: {best_score:.4f}")
    return best_alpha
# Example usage
alphas_lasso = np.logspace(-4, -1, 20)  # Generating 20 values between 10^-4 and 10^-1
find_best_alpha_lasso(X_train, y_train, X_test, y_test, alphas_lasso)


0.00379269019073225

##### Ridge

In [103]:
model=Ridge(alpha=0)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Ridge: Train -> 0.839808657825552, Test -> 0.8352969393813537


In [98]:
# A function to find the best Alpha

def find_best_alpha_ridge(X_train, y_train, X_test, y_test, alphas_ridge):
    best_score = -np.inf
    best_alpha = None
    scores = []
    
    for alpha in alphas_ridge:
        model = Ridge(alpha=alpha)
        model.fit(X_train, y_train)
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
        scores.append((alpha, train_score, test_score))
        
        if test_score > best_score:
            best_score = test_score
            best_alpha = alpha
    
    # print(f"Best Alpha: {best_alpha} with Test Score: {best_score:.4f}")
    return best_alpha

# Example usage
alphas_ridge = np.logspace(-4, 4, 50)  # Generating 50 values between 10^-4 and 10^4
find_best_alpha_ridge(X_train, y_train, X_test, y_test, alphas_ridge)

0.8286427728546842

#### 2. Define a function that takes a list of models and trains (and tests) them so we can try a lot of them without repeating code.

In [101]:
def models(X_train, y_train, X_test, y_test, alphas_lasso, alphas_ridge):
    model=LinearRegression()
    model.fit(X_train, y_train)
    print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

    best_alpha_lasso = find_best_alpha_lasso(X_train, y_train, X_test, y_test, alphas_lasso)

    model=Lasso(alpha=best_alpha_lasso)
    model.fit(X_train, y_train)
    print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

    best_alpha_ridge = find_best_alpha_ridge(X_train, y_train, X_test, y_test, alphas_ridge)

    model=Ridge(alpha=best_alpha_ridge)
    model.fit(X_train, y_train)
    print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")
    

In [102]:
models(X_train, y_train, X_test, y_test, alphas_lasso, alphas_ridge)

LinearRegression: Train -> 0.8428090565406501, Test -> 0.8369025898410395
Lasso: Train -> 0.8405400475849193, Test -> 0.8388645351657161
Ridge: Train -> 0.8427130262396116, Test -> 0.8370321272242139


#### 3. Use feature selection techniques (P-Value, RFE) to select a subset of features to train the model with (if necessary).

## Variance threshold method

In [94]:
X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

#display(X_train)
print("Initial number of numerical columns: ",X_train.shape)
print()


selector = VarianceThreshold() # Default threshold value is 0
# Features with a training-set variance lower than this threshold will be removed.
selector.fit(X_train)

kept_features_indexes = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features_indexes].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final number of numerical columns: ",X_train.shape)

AttributeError: 'numpy.ndarray' object has no attribute 'select_dtypes'

## Correlation matrix

In [None]:
c = final_df.select_dtypes(include = np.number)
c = abs(c.corr())
#c

fig, ax = plt.subplots(figsize=(14,14))
sns.heatmap(c, annot=True);

#c['SalePrice']
c_last = c['total_claim_amount'].sort_values(ascending=False)
#c_last
c_thr = .3
cols_to_keep = list(c_last[c_last > c_thr].index)[1:] + [list(c_last[c_last > c_thr].index)[0]]
print(cols_to_keep)

final_df[cols_to_keep]

## Recursive feature elimination

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

#X_train.isna().sum()
nulls = pd.DataFrame(X_train.isna().sum()).reset_index()
#nulls.head()
nulls.columns = ['Column','nas']
#nulls.head()
#nulls[nulls['nas'] > 0].head()
cols_to_drop = nulls[nulls['nas'] > 0]['Column'] # Too drastic, but made on pourpose for quick filtering (don't do this in production!!)

X_train.drop(columns=cols_to_drop, axis=1, inplace = True)
X_test.drop(columns=cols_to_drop, axis=1, inplace = True)

#display(X_train)

lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 8, step = 1, verbose = 1) # Step is how many features to add or drop everytime
selector.fit(X_train, y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
display(X_train)

#### 4. (optional) Re-fit the models with the selected features.