In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error

# Sample data (replace with your data)
X = np.random.rand(100, 5)
y = np.random.rand(100)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Linear Regression (without regularization)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Linear Regression MSE:", mse)

# L1 Regularization (Lasso)
lasso = Lasso(alpha=0.1)  # Adjust alpha for regularization strength
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Lasso MSE:", mse)

# L2 Regularization (Ridge)
ridge = Ridge(alpha=0.1)  # Adjust alpha for regularization strength
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Ridge MSE:", mse)

# Elastic Net
elastic_net = ElasticNet(alpha=0.05, l1_ratio=0.8)  # Adjust alpha and l1_ratio
elastic_net.fit(X_train, y_train)
y_pred = elastic_net.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Elastic Net MSE:", mse)

Linear Regression MSE: 0.0546861085114453
Lasso MSE: 0.055633286044641996
Ridge MSE: 0.05465998966498938
Elastic Net MSE: 0.055633286044641996


In [11]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_dist = {'alpha': uniform(0.1, 10)}
random_search = RandomizedSearchCV(Lasso(), param_dist, n_iter=10, cv=5)
random_search.fit(X_train, y_train)
best_lasso = random_search.best_estimator_

In [10]:
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [0.1, 1, 10]}
grid_search = GridSearchCV(Lasso(), param_grid, cv=5)  # Adjust estimator and cv
grid_search.fit(X_train, y_train)
best_lasso = grid_search.best_estimator_

In [12]:
from sklearn.metrics import mean_squared_error

# Compare models
models = [LinearRegression(), Lasso(), Ridge(), ElasticNet()]
for model in models:
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  print(f"{type(model).__name__} MSE: {mse}")

LinearRegression MSE: 0.0546861085114453
Lasso MSE: 0.055633286044641996
Ridge MSE: 0.05449404533963943
ElasticNet MSE: 0.055633286044641996


In [13]:
#one hot encoding using OneHotEncoder of Scikit-Learn

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

#Building a dummy employee dataset for example
data = {'Employee id': [10, 20, 15, 25, 30],
        'Gender': ['M', 'F', 'F', 'M', 'F'],
        'Remarks': ['Good', 'Nice', 'Good', 'Great', 'Nice'],
        }
#Converting into a Pandas dataframe
df = pd.DataFrame(data)
#Print the dataframe:
print(f"Employee data : \n{df}")

#Extract categorical columns from the dataframe
#Here we extract the columns with object datatype as they are the categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

#Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Apply one-hot encoding to the categorical columns
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

#Create a DataFrame with the one-hot encoded columns
#We use get_feature_names_out() to get the column names for the encoded data
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate the one-hot encoded dataframe with the original dataframe
df_encoded = pd.concat([df, one_hot_df], axis=1)

# Drop the original categorical columns
df_encoded = df_encoded.drop(categorical_columns, axis=1)

# Display the resulting dataframe
print(f"Encoded Employee data : \n{df_encoded}")

Employee data : 
   Employee id Gender Remarks
0           10      M    Good
1           20      F    Nice
2           15      F    Good
3           25      M   Great
4           30      F    Nice
Encoded Employee data : 
   Employee id  Gender_F  Gender_M  Remarks_Good  Remarks_Great  Remarks_Nice
0           10       0.0       1.0           1.0            0.0           0.0
1           20       1.0       0.0           0.0            0.0           1.0
2           15       1.0       0.0           1.0            0.0           0.0
3           25       0.0       1.0           0.0            1.0           0.0
4           30       1.0       0.0           0.0            0.0           1.0


In [18]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Sample data
data = {'color': ['red', 'green', 'blue', 'red'],
        'size': [10, 15, 12, 8],
        'price': [100, 120, 110, 90]}
df = pd.DataFrame(data)

# One-hot encoding
encoder = OneHotEncoder(sparse_output=False)
color_encoded = encoder.fit_transform(df[['color']])
df = pd.concat([df, pd.DataFrame(color_encoded, columns=encoder.get_feature_names_out())], axis=1)
# Scaling
scaler = StandardScaler()
df['size_scaled'] = scaler.fit_transform(df[['size']])
# Feature interaction
df['size_price_ratio'] = df['size'] / df['price']

In [21]:
df.head()

Unnamed: 0,color,size,price,color_blue,color_green,color_red,size_scaled,size_price_ratio
0,red,10,100,0.0,0.0,1.0,-0.483368,0.1
1,green,15,120,0.0,1.0,0.0,1.450105,0.125
2,blue,12,110,1.0,0.0,0.0,0.290021,0.109091
3,red,8,90,0.0,0.0,1.0,-1.256757,0.088889
