In [1]:
import pandas as pd
import numpy as np
# Create a sample dataframe
data = {
'Age': [25, 30, 35, 40, 45],
'Income': [50000, 60000, 75000, 90000, 80000],
'Education': ['High School', 'Bachelor', 'Master', 'Ph.D.', 'Bachelor'],
'City': ['New York', 'San Francisco', 'Chicago', 'Los Angeles', 'Miami'],
'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
'Productivity': [5, 4, 3, 2, 4]
}
df = pd.DataFrame(data)

In [2]:
print(df)

   Age  Income    Education           City  Gender  Productivity
0   25   50000  High School       New York    Male             5
1   30   60000     Bachelor  San Francisco  Female             4
2   35   75000       Master        Chicago    Male             3
3   40   90000        Ph.D.    Los Angeles  Female             2
4   45   80000     Bachelor          Miami    Male             4


# ONE HOT ENCODING

In [3]:
## Perform one-hot encoding
df_encoded = pd.get_dummies(df, columns=['Education', 'City', 'Gender'])

In [4]:
df_encoded

Unnamed: 0,Age,Income,Productivity,Education_Bachelor,Education_High School,Education_Master,Education_Ph.D.,City_Chicago,City_Los Angeles,City_Miami,City_New York,City_San Francisco,Gender_Female,Gender_Male
0,25,50000,5,False,True,False,False,False,False,False,True,False,False,True
1,30,60000,4,True,False,False,False,False,False,False,False,True,True,False
2,35,75000,3,False,False,True,False,True,False,False,False,False,False,True
3,40,90000,2,False,False,False,True,False,True,False,False,False,True,False
4,45,80000,4,True,False,False,False,False,False,True,False,False,False,True


# FEATURE SCALING

In [5]:
from sklearn.preprocessing import StandardScaler
# Initialize the StandardScaler
scaler = StandardScaler()
# Fit and transform the selected columns
df_encoded[['Age', 'Income']] = scaler.fit_transform(df_encoded[['Age', 'Income']])


In [6]:
df_encoded

Unnamed: 0,Age,Income,Productivity,Education_Bachelor,Education_High School,Education_Master,Education_Ph.D.,City_Chicago,City_Los Angeles,City_Miami,City_New York,City_San Francisco,Gender_Female,Gender_Male
0,-1.414214,-1.470294,5,False,True,False,False,False,False,False,True,False,False,True
1,-0.707107,-0.770154,4,True,False,False,False,False,False,False,False,True,True,False
2,0.0,0.280056,3,False,False,True,False,True,False,False,False,False,False,True
3,0.707107,1.330266,2,False,False,False,True,False,True,False,False,False,True,False
4,1.414214,0.630126,4,True,False,False,False,False,False,True,False,False,False,True


# FEATURE CREATION

In [7]:
# Create a new feature 'Income per Age'
df_encoded['Income per Age'] = df_encoded['Income'] / df_encoded['Age']


In [8]:
df_encoded

Unnamed: 0,Age,Income,Productivity,Education_Bachelor,Education_High School,Education_Master,Education_Ph.D.,City_Chicago,City_Los Angeles,City_Miami,City_New York,City_San Francisco,Gender_Female,Gender_Male,Income per Age
0,-1.414214,-1.470294,5,False,True,False,False,False,False,False,True,False,False,True,1.039655
1,-0.707107,-0.770154,4,True,False,False,False,False,False,False,False,True,True,False,1.089162
2,0.0,0.280056,3,False,False,True,False,True,False,False,False,False,False,True,inf
3,0.707107,1.330266,2,False,False,False,True,False,True,False,False,False,True,False,1.88128
4,1.414214,0.630126,4,True,False,False,False,False,False,True,False,False,False,True,0.445566


In [9]:
column_name = 'Productivity'
print(np.any(np.isinf(df[column_name])))

False


In [10]:
df_encoded = df_encoded.replace([np.inf, -np.inf], np.nan)  # Replace inf and -inf with NaN
df_encoded = df_encoded.dropna(subset=df_encoded.columns, how='any')  # Drop rows containing NaN


# FEATURE SELECTION

In [11]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Separate the target variable and the features
X = df_encoded.drop('Productivity', axis=1)
y = df_encoded['Productivity']

# Initialize the linear regression model
model = LinearRegression()

# Perform RFE with cross-validation
rfe = RFE(model, n_features_to_select=3)  # Select the top 3 features
fit = rfe.fit(X, y)  # Corrected: use X instead of x

# Print the selected features
selected_features = X.columns[fit.support_]
print('Selected Features:', selected_features)

Selected Features: Index(['Income', 'Gender_Male', 'Income per Age'], dtype='object')


# BINNING

In [12]:

# Create bins and labels for the single column
bins = [-float('inf'), -0.5, 0.5, float('inf')]  # Adjust the bin edges as needed
labels = ['Young', 'Mid_Career', 'Senior']

# Bin the Age column
df_encoded['Binned_Column'] = pd.cut(df_encoded['Age'], bins=bins, labels=labels, right=False)

# Print the updated DataFrame
df_encoded


Unnamed: 0,Age,Income,Productivity,Education_Bachelor,Education_High School,Education_Master,Education_Ph.D.,City_Chicago,City_Los Angeles,City_Miami,City_New York,City_San Francisco,Gender_Female,Gender_Male,Income per Age,Binned_Column
0,-1.414214,-1.470294,5,False,True,False,False,False,False,False,True,False,False,True,1.039655,Young
1,-0.707107,-0.770154,4,True,False,False,False,False,False,False,False,True,True,False,1.089162,Young
3,0.707107,1.330266,2,False,False,False,True,False,True,False,False,False,True,False,1.88128,Senior
4,1.414214,0.630126,4,True,False,False,False,False,False,True,False,False,False,True,0.445566,Senior
