In [1]:
# Create meaningful new features from existing data to improve model performance.

import pandas as pd

# Example data with date and transaction columns
data = {'Name': ['Alice', 'Bob', 'Charlie'], 'Transaction_Date': ['2020-01-15', '2021-05-20', '2019-11-30'], 'Amount': [250, 400, 150]}
df = pd.DataFrame(data)

# Convert Transaction_Date to datetime
df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date'])

# Extract year, month, and day from the date to create new features
df['Year'] = df['Transaction_Date'].dt.year
df['Month'] = df['Transaction_Date'].dt.month
df['Day'] = df['Transaction_Date'].dt.day

print("Data with New Date Features:\n", df)


Data with New Date Features:
       Name Transaction_Date  Amount  Year  Month  Day
0    Alice       2020-01-15     250  2020      1   15
1      Bob       2021-05-20     400  2021      5   20
2  Charlie       2019-11-30     150  2019     11   30


In [2]:
# Apply transformations to normalize skewed data or stabilize variance.

import numpy as np

# Create a dataset with skewed values
data = {'Amount': [100, 500, 1000, 2000, 10000]}
df = pd.DataFrame(data)

# Log transformation to reduce skewness
df['Amount_Log'] = np.log1p(df['Amount'])  # log1p(x) = log(1 + x)

print("Log Transformed Data:\n", df)


Log Transformed Data:
    Amount  Amount_Log
0     100    4.615121
1     500    6.216606
2    1000    6.908755
3    2000    7.601402
4   10000    9.210440


In [3]:
# Group continuous numerical data into bins for better analysis.

# Example data with Age
data = {'Age': [25, 30, 45, 50, 60, 70]}
df = pd.DataFrame(data)

# Define bins and labels for the age groups
bins = [20, 30, 40, 50, 60, 80]
labels = ['20-30', '30-40', '40-50', '50-60', '60+']

# Create a new column with binned age groups
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels)

print("Data with Age Bins:\n", df)

Data with Age Bins:
    Age Age_Group
0   25     20-30
1   30     20-30
2   45     40-50
3   50     40-50
4   60     50-60
5   70       60+


In [1]:
# Handle cyclical features (e.g., day of the week, month) using sine and cosine transformations.

# Example data with months represented as cyclical data
data = {'Month': [1, 2, 3, 10, 11, 12]}  # January = 1, December = 12
df = pd.DataFrame(data)

# Apply sine and cosine transformation to encode cyclical features
df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)

print("Cyclically Encoded Month Data:\n", df)

NameError: name 'pd' is not defined

In [1]:
# Use Decision Trees to rank the importance of features.

from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Create and train a Decision Tree model
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X, y)

# Get the importance of each feature
feature_importances = tree.feature_importances_
print("Feature Importances:", feature_importances)


Feature Importances: [0.01333333 0.         0.56405596 0.42261071]
