In [7]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Configuration for the synthetic dataset: one place to change the seed or size
SEED = 42          # fixed seed so every run generates the same data
N_CUSTOMERS = 100  # number of synthetic customer rows

# Set seed for reproducibility
np.random.seed(SEED)

# Generate new customer dataset.
# NOTE: the random draws run in dict order (age, spend, location, feedback);
# reordering them changes the values produced for a given seed.
data = {
    'id': range(1, N_CUSTOMERS + 1),  # Unique customer IDs 1..N_CUSTOMERS
    'age': np.random.randint(18, 70, N_CUSTOMERS),  # Random integer ages 18-69 (randint's upper bound is exclusive)
    'spend': np.random.normal(200, 50, N_CUSTOMERS),  # Normal distribution for spend (mean=200, std=50)
    'signup_date': pd.date_range(start='2023-01-01', periods=N_CUSTOMERS, freq='D'),  # One signup per day starting 2023-01-01
    'location': np.random.choice(['Urban', 'Suburban', 'Rural'], N_CUSTOMERS),  # Random location category
    'feedback': np.random.choice(['Excellent', 'Good', 'Average', 'Bad'], N_CUSTOMERS)  # Random one-word customer feedback
}

# Create DataFrame from the generated data
df = pd.DataFrame(data)

# Display the first few rows of the DataFrame to inspect the data
df.head()

Unnamed: 0,id,age,spend,signup_date,location,feedback
0,1,56,106.216161,2023-01-01,Suburban,Bad
1,2,69,131.660893,2023-01-02,Rural,Good
2,3,46,231.815255,2023-01-03,Rural,Bad
3,4,32,154.663967,2023-01-04,Suburban,Good
4,5,60,223.802129,2023-01-05,Urban,Bad


In [9]:
# Show the last rows of the generated customer data (tail() returns 5 rows by default)
df.tail()

Unnamed: 0,id,age,spend,signup_date,location,feedback
95,96,42,203.282458,2023-04-06,Suburban,Good
96,97,62,258.460382,2023-04-07,Rural,Bad
97,98,58,243.665848,2023-04-08,Suburban,Good
98,99,46,210.282507,2023-04-09,Suburban,Good
99,100,32,127.825832,2023-04-10,Rural,Average


In [10]:
# Ratio feature: each customer's spend divided by their age.
# Series.div is the method form of the element-wise `/` operator.
df['spend_per_age'] = df['spend'].div(df['age'])

# Preview the frame with the new column appended
df.head()

Unnamed: 0,id,age,spend,signup_date,location,feedback,spend_per_age
0,1,56,106.216161,2023-01-01,Suburban,Bad,1.896717
1,2,69,131.660893,2023-01-02,Rural,Good,1.908129
2,3,46,231.815255,2023-01-03,Rural,Bad,5.039462
3,4,32,154.663967,2023-01-04,Suburban,Good,4.833249
4,5,60,223.802129,2023-01-05,Urban,Bad,3.730035


In [11]:
# Standardise 'spend' as a z-score: how many standard deviations each value
# lies from the column mean. Series.std() is the sample standard deviation (ddof=1).
mean_spend, std_spend = df['spend'].mean(), df['spend'].std()

# z = (x - mean) / std, written with the method forms of `-` and `/`
df['spend_z_score'] = df['spend'].sub(mean_spend).div(std_spend)
df.head()

Unnamed: 0,id,age,spend,signup_date,location,feedback,spend_per_age,spend_z_score
0,1,56,106.216161,2023-01-01,Suburban,Bad,1.896717,-1.880298
1,2,69,131.660893,2023-01-02,Rural,Good,1.908129,-1.380527
2,3,46,231.815255,2023-01-03,Rural,Bad,5.039462,0.586645
3,4,32,154.663967,2023-01-04,Suburban,Good,4.833249,-0.928715
4,5,60,223.802129,2023-01-05,Urban,Bad,3.730035,0.429256


In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns describe() handles by default
df.describe()

Unnamed: 0,id,age,spend,signup_date,spend_per_age,spend_z_score
count,100.0,100.0,100.0,100,100.0,100.0
mean,50.5,43.35,201.947483,2023-02-19 12:00:00,5.367605,9.747758e-16
min,1.0,19.0,103.004439,2023-01-01 00:00:00,1.745838,-1.94338
25%,25.75,31.75,161.227117,2023-01-25 18:00:00,3.693499,-0.7998052
50%,50.5,42.0,196.83696,2023-02-19 12:00:00,4.806248,-0.1003779
75%,75.25,57.0,237.475498,2023-03-16 06:00:00,6.449361,0.6978201
max,100.0,69.0,347.183171,2023-04-10 00:00:00,15.722556,2.852633
std,29.011492,14.904663,50.912855,,2.666393,1.0


In [13]:
# describe() already leaves out the text columns, but it still summarises columns
# such as 'id' and 'signup_date' that are not interesting here. To restrict the
# summary, select the columns of interest first.
# Syntax reminder: df['col'] returns a single Series, while df[['a', 'b']]
# (a list inside the brackets) returns a DataFrame with just those columns.
summary_columns = ['age', 'spend', 'spend_per_age', 'spend_z_score']

df[summary_columns].describe()

Unnamed: 0,age,spend,spend_per_age,spend_z_score
count,100.0,100.0,100.0,100.0
mean,43.35,201.947483,5.367605,9.747758e-16
std,14.904663,50.912855,2.666393,1.0
min,19.0,103.004439,1.745838,-1.94338
25%,31.75,161.227117,3.693499,-0.7998052
50%,42.0,196.83696,4.806248,-0.1003779
75%,57.0,237.475498,6.449361,0.6978201
max,69.0,347.183171,15.722556,2.852633


In [15]:
# Date/time feature extraction: day, month, weekday, and elapsed time since signup

# Take "today" once and derive both elapsed-time columns from the same value.
# Calling pd.Timestamp.today() separately for each column could straddle midnight
# and leave days_from_signup and years_from_signup inconsistent with each other.
# NOTE: these two columns depend on the date the notebook is run.
today = pd.Timestamp.today()
days_since_signup = (today - df['signup_date']).dt.days  # whole days elapsed

df['signup_day'] = df['signup_date'].dt.day  # Day of the month
df['signup_month'] = df['signup_date'].dt.month  # Month of the year (1=January)
df['signup_weekday'] = df['signup_date'].dt.weekday  # Weekday (0=Monday, 6=Sunday)
df['days_from_signup'] = days_since_signup  # Days since signup
df['years_from_signup'] = days_since_signup / 365  # Years since signup (approximation: ignores leap years)
df

Unnamed: 0,id,age,spend,signup_date,location,feedback,spend_per_age,spend_z_score,signup_day,signup_weekday,days_from_signup,years_from_signup,signup_month
0,1,56,106.216161,2023-01-01,Suburban,Bad,1.896717,-1.880298,1,6,828,2.268493,1
1,2,69,131.660893,2023-01-02,Rural,Good,1.908129,-1.380527,2,0,827,2.265753,1
2,3,46,231.815255,2023-01-03,Rural,Bad,5.039462,0.586645,3,1,826,2.263014,1
3,4,32,154.663967,2023-01-04,Suburban,Good,4.833249,-0.928715,4,2,825,2.260274,1
4,5,60,223.802129,2023-01-05,Urban,Bad,3.730035,0.429256,5,3,824,2.257534,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,42,203.282458,2023-04-06,Suburban,Good,4.840059,0.026221,6,3,733,2.008219,4
96,97,62,258.460382,2023-04-07,Rural,Bad,4.168716,1.109993,7,4,732,2.005479,4
97,98,58,243.665848,2023-04-08,Suburban,Good,4.201135,0.819407,8,5,731,2.002740,4
98,99,46,210.282507,2023-04-09,Suburban,Good,4.571359,0.163712,9,6,730,2.000000,4


In [16]:
# One-hot encode the categorical 'location' column: each category becomes its own
# indicator column named 'location_<category>', and the original column is dropped
# from the result. df itself is left unchanged.
df_encoded = pd.get_dummies(
    df,
    prefix='location',
    columns=['location'],
)
df_encoded.head()

Unnamed: 0,id,age,spend,signup_date,feedback,spend_per_age,spend_z_score,signup_day,signup_weekday,days_from_signup,years_from_signup,signup_month,location_Rural,location_Suburban,location_Urban
0,1,56,106.216161,2023-01-01,Bad,1.896717,-1.880298,1,6,828,2.268493,1,False,True,False
1,2,69,131.660893,2023-01-02,Good,1.908129,-1.380527,2,0,827,2.265753,1,True,False,False
2,3,46,231.815255,2023-01-03,Bad,5.039462,0.586645,3,1,826,2.263014,1,True,False,False
3,4,32,154.663967,2023-01-04,Good,4.833249,-0.928715,4,2,825,2.260274,1,False,True,False
4,5,60,223.802129,2023-01-05,Bad,3.730035,0.429256,5,3,824,2.257534,1,False,False,True


In [17]:
# Text feature extraction: TF-IDF over the 'feedback' column.
# Step 1 learns the vocabulary and IDF weights; step 2 turns each feedback
# entry into a weighted term vector (returned as a sparse matrix).
vectorizer = TfidfVectorizer()
vectorizer.fit(df['feedback'])
tfidf_matrix = vectorizer.transform(df['feedback'])

In [18]:
# Convert the TF-IDF matrix to a dense DataFrame for easier analysis.
# Reuse df's index so every TF-IDF row stays attached to the customer row it was
# computed from; a fresh 0..n-1 index would misalign (and introduce NaNs) in any
# later index-based combination if df were ever filtered or re-indexed.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=df.index,
    columns=vectorizer.get_feature_names_out(),
)

In [19]:
# Place the TF-IDF feature columns to the right of the original columns.
# Column-wise concat matches rows by index label.
frames_to_combine = [df, tfidf_df]
df_combined = pd.concat(frames_to_combine, axis='columns')

# Preview the combined feature table
df_combined.head()

Unnamed: 0,id,age,spend,signup_date,location,feedback,spend_per_age,spend_z_score,signup_day,signup_weekday,days_from_signup,years_from_signup,signup_month,average,bad,excellent,good
0,1,56,106.216161,2023-01-01,Suburban,Bad,1.896717,-1.880298,1,6,828,2.268493,1,0.0,1.0,0.0,0.0
1,2,69,131.660893,2023-01-02,Rural,Good,1.908129,-1.380527,2,0,827,2.265753,1,0.0,0.0,0.0,1.0
2,3,46,231.815255,2023-01-03,Rural,Bad,5.039462,0.586645,3,1,826,2.263014,1,0.0,1.0,0.0,0.0
3,4,32,154.663967,2023-01-04,Suburban,Good,4.833249,-0.928715,4,2,825,2.260274,1,0.0,0.0,0.0,1.0
4,5,60,223.802129,2023-01-05,Urban,Bad,3.730035,0.429256,5,3,824,2.257534,1,0.0,1.0,0.0,0.0


In [20]:
# Show the raw 'feedback' text next to its TF-IDF columns for the first rows,
# to check that each entry maps to the expected term weight.
feedback_and_terms = ['feedback', *tfidf_df.columns]
print(df_combined[feedback_and_terms].head())

  feedback  average  bad  excellent  good
0      Bad      0.0  1.0        0.0   0.0
1     Good      0.0  0.0        0.0   1.0
2      Bad      0.0  1.0        0.0   0.0
3     Good      0.0  0.0        0.0   1.0
4      Bad      0.0  1.0        0.0   0.0
