In [35]:
import numpy as np  #numerical operation
import pandas as pd  #handling tabular data

In [2]:
# Seed NumPy's legacy global RNG so the random columns built from np.random.*
# calls produce the same values on every fresh run.
np.random.seed(seed=42)

In [3]:
# Synthetic customer data: six columns of 100 entries each.
# The np.random calls run in the order written (age, income, category, feedback);
# keep that order, because each call consumes state from the global RNG.
data = dict(
    id=range(1, 101),                                           # identifiers 1..100
    age=np.random.randint(low=18, high=60, size=100),           # integers in [18, 60)
    income=np.random.normal(loc=50000, scale=15000, size=100),  # (mean, standard deviation, number of draws)
    signup_date=pd.date_range(start='2002-01-01', periods=100, freq='D'),  # 100 consecutive daily dates
    category=np.random.choice(a=['A', 'B', 'C'], size=100),     # one label per row
    feedback=np.random.choice(
        a=['Great Product', 'Average Experience', 'Not Good', 'Loved it', 'Bad Service'],
        size=100,
    ),                                                          # one feedback phrase per row
)

In [7]:
# Turn the column dictionary into a DataFrame (one column per key) and display it
df = pd.DataFrame(data=data)
df

Unnamed: 0,id,age,income,signup_date,category,feedback
0,1,56,47476.923679,2002-01-01,C,Great Product
1,2,46,67471.529683,2002-01-02,C,Great Product
2,3,32,46313.784689,2002-01-03,A,Great Product
3,4,25,38454.983395,2002-01-04,A,Not Good
4,5,38,68182.585132,2002-01-05,C,Bad Service
...,...,...,...,...,...,...
95,96,59,42163.077807,2002-04-06,C,Bad Service
96,97,56,50195.508496,2002-04-07,C,Loved it
97,98,58,55666.543880,2002-04-08,B,Average Experience
98,99,45,50942.368996,2002-04-09,B,Loved it


In [5]:
# Sanity check: preview the first five rows of the freshly built frame
df.head(n=5)

Unnamed: 0,id,age,income,signup_date,category,feedback
0,1,56,47476.923679,2002-01-01,C,Great Product
1,2,46,67471.529683,2002-01-02,C,Great Product
2,3,32,46313.784689,2002-01-03,A,Great Product
3,4,25,38454.983395,2002-01-04,A,Not Good
4,5,38,68182.585132,2002-01-05,C,Bad Service


In [6]:
# Preview the last five rows (ids 96-100 in the output below)
df.tail(n=5)

Unnamed: 0,id,age,income,signup_date,category,feedback
95,96,59,42163.077807,2002-04-06,C,Bad Service
96,97,56,50195.508496,2002-04-07,C,Loved it
97,98,58,55666.54388,2002-04-08,B,Average Experience
98,99,45,50942.368996,2002-04-09,B,Loved it
99,100,24,57523.94548,2002-04-10,B,Average Experience


In [8]:
# Feature engineering: a ratio feature answering "how much income per unit of age".
# Element-wise division of the income column by the age column.
df['income_per_age'] = df['income'].div(df['age'])

In [9]:
# Confirm the new income_per_age column on the first five rows
df.head(n=5)

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772
3,4,25,38454.983395,2002-01-04,A,Not Good,1538.199336
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556


In [11]:
# z-score of income: how many standard deviations each income lies from the mean.
#   z = (income - mean) / std

# Centre and spread of the income column. Series.std() uses pandas' default,
# the sample standard deviation (ddof=1).
mean_income = df['income'].mean()
std_income = df['income'].std()

# Subtract the mean, then scale by the standard deviation
df['income_z_score'] = df['income'].sub(mean_income).div(std_income)
df

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_z_score
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568
3,4,25,38454.983395,2002-01-04,A,Not Good,1538.199336,-0.794745
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677
...,...,...,...,...,...,...,...,...
95,96,59,42163.077807,2002-04-06,C,Bad Service,714.628437,-0.542227
96,97,56,50195.508496,2002-04-07,C,Loved it,896.348366,0.004774
97,98,58,55666.543880,2002-04-08,B,Average Experience,959.767998,0.377346
98,99,45,50942.368996,2002-04-09,B,Loved it,1132.052644,0.055634


In [12]:
# First five rows, now including income_z_score
df.head(n=5)


Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_z_score
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568
3,4,25,38454.983395,2002-01-04,A,Not Good,1538.199336,-0.794745
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677


In [13]:
# Summary statistics (count, mean, min, quartiles, max, std) for the columns
# pandas includes by default; in the output below that is the numeric columns
# plus signup_date, while category and feedback are left out.
df.describe()

Unnamed: 0,id,age,income,signup_date,income_per_age,income_z_score
count,100.0,100.0,100.0,100,100.0,100.0
mean,50.5,37.91,50125.408702,2002-02-19 12:00:00,1457.780667,-9.814372e-16
min,1.0,18.0,12518.612875,2002-01-01 00:00:00,315.96383,-2.560987
25%,25.75,26.75,40689.408589,2002-01-25 18:00:00,1026.038951,-0.6425827
50%,50.5,38.0,49719.768392,2002-02-19 12:00:00,1364.627055,-0.02762372
75%,75.25,46.25,60624.020802,2002-03-16 06:00:00,1687.170875,0.7149456
max,100.0,59.0,82380.92785,2002-04-10 00:00:00,3691.551825,2.196571
std,29.011492,12.219454,14684.490603,,639.850775,1.0


In [15]:
# Summary statistics for a chosen subset of columns.
# Passing a list of column names selects several columns at once and yields a
# DataFrame; a single name without the list would select just one column.
df.loc[:, ['age', 'income', 'income_per_age', 'income_z_score']].describe()

Unnamed: 0,age,income,income_per_age,income_z_score
count,100.0,100.0,100.0,100.0
mean,37.91,50125.408702,1457.780667,-9.814372e-16
std,12.219454,14684.490603,639.850775,1.0
min,18.0,12518.612875,315.96383,-2.560987
25%,26.75,40689.408589,1026.038951,-0.6425827
50%,38.0,49719.768392,1364.627055,-0.02762372
75%,46.25,60624.020802,1687.170875,0.7149456
max,59.0,82380.92785,3691.551825,2.196571


In [19]:
# Date/time feature extraction: day of the month taken from signup_date
# (1 for 2002-01-01 and 6 for 2002-04-06 in the rows displayed below)
df['signup_day'] = df['signup_date'].dt.day
df

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_z_score,signup_day
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3
3,4,25,38454.983395,2002-01-04,A,Not Good,1538.199336,-0.794745,4
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5
...,...,...,...,...,...,...,...,...,...
95,96,59,42163.077807,2002-04-06,C,Bad Service,714.628437,-0.542227,6
96,97,56,50195.508496,2002-04-07,C,Loved it,896.348366,0.004774,7
97,98,58,55666.543880,2002-04-08,B,Average Experience,959.767998,0.377346,8
98,99,45,50942.368996,2002-04-09,B,Loved it,1132.052644,0.055634,9


In [20]:
# Calendar month number of signup_date (1 for the January rows shown below)
df['signup_month'] = df['signup_date'].dt.month
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_z_score,signup_day,signup_month
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1,1
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2,1
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3,1
3,4,25,38454.983395,2002-01-04,A,Not Good,1538.199336,-0.794745,4,1
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5,1


In [21]:
# Day-of-week number of signup_date. pandas counts Monday as 0 and Sunday as 6,
# so 2002-01-01 (a Tuesday) gives 1 in the output below.
df['signup_weekday'] = df['signup_date'].dt.weekday
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_z_score,signup_day,signup_month,signup_weekday
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1,1,1
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2,1,2
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3,1,3
3,4,25,38454.983395,2002-01-04,A,Not Good,1538.199336,-0.794745,4,1,4
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5,1,5


In [24]:
# Whole days elapsed between the moment this cell runs and each signup date.
# NOTE: pd.Timestamp.today() returns the current date/time, so this column (and
# the output recorded below) changes depending on the day the cell is executed.
df['days_from_signup'] = (pd.Timestamp.today() - df['signup_date']).dt.days
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_z_score,signup_day,signup_month,signup_weekday,days_from_signup
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1,1,1,8498
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2,1,2,8497
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3,1,3,8496
3,4,25,38454.983395,2002-01-04,A,Not Good,1538.199336,-0.794745,4,1,4,8495
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5,1,5,8494


In [25]:
# Approximate time since signup in years.
# Derived from the existing days_from_signup column instead of calling
# pd.Timestamp.today() a second time, so both columns are measured from the same
# reference instant and always agree with each other.
# Dividing by 365 is an approximation: leap days are ignored.
df['years_from_signup'] = df['days_from_signup'] / 365
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_z_score,signup_day,signup_month,signup_weekday,days_from_signup,years_from_signup
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1,1,1,8498,23.282192
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2,1,2,8497,23.279452
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3,1,3,8496,23.276712
3,4,25,38454.983395,2002-01-04,A,Not Good,1538.199336,-0.794745,4,1,4,8495,23.273973
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5,1,5,8494,23.271233


In [46]:
# Encode categorical variables -> one-hot encoding.
# The 'category' column is replaced by one indicator column per label, named
# with the 'cat' prefix (cat_A, cat_B, cat_C); df itself is left unchanged.
df_encoded = pd.get_dummies(data=df, columns=['category'], prefix='cat')

In [47]:
# First five rows of the one-hot encoded frame
df_encoded.head(n=5)

Unnamed: 0,id,age,income,signup_date,feedback,income_per_age,income_z_score,signup_day,signup_month,signup_weekday,days_from_signup,years_from_signup,cat_A,cat_B,cat_C
0,1,56,47476.923679,2002-01-01,Great Product,847.802209,-0.180359,1,1,1,8498,23.282192,False,False,True
1,2,46,67471.529683,2002-01-02,Great Product,1466.772384,1.181255,2,1,2,8497,23.279452,False,False,True
2,3,32,46313.784689,2002-01-03,Great Product,1447.305772,-0.259568,3,1,3,8496,23.276712,True,False,False
3,4,25,38454.983395,2002-01-04,Not Good,1538.199336,-0.794745,4,1,4,8495,23.273973,True,False,False
4,5,38,68182.585132,2002-01-05,Bad Service,1794.278556,1.229677,5,1,5,8494,23.271233,False,False,True


In [29]:
#Text feature extraction (TF-IDF)

from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# Build a TF-IDF vectorizer with its default settings, then learn the vocabulary
# from the feedback column and transform it into a TF-IDF matrix in one call.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['feedback'])

In [48]:
# Print the TF-IDF matrix; each output line is "(row, column)  weight" for one
# stored entry, and long output is abbreviated with ':' in the middle.
print(tfidf_matrix)

  (0, 8)	0.7071067811865476
  (0, 4)	0.7071067811865476
  (1, 8)	0.7071067811865476
  (1, 4)	0.7071067811865476
  (2, 8)	0.7071067811865476
  (2, 4)	0.7071067811865476
  (3, 3)	0.7071067811865475
  (3, 7)	0.7071067811865475
  (4, 9)	0.7071067811865476
  (4, 1)	0.7071067811865476
  (5, 2)	0.7071067811865476
  (5, 0)	0.7071067811865476
  (6, 2)	0.7071067811865476
  (6, 0)	0.7071067811865476
  (7, 3)	0.7071067811865475
  (7, 7)	0.7071067811865475
  (8, 2)	0.7071067811865476
  (8, 0)	0.7071067811865476
  (9, 8)	0.7071067811865476
  (9, 4)	0.7071067811865476
  (10, 9)	0.7071067811865476
  (10, 1)	0.7071067811865476
  (11, 5)	0.7071067811865476
  (11, 6)	0.7071067811865476
  (12, 2)	0.7071067811865476
  :	:
  (87, 0)	0.7071067811865476
  (88, 2)	0.7071067811865476
  (88, 0)	0.7071067811865476
  (89, 9)	0.7071067811865476
  (89, 1)	0.7071067811865476
  (90, 5)	0.7071067811865476
  (90, 6)	0.7071067811865476
  (91, 8)	0.7071067811865476
  (91, 4)	0.7071067811865476
  (92, 9)	0.7071067811865476

In [49]:
# Convert the TF-IDF matrix to a dense array and wrap it in a DataFrame whose
# columns are the vectorizer's feature names (one column per vocabulary term).
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
tfidf_df

Unnamed: 0,average,bad,experience,good,great,it,loved,not,product,service
0,0.000000,0.000000,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.000000
3,0.000000,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.000000,0.000000
4,0.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.707107
...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.707107
96,0.000000,0.000000,0.000000,0.000000,0.000000,0.707107,0.707107,0.000000,0.000000,0.000000
97,0.707107,0.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
98,0.000000,0.000000,0.000000,0.000000,0.000000,0.707107,0.707107,0.000000,0.000000,0.000000


In [50]:
# Place the TF-IDF columns to the right of the original columns.
# Column-wise concat aligns rows on the index labels of the two frames.
df_combined = pd.concat(objs=[df, tfidf_df], axis='columns')

In [51]:
# First five rows of the combined frame (original features + TF-IDF columns)
df_combined.head(n=5)

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_z_score,signup_day,signup_month,...,average,bad,experience,good,great,it,loved,not,product,service
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1,1,...,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2,1,...,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3,1,...,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0
3,4,25,38454.983395,2002-01-04,A,Not Good,1538.199336,-0.794745,4,1,...,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0,0.0
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5,1,...,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107


In [52]:
# Show each feedback string next to its TF-IDF weights for the first five rows.
# The TF-IDF column names come from tfidf_df, the frame built from the
# vectorizer output.
print(df_combined[['feedback'] + list(tfidf_df.columns)].head())

        feedback  average       bad  experience      good     great   it  \
0  Great Product      0.0  0.000000         0.0  0.000000  0.707107  0.0   
1  Great Product      0.0  0.000000         0.0  0.000000  0.707107  0.0   
2  Great Product      0.0  0.000000         0.0  0.000000  0.707107  0.0   
3       Not Good      0.0  0.000000         0.0  0.707107  0.000000  0.0   
4    Bad Service      0.0  0.707107         0.0  0.000000  0.000000  0.0   

   loved       not   product   service  
0    0.0  0.000000  0.707107  0.000000  
1    0.0  0.000000  0.707107  0.000000  
2    0.0  0.000000  0.707107  0.000000  
3    0.0  0.707107  0.000000  0.000000  
4    0.0  0.000000  0.000000  0.707107  
