In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler



In [None]:
# STEP 1: CREATE SAMPLE DATASET
# Five toy records. Each column is declared as its own named list so the
# DataFrame constructor below reads as a plain column manifest.
prices = [200000, 150000, 300000, 120000, 500000]
sizes_sqft = [1000, 800, 1500, 600, 2000]
cities = ['NY', 'LA', 'NY', 'SF', 'LA']
record_dates = pd.to_datetime([
    '2025-01-01',
    '2025-01-02',
    '2025-01-03',
    '2025-01-04',
    '2025-01-05',
])
reviews = [
    "Great product and service!",
    "Not worth the price.",
    "Average experience but good quality.",
    "Excellent and fast delivery!",
    "Terrible product and rude staff.",
]
targets = [1, 0, 1, 1, 0]   # Example target variable

# Dict insertion order fixes the column order of the resulting frame.
df = pd.DataFrame({
    'price': prices,
    'size_sqft': sizes_sqft,
    'city': cities,
    'date': record_dates,
    'review': reviews,
    'target': targets,
})

print("Original DataFrame:\n", df, "\n")

Original DataFrame:
     price  size_sqft city       date                                review  \
0  200000       1000   NY 2025-01-01            Great product and service!   
1  150000        800   LA 2025-01-02                  Not worth the price.   
2  300000       1500   NY 2025-01-03  Average experience but good quality.   
3  120000        600   SF 2025-01-04          Excellent and fast delivery!   
4  500000       2000   LA 2025-01-05      Terrible product and rude staff.   

   target  
0       1  
1       0  
2       1  
3       1  
4       0   



In [None]:
# STEP 2: MATHEMATICAL / STATISTICAL FEATURE GENERATION
# Four numeric features derived from `price` (plus `size_sqft` for the ratio),
# added in one `assign` call so the new columns are listed in a single place.
math_feature_cols = ['price_per_sqft', 'price_change', 'log_price', 'price_mean_diff']

df = df.assign(
    # Ratio: price paid per square foot.
    price_per_sqft=lambda frame: frame['price'] / frame['size_sqft'],
    # Difference to the previous row; the first row has no predecessor -> 0.
    price_change=lambda frame: frame['price'].diff().fillna(0),
    # Log transformation: log(1 + price).
    log_price=lambda frame: np.log1p(frame['price']),
    # Deviation from the mean price of the whole frame.
    price_mean_diff=lambda frame: frame['price'] - frame['price'].mean(),
)

print("After Mathematical Feature Generation:\n", df[math_feature_cols], "\n")

After Mathematical Feature Generation:
    price_per_sqft  price_change  log_price  price_mean_diff
0           200.0           0.0  12.206078         -54000.0
1           187.5      -50000.0  11.918397        -104000.0
2           200.0      150000.0  12.611541          46000.0
3           200.0     -180000.0  11.695255        -134000.0
4           250.0      380000.0  13.122365         246000.0 



In [None]:
# STEP 3: CATEGORICAL FEATURE ENCODING
# Demonstrates two encodings of the same `city` column.

# a) Label Encoding
# Maps each distinct city to an integer code (here LA=0, NY=1, SF=2).
# NOTE(review): these codes put an arbitrary order on a nominal variable;
# scikit-learn documents LabelEncoder as a target encoder - confirm this
# column is only meant as a demonstration.
le = LabelEncoder()
df['city_encoded'] = le.fit_transform(df['city'])

# b) One-hot Encoding
# `drop_first=True` drops the first category's indicator to avoid a redundant
# column. `dtype=int` is required: since pandas 2.0 `get_dummies` emits bool
# columns by default, and bool columns are NOT matched by
# `select_dtypes(include=[np.number])`, so the indicators would silently
# vanish from the model's feature matrix later in the notebook.
df = pd.get_dummies(df, columns=['city'], drop_first=True, dtype=int)

print("After Categorical Encoding:\n", df.head(), "\n")

After Categorical Encoding:
     price  size_sqft       date                                review  target  \
0  200000       1000 2025-01-01            Great product and service!       1   
1  150000        800 2025-01-02                  Not worth the price.       0   
2  300000       1500 2025-01-03  Average experience but good quality.       1   
3  120000        600 2025-01-04          Excellent and fast delivery!       1   
4  500000       2000 2025-01-05      Terrible product and rude staff.       0   

   price_per_sqft  price_change  log_price  price_mean_diff  city_encoded  \
0           200.0           0.0  12.206078         -54000.0             1   
1           187.5      -50000.0  11.918397        -104000.0             0   
2           200.0      150000.0  12.611541          46000.0             1   
3           200.0     -180000.0  11.695255        -134000.0             2   
4           250.0      380000.0  13.122365         246000.0             0   

   city_NY  city_SF  

In [None]:
# STEP 4: TEMPORAL FEATURE GENERATION
# Calendar components pulled from the `date` column via the .dt accessor;
# each component is stored in a column of the same name.
for date_part in ('day', 'weekday', 'month'):
    df[date_part] = getattr(df['date'].dt, date_part)

# Create lag feature and rolling mean (for demonstration)
# Both are computed in row order. Positions without enough history (the first
# row) are NaN after shift/rolling and are filled with the overall mean price.
price_series = df['price']
overall_mean_price = price_series.mean()

previous_price = price_series.shift(1)
two_row_average = price_series.rolling(window=2).mean()

df['price_lag1'] = previous_price.fillna(overall_mean_price)
df['price_rolling_mean'] = two_row_average.fillna(overall_mean_price)

temporal_cols = ['date', 'day', 'weekday', 'month', 'price_lag1', 'price_rolling_mean']
print("After Temporal Feature Generation:\n", df[temporal_cols], "\n")

After Temporal Feature Generation:
         date  day  weekday  month  price_lag1  price_rolling_mean
0 2025-01-01    1        2      1    254000.0            254000.0
1 2025-01-02    2        3      1    200000.0            175000.0
2 2025-01-03    3        4      1    150000.0            225000.0
3 2025-01-04    4        5      1    300000.0            210000.0
4 2025-01-05    5        6      1    120000.0            310000.0 



In [None]:
# STEP 5: TEXT FEATURE GENERATION
# a) Text Length Feature (number of characters in each review)
df['text_length'] = df['review'].str.len()

# b) Count number of exclamation marks as intensity indicator
# `str.count` takes a regex; '!' has no special meaning there, so this is a
# literal character count.
df['num_exclamations'] = df['review'].str.count('!')

# c) TF-IDF Representation
# Keep only the 5 highest-frequency terms to stay readable on this toy corpus.
tfidf = TfidfVectorizer(max_features=5)
tfidf_features = tfidf.fit_transform(df['review']).toarray()
tfidf_df = pd.DataFrame(
    tfidf_features,
    # Prefix the vocabulary terms: a review word such as "price" would
    # otherwise produce a second column with the same label as an existing one.
    columns=[f"tfidf_{term}" for term in tfidf.get_feature_names_out()],
    # Reuse df's index so the axis=1 concat below pairs rows correctly even
    # when df does not carry a default 0..n-1 index.
    index=df.index,
)

# Combine TF-IDF features with main dataframe
# Any TF-IDF columns left by a previous run of this cell are dropped first, so
# re-running the cell does not pile up duplicate columns.
df = pd.concat(
    [df.drop(columns=tfidf_df.columns, errors='ignore'), tfidf_df],
    axis=1,
)

print("After Text Feature Generation:\n", df[['text_length','num_exclamations'] + list(tfidf_df.columns)], "\n")

After Text Feature Generation:
    text_length  num_exclamations       and   average       but  excellent  \
0           26                 1  0.638711  0.000000  0.000000   0.000000   
1           20                 0  0.000000  0.000000  0.000000   0.000000   
2           36                 0  0.000000  0.707107  0.707107   0.000000   
3           28                 1  0.556451  0.000000  0.000000   0.830881   
4           32                 0  0.638711  0.000000  0.000000   0.000000   

    product  
0  0.769447  
1  0.000000  
2  0.000000  
3  0.000000  
4  0.769447   



In [None]:
# STEP 6: FEATURE SELECTION
# Select numeric columns only for model input
X = df.select_dtypes(include=[np.number]).drop(columns=['target'])
y = df['target']

# Split data (stratified so both classes appear on each side of the split)
# NOTE(review): some features (e.g. the mean-based and TF-IDF ones) were
# computed on the full frame before this split, so the test rows are not
# strictly unseen - acceptable for a demo, confirm before reusing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Columns that are constant within the training split (e.g. `month` in this
# sample) carry no signal and make the ANOVA F-statistic 0/0, which is what
# triggers "RuntimeWarning ... f = msb / msw". Exclude them before scoring.
informative_cols = X_train.columns[X_train.nunique() > 1]

# Use ANOVA F-test for feature selection
# k is capped at the number of candidate columns; a fixed k=5 raises a
# ValueError as soon as fewer than five numeric features are available.
selector = SelectKBest(score_func=f_classif, k=min(5, len(informative_cols)))
X_new = selector.fit_transform(X_train[informative_cols], y_train)

# Map the boolean support mask back to column names.
selected_features = informative_cols[selector.get_support()]
print("Selected Top Features:\n", selected_features, "\n")

Selected Top Features:
 Index(['price', 'price_per_sqft', 'price_mean_diff', 'price_rolling_mean',
       'product'],
      dtype='object') 



  f = msb / msw
  f = msb / msw


In [None]:
# STEP 7: SIMPLE MODEL TRAINING (Optional)
# The selected features live on very different scales (raw prices in the
# hundreds of thousands next to values below 1). LogisticRegression applies
# L2 regularisation by default, which is scale-sensitive, so the features are
# standardised first. Wrapping both steps in a pipeline means the scaler is
# fitted on the training split only and then reused unchanged for scoring.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train[selected_features], y_train)

# Mean accuracy on the held-out split.
acc = model.score(X_test[selected_features], y_test)
print(f"Model Accuracy using Selected Features: {acc:.2f}")

Model Accuracy using Selected Features: 0.50
