In [1]:
import pandas as pd

# Read the sample CSV and convert the Timestamp strings
# (day-month-2-digit-year hour:minute) to datetimes in a single expression.
df = pd.read_csv('data/sample_data.csv').assign(
    Timestamp=lambda frame: pd.to_datetime(
        frame['Timestamp'],
        format='%d-%m-%y %H:%M',
    )
)
df.head()

Unnamed: 0,UserID,SessionID,Timestamp,ListingID,PropertyType,Location,Bedrooms,Bathrooms,Price,Area,Viewed,TimeSpent,Views,ContactClicked,Liked,Shared
0,user_1,session_5,2024-06-16 09:29:00,1486,Villa,Palm Jumeirah,2,3,970000,490,False,0,0,False,,
1,user_14,session_22,2024-05-18 22:51:00,1620,Apartment,Business Bay,4,3,2000000,697,True,192,2,True,,
2,user_17,session_1,2024-06-13 14:30:00,1197,Villa,Dubai Hills Estate,2,2,2000000,1200,False,0,0,False,,
3,user_3,session_35,2024-03-07 00:37:00,1775,Villa,Downtown Dubai,4,2,2000000,400,False,0,0,False,,
4,user_11,session_10,2024-01-12 05:59:00,1514,Townhouse,JLT,3,1,1600000,697,False,0,0,False,,


In [2]:
# Number of rows in the frame.
df.shape[0]

100

In [3]:
# Summarize each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   UserID          100 non-null    object        
 1   SessionID       100 non-null    object        
 2   Timestamp       100 non-null    datetime64[ns]
 3   ListingID       100 non-null    int64         
 4   PropertyType    100 non-null    object        
 5   Location        100 non-null    object        
 6   Bedrooms        100 non-null    int64         
 7   Bathrooms       100 non-null    int64         
 8   Price           100 non-null    int64         
 9   Area            100 non-null    int64         
 10  Viewed          100 non-null    bool          
 11  TimeSpent       100 non-null    int64         
 12  Views           100 non-null    int64         
 13  ContactClicked  100 non-null    bool          
 14  Liked           0 non-null      float64       
 15  Shared 

In [4]:
# Count missing values per column (sum down the rows).
df.isna().sum(axis=0)

UserID              0
SessionID           0
Timestamp           0
ListingID           0
PropertyType        0
Location            0
Bedrooms            0
Bathrooms           0
Price               0
Area                0
Viewed              0
TimeSpent           0
Views               0
ContactClicked      0
Liked             100
Shared            100
dtype: int64

In [5]:
# Print the frequency table of every column, separated by a blank line.
for column_name in df:
    frequencies = df[column_name].value_counts()
    print(frequencies, end='\n\n')

UserID
user_16    9
user_7     9
user_15    8
user_1     7
user_10    7
user_11    6
user_2     6
user_12    6
user_14    6
user_18    5
user_5     5
user_8     4
user_9     4
user_3     3
user_6     3
user_20    3
user_19    3
user_17    2
user_13    2
user_4     2
Name: count, dtype: int64

SessionID
session_33    5
session_38    5
session_6     4
session_26    4
session_21    4
session_30    4
session_17    4
session_9     4
session_5     3
session_36    3
session_32    3
session_22    3
session_24    3
session_40    3
session_35    3
session_4     3
session_31    2
session_1     2
session_28    2
session_42    2
session_11    2
session_47    2
session_16    2
session_12    2
session_25    2
session_14    2
session_18    2
session_41    2
session_29    1
session_43    1
session_27    1
session_48    1
session_39    1
session_7     1
session_44    1
session_46    1
session_2     1
session_10    1
session_3     1
session_23    1
session_45    1
session_20    1
session_37    1
session_

- `PropertyType` and `Location` are categorical and need to be encoded (one-hot).
- Numerical features need to be standardized (`StandardScaler`).

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [7]:
# 'Liked' and 'Shared' have no non-null values in this sample, so they are
# dropped for now. errors='ignore' keeps this cell re-runnable: df is
# reassigned here, so on a second run the columns are already gone and a
# plain drop would raise KeyError.
df = df.drop(columns=['Liked', 'Shared'], errors='ignore')

# The aim is for the property to be interesting enough for the user to
# contact the agent, so ContactClicked is the target variable.
X = df.drop(columns=['ContactClicked'])
y = df['ContactClicked'].astype(int)  # bool -> 0/1 labels

In [8]:
# Preprocessing: only the columns listed here reach the model. Everything else
# in X is dropped by ColumnTransformer's default remainder='drop'.
cat_cols = ['PropertyType', 'Location']
num_cols = ['Bedrooms', 'Bathrooms', 'Price', 'Area', 'TimeSpent', 'Views']

preprocessor = ColumnTransformer(
    transformers=[
        # Put the numeric features on a common scale.
        ('num', StandardScaler(), num_cols),
        # One-hot encode the categoricals. handle_unknown='ignore' encodes a
        # category unseen during fit as all zeros instead of raising at
        # predict time, which can happen with a split of a dataset this small.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ])

# Create the pipeline: preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression())])

In [9]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows, then predict the held-out rows in one chained call.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88        16
           1       0.50      0.50      0.50         4

    accuracy                           0.80        20
   macro avg       0.69      0.69      0.69        20
weighted avg       0.80      0.80      0.80        20

