In [1]:
import pandas as pd

df = pd.read_csv('data/sample_data_v2.csv')
df.head()

Unnamed: 0,userID,listingID,viewCount,like,share,contactClicked,timeSpent,ratings
0,43,1167,5,False,True,False,146,0.353
1,21,1023,2,True,True,True,30,1.0
2,8,1804,2,True,False,True,137,0.647
3,34,1252,5,False,False,True,206,0.471
4,42,1289,4,True,True,True,192,1.0


In [2]:
df.shape

(150, 8)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   userID          150 non-null    int64  
 1   listingID       150 non-null    int64  
 2   viewCount       150 non-null    int64  
 3   like            150 non-null    bool   
 4   share           150 non-null    bool   
 5   contactClicked  150 non-null    bool   
 6   timeSpent       150 non-null    int64  
 7   ratings         150 non-null    float64
dtypes: bool(3), float64(1), int64(4)
memory usage: 6.4 KB


In [4]:
df.isna().sum()

userID            0
listingID         0
viewCount         0
like              0
share             0
contactClicked    0
timeSpent         0
ratings           0
dtype: int64

In [5]:
for col in df.columns:
    print(df[col].value_counts(), end='\n\n')

userID
33    8
49    6
20    6
22    5
6     5
19    5
18    5
24    5
44    5
26    5
11    5
1     4
21    4
42    4
28    4
27    4
41    4
16    4
31    3
29    3
40    3
5     3
30    3
17    3
43    3
37    3
4     3
34    3
2     2
14    2
13    2
10    2
45    2
50    2
36    2
9     2
48    2
8     2
3     2
12    1
15    1
7     1
32    1
46    1
38    1
35    1
23    1
47    1
25    1
Name: count, dtype: int64

listingID
1572    8
1935    5
1014    4
1234    4
1791    4
       ..
1541    1
1682    1
1017    1
1310    1
1471    1
Name: count, Length: 81, dtype: int64

viewCount
5     23
6     21
9     17
3     16
8     15
4     13
2     12
10    11
1     11
7     11
Name: count, dtype: int64

like
True     76
False    74
Name: count, dtype: int64

share
False    85
True     65
Name: count, dtype: int64

contactClicked
False    78
True     72
Name: count, dtype: int64

timeSpent
91     4
144    3
68     3
292    3
209    3
      ..
255    1
291    1
260    1
216    1
141    1


- Property Type and Location need to be Encoded (Categorical)
- Numericals need to be Standardized (Standard Scaler)

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [7]:
# Remove identifiers
df.drop(['userID'], axis=1, inplace=True)

In [8]:
# The Aim is for the Property to be Interesting enough for the 
# User to Contact the Agent, so that is the Target Variable
X = df.drop(['ratings'], axis=1)
y = df['ratings']

In [9]:
# Preprocessing
cat_cols = ['like', 'share', 'contactClicked']
num_cols = ['listingID', 'viewCount', 'timeSpent']

for col in cat_cols:
    X[col] = LabelEncoder().fit_transform(X[col])

X[num_cols] = StandardScaler().fit_transform(X[num_cols])

X.head()

Unnamed: 0,listingID,viewCount,like,share,contactClicked,timeSpent
0,-1.213773,-0.215808,0,1,0,-0.016704
1,-1.725573,-1.345037,1,1,1,-1.374911
2,1.050233,-1.345037,1,0,1,-0.122082
3,-0.911668,-0.215808,0,0,1,0.685816
4,-0.780164,-0.592218,1,1,1,0.521895


In [10]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print('MSE:', mean_squared_error(y_test, y_pred))

MSE: 3.048557359604964e-32
