In [115]:
import pandas as pd
import numpy as np
import statsmodels.api as st
from math import exp

### Just preparing data

In [29]:
df = pd.read_csv("survey.csv")

In [30]:
df

Unnamed: 0,MYDEPV,Price,Income,Age
0,1,10,33,37
1,0,20,21,55
2,1,30,59,55
3,1,20,76,44
4,0,30,24,37
...,...,...,...,...
745,0,30,25,27
746,0,10,29,32
747,0,10,55,36
748,0,30,21,37


In [31]:
df["ones"] = [1]*750

In [32]:
ls_prices = (20, 30, 10)
for _ in range(2):
    mask = df["Price"].mask(df["Price"]==ls_prices[0], 1)
    mask = mask.mask((mask==ls_prices[1]) | (mask==ls_prices[2]), 0)
    df[f'price{ls_prices[0]}'] = mask
    ls_prices = ls_prices[1::] + ls_prices[:1:]

In [33]:
df = df.drop("Price", axis=1)

In [34]:
df

Unnamed: 0,MYDEPV,Income,Age,ones,price20,price30
0,1,33,37,1,0,0
1,0,21,55,1,1,0
2,1,59,55,1,0,1
3,1,76,44,1,1,0
4,0,24,37,1,0,1
...,...,...,...,...,...,...
745,0,25,27,1,0,1
746,0,29,32,1,0,0
747,0,55,36,1,0,0
748,0,21,37,1,0,1


### Splitting into train test

In [78]:
split_ratio = 0.8
split_mask = np.random.rand(df["Income"].count()) < split_ratio
train = df[split_mask]
test = df[~split_mask]

In [81]:
model = st.Logit(train["MYDEPV"], train[["Income", "Age", "price20", "price30", "ones"]])

In [82]:
model = model.fit()

Optimization terminated successfully.
         Current function value: 0.361230
         Iterations 7


In [83]:
model.summary()

0,1,2,3
Dep. Variable:,MYDEPV,No. Observations:,596.0
Model:,Logit,Df Residuals:,591.0
Method:,MLE,Df Model:,4.0
Date:,"Sat, 07 Dec 2024",Pseudo R-squ.:,0.4745
Time:,14:52:34,Log-Likelihood:,-215.29
converged:,True,LL-Null:,-409.67
Covariance Type:,nonrobust,LLR p-value:,7.459999999999999e-83

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Income,0.1241,0.010,12.360,0.000,0.104,0.144
Age,0.0356,0.013,2.709,0.007,0.010,0.061
price20,-0.7148,0.291,-2.452,0.014,-1.286,-0.143
price30,-2.2058,0.348,-6.339,0.000,-2.888,-1.524
ones,-5.7880,0.578,-10.014,0.000,-6.921,-4.655


### Predicting

In [96]:
train_y = model.predict(train[["Income", "Age", "price20", "price30", "ones"]])
test_y = model.predict(test[["Income", "Age", "price20", "price30", "ones"]])

In [97]:
train["odds_ratio"] = train_y
test["odds_ratio"] = test_y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["odds_ratio"] = train_y
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["odds_ratio"] = test_y


In [98]:
train.head()

Unnamed: 0,MYDEPV,Income,Age,ones,price20,price30,odds_ratio
0,1,33,37,1,0,0,0.407513
1,0,21,55,1,1,0,0.12585
2,1,59,55,1,0,1,0.783876
5,0,22,32,1,1,0,0.067069
6,1,28,32,1,0,0,0.236323


In [99]:
test.head()

Unnamed: 0,MYDEPV,Income,Age,ones,price20,price30,odds_ratio
3,1,76,44,1,1,0,0.988996
4,0,24,37,1,0,1,0.024189
11,0,21,46,1,0,1,0.022988
12,0,49,44,1,0,1,0.414699
20,0,24,55,1,0,0,0.299236


### Checking that the probability mass equals counts

In [94]:
sum(train["MYDEPV"])

266

In [95]:
sum(train["odds_ratio"])

266.00000000000006

In [100]:
sum(test["MYDEPV"])

58

In [101]:
sum(test["odds_ratio"])

64.45054631047128

In [114]:
model.params

Income     0.124147
Age        0.035591
price20   -0.714785
price30   -2.205811
ones      -5.787987
dtype: float64

In [118]:
log_odds_ratio = model.params[0]*58 + model.params[1]*25 + model.params[2] + model.params[4]
prob = 1/(1+exp(-log_odds_ratio))

In [119]:
prob

0.8302703540136706