In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

In [3]:
file_path = '../data/SpeedBrysbaert_Norms.csv'  
df = pd.read_csv(file_path, sep=';')

df.head()

Unnamed: 0,Woord,Horen,Zien,Ruiken,Proeven,Voelen,Sensaties,Modality,ModalityExclusivity,MaxPercStrength,...,Prevalence,Length,Nsyl,N_phonemes,PoS,OLD20,DLP_RT,DLP_Acc,DCP_RT,DCP_Acc
0,aai,0.2,3.1,0.0,0.0,4.4,0.8,Voelen,0.517647,4.4,...,1.714174,3,1,2,N,1.0,537.85,0.95,1024.92,0.97
1,aaien,0.0,2.7,0.0,0.0,4.9,0.9,Voelen,0.576471,4.9,...,2.586581,5,2,3,WW,1.25,571.5,0.95122,991.05,1.0
2,aal,0.0,3.4,0.9,0.3,1.4,0.4,Zien,0.53125,3.4,...,1.830408,3,1,2,N,1.0,613.63,0.8,1075.17,0.97
3,aalbes,0.0,4.0,1.92,4.17,2.92,0.25,Proeven,0.314465,4.17,...,1.562257,6,2,5,N,2.0,593.23,0.825,1041.53,0.96
4,aalmoes,0.5,2.9,0.0,0.0,1.5,0.3,Zien,0.557692,2.9,...,2.192199,7,2,5,N,2.55,634.01,0.925,953.61,0.98


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24036 entries, 0 to 24035
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Woord                 24036 non-null  object 
 1   Horen                 24036 non-null  float64
 2   Zien                  24036 non-null  float64
 3   Ruiken                24036 non-null  float64
 4   Proeven               24036 non-null  float64
 5   Voelen                24036 non-null  float64
 6   Sensaties             24036 non-null  float64
 7   Modality              24036 non-null  object 
 8   ModalityExclusivity   24036 non-null  float64
 9   MaxPercStrength       24036 non-null  float64
 10  AvModalityStrength    24036 non-null  float64
 11  Minkowski3            24036 non-null  float64
 12  Concreteness          24036 non-null  float64
 13  ConcretenessCategory  24036 non-null  object 
 14  Imageability          24036 non-null  object 
 15  ImageabilityCategor

In [8]:
df['ConcretenessCategory'].value_counts()

ConcretenessCategory
low     12342
high    11694
Name: count, dtype: int64

In [9]:
df['ConcretenessCategory'] = df['ConcretenessCategory'].apply(lambda x: 0 if x == 'low' else 1)

In [13]:
df['ConcretenessCategory'].value_counts()

ConcretenessCategory
0    12342
1    11694
Name: count, dtype: int64

In [11]:
X = df[['Horen','Zien','Ruiken','Proeven','Voelen','Sensaties']]
X

Unnamed: 0,Horen,Zien,Ruiken,Proeven,Voelen,Sensaties
0,0.20,3.10,0.00,0.00,4.40,0.80
1,0.00,2.70,0.00,0.00,4.90,0.90
2,0.00,3.40,0.90,0.30,1.40,0.40
3,0.00,4.00,1.92,4.17,2.92,0.25
4,0.50,2.90,0.00,0.00,1.50,0.30
...,...,...,...,...,...,...
24031,1.11,2.44,0.00,0.00,0.56,2.89
24032,3.25,4.25,2.25,1.00,2.50,0.25
24033,0.92,3.92,2.58,0.25,1.17,0.50
24034,1.08,2.58,0.17,0.00,1.50,1.67


In [12]:
Y = df['ConcretenessCategory']
Y

0        1
1        1
2        1
3        1
4        1
        ..
24031    0
24032    1
24033    1
24034    0
24035    0
Name: ConcretenessCategory, Length: 24036, dtype: int64

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
logreg_simple = LogisticRegression()

logreg_simple.fit(X, Y)

print(f'Accuracy: {accuracy_score(Y, logreg_simple.predict(X))}')

Accuracy: 0.8213513063737726


In [19]:
logreg_intercept_false = LogisticRegression(fit_intercept=False)

logreg_intercept_false.fit(X, Y)

print(f'Accuracy: {accuracy_score(Y, logreg_intercept_false.predict(X))}')

Accuracy: 0.7550757197537028


In [20]:
from statsmodels.genmod import families

res = sm.GLM(Y, X, family=families.Binomial()).fit()
res.summary()

0,1,2,3
Dep. Variable:,ConcretenessCategory,No. Observations:,24036.0
Model:,GLM,Df Residuals:,24030.0
Model Family:,Binomial,Df Model:,5.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-12031.0
Date:,"Wed, 02 Jul 2025",Deviance:,24061.0
Time:,00:03:11,Pearson chi2:,26900.0
No. Iterations:,6,Pseudo R-squ. (CS):,0.3192
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Horen,-0.1900,0.016,-12.214,0.000,-0.220,-0.159
Zien,0.0780,0.011,6.940,0.000,0.056,0.100
Ruiken,0.7969,0.050,16.093,0.000,0.700,0.894
Proeven,-0.0423,0.042,-1.009,0.313,-0.125,0.040
Voelen,0.9612,0.024,40.842,0.000,0.915,1.007
Sensaties,-1.2072,0.024,-50.758,0.000,-1.254,-1.161


In [42]:
filtered = res.pvalues[res.pvalues < 0.05].index
filtered

Index(['Horen', 'Zien', 'Ruiken', 'Voelen', 'Sensaties'], dtype='object')

In [43]:
X_filtered = X[filtered]
X_filtered

Unnamed: 0,Horen,Zien,Ruiken,Voelen,Sensaties
0,0.20,3.10,0.00,4.40,0.80
1,0.00,2.70,0.00,4.90,0.90
2,0.00,3.40,0.90,1.40,0.40
3,0.00,4.00,1.92,2.92,0.25
4,0.50,2.90,0.00,1.50,0.30
...,...,...,...,...,...
24031,1.11,2.44,0.00,0.56,2.89
24032,3.25,4.25,2.25,2.50,0.25
24033,0.92,3.92,2.58,1.17,0.50
24034,1.08,2.58,0.17,1.50,1.67


In [44]:
logreg_filtered = LogisticRegression()

logreg_filtered.fit(X_filtered, Y)

print(f'Accuracy filtered: {accuracy_score(Y, logreg_filtered.predict(X_filtered))}')


Accuracy filtered: 0.8217673489765353


In [45]:
X['Int'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Int'] = 1


In [46]:
res_int = sm.GLM(Y, X, family=families.Binomial()).fit()
res_int.summary()

0,1,2,3
Dep. Variable:,ConcretenessCategory,No. Observations:,24036.0
Model:,GLM,Df Residuals:,24029.0
Model Family:,Binomial,Df Model:,6.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-9808.5
Date:,"Wed, 02 Jul 2025",Deviance:,19617.0
Time:,00:28:58,Pearson chi2:,36900.0
No. Iterations:,6,Pseudo R-squ. (CS):,0.4341
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Horen,0.2130,0.019,11.040,0.000,0.175,0.251
Zien,1.1521,0.022,51.341,0.000,1.108,1.196
Ruiken,0.7310,0.049,14.800,0.000,0.634,0.828
Proeven,0.2609,0.049,5.343,0.000,0.165,0.357
Voelen,0.9436,0.025,37.226,0.000,0.894,0.993
Sensaties,-0.8157,0.026,-31.925,0.000,-0.866,-0.766
Int,-3.8590,0.068,-56.939,0.000,-3.992,-3.726
