In [51]:
import category_encoders as ce
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm

from statsmodels.formula.api import ols

In [35]:
# QSAR fish toxicity file from the UCI archive: semicolon-separated, no header row.
qsar_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00504/qsar_fish_toxicity.csv"
df = pd.read_csv(qsar_url, sep=";", header=None)

In [36]:
# The file was read without a header, so label the seven columns by hand.
# LC50 (last) is the response used by the regression cells further down.
column_names = ['CIC0', 'SM1_Dz', 'GATS1i', 'NdsCH', 'NdssC', 'MLOGP', 'LC50']
df.columns = column_names
df

Unnamed: 0,CIC0,SM1_Dz,GATS1i,NdsCH,NdssC,MLOGP,LC50
0,3.260,0.829,1.676,0,1,1.453,3.770
1,2.189,0.580,0.863,0,0,1.348,3.115
2,2.125,0.638,0.831,0,0,1.348,3.531
3,3.027,0.331,1.472,1,0,1.807,3.510
4,2.094,0.827,0.860,0,0,1.886,5.390
...,...,...,...,...,...,...,...
903,2.801,0.728,2.226,0,2,0.736,3.109
904,3.652,0.872,0.867,2,3,3.983,4.040
905,3.763,0.916,0.878,0,6,2.918,4.818
906,2.831,1.393,1.077,0,1,0.906,5.317


In [76]:
# Baseline OLS fit: LC50 regressed on four of the descriptor columns.
baseline_formula = 'LC50 ~ CIC0 + SM1_Dz + GATS1i + MLOGP'
model = ols(baseline_formula, data=df).fit()

# Coefficients, their confidence intervals, and R-squared, one after another.
print(model.params, model.conf_int(), model.rsquared, sep='\n')

Intercept    2.194353
CIC0         0.447502
SM1_Dz       1.220681
GATS1i      -0.774640
MLOGP        0.383101
dtype: float64
                  0         1
Intercept  1.835429  2.553276
CIC0       0.330823  0.564180
SM1_Dz     1.054169  1.387194
GATS1i    -0.980584 -0.568695
MLOGP      0.315757  0.450444
0.5447586742674835


In [44]:
# ANOVA table (typ=2) for whichever fit is currently bound to `model`.
anova_table = sm.stats.anova_lm(
    model,
    typ=2,
)
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
CIC0,54.900062,1.0,56.658984,1.253966e-13
SM1_Dz,200.574811,1.0,207.00095,2.070661e-42
GATS1i,52.803861,1.0,54.495624,3.535908e-13
MLOGP,120.782218,1.0,124.651913,3.3356030000000004e-27
Residual,874.967261,903.0,,


In [45]:
# Same four descriptors plus NdsCH and NdssC entered as categorical factors via C().
# NOTE: this rebinds `model`, replacing the earlier fit under that name.
factor_formula = 'LC50 ~ CIC0 + SM1_Dz + GATS1i + MLOGP + C(NdsCH) + C(NdssC)'
model = ols(factor_formula, data=df).fit()
print(model.params)

# ANOVA table (typ=2) for the factor-augmented fit.
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

Intercept        2.133239
C(NdsCH)[T.1]    0.706955
C(NdsCH)[T.2]    0.651138
C(NdsCH)[T.3]    0.928803
C(NdsCH)[T.4]    1.055457
C(NdssC)[T.1]   -0.070922
C(NdssC)[T.2]    0.294788
C(NdssC)[T.3]   -0.161324
C(NdssC)[T.4]    0.628982
C(NdssC)[T.5]    1.100969
C(NdssC)[T.6]    0.583812
CIC0             0.383082
SM1_Dz           1.245165
GATS1i          -0.719987
MLOGP            0.395890
dtype: float64


Unnamed: 0,sum_sq,df,F,PR(>F)
C(NdsCH),60.807023,4.0,17.166557,1.418242e-13
C(NdssC),11.912007,6.0,2.241936,0.0373671
CIC0,35.293036,1.0,39.854602,4.305656e-10
SM1_Dz,182.131298,1.0,205.671463,3.943134e-42
GATS1i,44.858286,1.0,50.656144,2.260169e-12
MLOGP,122.66089,1.0,138.514605,7.901821e-30
Residual,790.791522,893.0,,


In [46]:
# hsb2 dataset, fetched from the rpruim/OpenIntro GitHub repository.
hsb2_url = "https://raw.githubusercontent.com/rpruim/OpenIntro/master/data/hsb2.csv"
df2 = pd.read_csv(hsb2_url)
df2

Unnamed: 0,id,gender,race,ses,schtyp,prog,read,write,math,science,socst
0,70,male,white,low,public,general,57,52,41,47,57
1,121,female,white,middle,public,vocational,68,59,53,63,61
2,86,male,white,high,public,general,44,33,54,58,31
3,141,male,white,high,public,vocational,63,44,47,53,56
4,172,male,white,middle,public,academic,47,52,57,53,61
...,...,...,...,...,...,...,...,...,...,...,...
195,31,female,asian,middle,private,general,55,59,52,42,56
196,145,female,white,middle,public,vocational,42,46,38,36,46
197,187,female,white,middle,private,general,57,41,57,55,52
198,118,female,white,middle,public,general,55,62,58,58,61


In [80]:
# Sorted distinct values of the `prog` column.
# numpy is imported in the notebook's top import cell rather than here, so the
# dependency is visible up front and this cell does not hide an import.
np.unique(df2['prog'])

array(['academic', 'general', 'vocational'], dtype=object)

In [78]:
# OLS fit of socst on the four other test scores plus five categorical factors.
socst_formula = (
    'socst ~ read + write + science + math'
    ' + C(gender) + C(race) + C(ses) + C(schtyp) + C(prog)'
)
model2 = ols(socst_formula, data=df2).fit()

# Coefficients followed by their confidence intervals.
print(model2.params, model2.conf_int(), sep='\n')

# ANOVA table (typ=2) for this fit.
anova_table2 = sm.stats.anova_lm(model2, typ=2)
anova_table2

Intercept                17.145557
C(gender)[T.male]        -0.323586
C(race)[T.asian]         -6.448530
C(race)[T.hispanic]      -1.383306
C(race)[T.white]         -2.417137
C(ses)[T.low]            -4.804438
C(ses)[T.middle]         -0.938233
C(schtyp)[T.public]       1.115530
C(prog)[T.general]       -0.837038
C(prog)[T.vocational]    -4.287237
read                      0.318330
write                     0.365629
science                  -0.020366
math                      0.089488
dtype: float64
                               0          1
Intercept               7.207482  27.083633
C(gender)[T.male]      -2.729257   2.082086
C(race)[T.asian]      -12.351003  -0.546056
C(race)[T.hispanic]    -5.994023   3.227411
C(race)[T.white]       -6.345803   1.511528
C(ses)[T.low]          -8.051929  -1.556947
C(ses)[T.middle]       -3.545530   1.669064
C(schtyp)[T.public]    -1.918253   4.149314
C(prog)[T.general]     -3.817134   2.143059
C(prog)[T.vocational]  -7.393191  -1.181283
read       

Unnamed: 0,sum_sq,df,F,PR(>F)
C(gender),4.080675,1.0,0.070416,0.791026
C(race),279.10617,3.0,1.605419,0.189621
C(ses),551.529104,2.0,4.758595,0.009649
C(schtyp),30.494345,1.0,0.526211,0.469116
C(prog),458.59954,2.0,3.956798,0.020756
read,919.949378,1.0,15.874652,9.7e-05
write,986.091303,1.0,17.015997,5.6e-05
science,3.31124,1.0,0.057139,0.81134
math,55.711742,1.0,0.961362,0.328118
Residual,10778.855972,186.0,,


In [57]:
# One-hot encode the `ses` column.
# The column is selected by name through `cols`. The encoder's first positional
# parameter is `verbose`, so passing the Series positionally (as before) never
# selected a column; it only worked because the default encodes every string column.
# The data goes to fit_transform as a one-column frame so the column name is explicit.
encoder = ce.OneHotEncoder(cols=['ses'])
encoder.fit_transform(df2[['ses']])

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,ses_1,ses_2,ses_3
0,1,0,0
1,0,1,0
2,0,0,1
3,0,0,1
4,0,1,0
...,...,...,...
195,0,1,0
196,0,1,0
197,0,1,0
198,0,1,0


In [69]:
# Leave-one-out encode `schtyp`, using `socst` as the target passed to fit_transform.
# The column is selected by name through `cols`. The encoder's first positional
# parameter is `verbose`, so passing the Series positionally (as before) never
# selected a column; it only worked because the default encodes every string column.
encoder = ce.LeaveOneOutEncoder(cols=['schtyp'])
encoder.fit_transform(df2[['schtyp']], df2['socst'])

Unnamed: 0,schtyp
0,51.922156
1,51.898204
2,52.077844
3,51.928144
4,51.898204
...,...
195,54.741935
196,51.988024
197,54.870968
198,51.898204


In [66]:
# Mean `socst` within each `ses` level, then truncated to an int per level.
codes = df2.groupby('ses')['socst'].mean()
{level: int(avg) for level, avg in codes.items()}

{'high': 57, 'low': 47, 'middle': 52}

In [71]:
# Confidence intervals for the fit currently bound to `model`; alpha=0.05 is
# the library default, spelled out here. `model` is assigned in more than one
# cell above, so the result depends on which of those cells ran last.
model.conf_int(alpha=0.05)

Unnamed: 0,0,1
Intercept,1.774573,2.491905
C(NdsCH)[T.1],0.513573,0.900337
C(NdsCH)[T.2],0.285476,1.016799
C(NdsCH)[T.3],0.093571,1.764036
C(NdsCH)[T.4],0.338854,1.77206
C(NdssC)[T.1],-0.230344,0.088501
C(NdssC)[T.2],0.059222,0.530355
C(NdssC)[T.3],-0.620833,0.298184
C(NdssC)[T.4],-0.0481,1.306064
C(NdssC)[T.5],-0.786621,2.988558
