In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from IPython.display import Markdown as md
from IPython.core.display import display, HTML
np.random.seed(42)
sns.set()

In [6]:
nes_df = pd.read_stata("./ARM_Data/nes/nes5200_processed_voters_realideo.dta")

In [7]:
nes_df.columns

Index(['year', 'resid', 'weight1', 'weight2', 'weight3', 'age', 'gender',
       'race', 'educ1', 'urban', 'region', 'income', 'occup1', 'union',
       'religion', 'educ2', 'educ3', 'martial_status', 'occup2', 'icpsr_cty',
       'fips_cty', 'partyid7', 'partyid3', 'partyid3_b', 'str_partyid',
       'father_party', 'mother_party', 'dlikes', 'rlikes', 'dem_therm',
       'rep_therm', 'regis', 'vote', 'regisvote', 'presvote',
       'presvote_2party', 'presvote_intent', 'ideo_feel', 'ideo7', 'ideo',
       'cd', 'state', 'inter_pre', 'inter_post', 'black', 'female', 'age_sq',
       'rep_presvote', 'rep_pres_intent', 'south', 'real_ideo', 'presapprov',
       'perfin1', 'perfin2', 'perfin', 'presadm', 'age_10', 'age_sq_10',
       'newfathe', 'newmoth', 'parent_party', 'white'],
      dtype='object')

In [19]:
mdl_df = nes_df[['gender', 'race', 'educ1', 'urban', 'region', 'income', 'partyid7', 'age_10']]
mdl_df.describe(include='all')

Unnamed: 0,gender,race,educ1,urban,region,income,partyid7,age_10
count,41395,41185,41059,38794,39824,37020,40109,39532.0
unique,2,6,4,3,4,5,7,
top,2. female,1. white,"2. high school (12 grades or fewer, incl","3. rural, small towns, outlying and adja","3. south (al,ar,de,d.c.,fl,ga,ky,la,md,m",3. 34 to 67 percentile,2. weak democrat,
freq,22866,34563,19021,14328,13134,12034,8872,
mean,,,,,,,,4.613984
std,,,,,,,,1.699646
min,,,,,,,,1.7
25%,,,,,,,,3.2
50%,,,,,,,,4.4
75%,,,,,,,,5.9


There's no ordinal multinomial logistic regression in neither sklearn nor statsmodels. Trying [mord](https://github.com/fabianp/mord).

In [34]:
from mord import OrdinalRidge

In [24]:
multiclass.multiclass_fit?

[0;31mSignature:[0m [0mmulticlass[0m[0;34m.[0m[0mmulticlass_fit[0m[0;34m([0m[0mX[0m[0;34m,[0m [0my[0m[0;34m,[0m [0malpha[0m[0;34m,[0m [0mn_class[0m[0;34m,[0m [0mmaxiter[0m[0;34m=[0m[0;36m100000[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Multiclass classification with absolute error cost

References
----------
Lee, Yoonkyung, Yi Lin, and Grace Wahba. "Multicategory support
vector machines: Theory and application to the classification of
microarray data and satellite radiance data." Journal of the
American Statistical Association 99.465 (2004): 67-81.
[0;31mFile:[0m      ~/.local/share/virtualenvs/gh_arm-ELdQCv2Y/lib/python3.7/site-packages/mord/multiclass.py
[0;31mType:[0m      function


In [60]:
exog = mdl_df[['age_10']].join(pd.get_dummies(mdl_df.gender, prefix='gender', drop_first=True)).fillna(0)
endog = pd.get_dummies(mdl_df.partyid7, prefix='partyid7')
mdl = OrdinalRidge(
    alpha=0.001,
    fit_intercept=True,
    normalize=False,
    copy_X=True,
    max_iter=None,
    tol=0.001,
    solver='auto'
)

mdl.fit(
    exog,
    endog
)

OrdinalRidge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
             normalize=False, random_state=None, solver='auto', tol=0.001)

In [61]:
mdl.predict(exog)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [57]:
endog

Unnamed: 0,partyid7_1. strong democrat,partyid7_2. weak democrat,partyid7_3. independent-democrat,partyid7_4. independent-independent,partyid7_5. independent-republican,partyid7_6. weak republican,partyid7_7. strong republican
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
41493,0,1,0,0,0,0,0
41494,1,0,0,0,0,0,0
41495,0,0,0,0,1,0,0
41496,0,0,0,0,1,0,0


In [62]:
mdl.coef_

array([[ 0.02322624,  0.0063683 ],
       [-0.00398594,  0.03786828],
       [-0.00819234, -0.01479538],
       [-0.00651383, -0.00536393],
       [-0.00318636, -0.02820029],
       [ 0.00292985,  0.00465568],
       [ 0.01568014, -0.00331427]], dtype=float32)