## Preliminaries

### Libraries

In [1]:
import os
import sys
import glob
import logging

import pathlib

import numpy as np
import pandas as pd

import statsmodels.api as sma

<br>

### Paths

In [2]:
# Step up one level from the current working directory.
# NOTE: this is relative to wherever the kernel currently is, so running the cell
# a second time moves up another level.
os.chdir(pathlib.Path.cwd().parent)

<br>

### Logging

In [3]:
# Root logging set-up: INFO and above; each record is printed as the message
# followed, on the next line, by a timestamp with millisecond resolution.
logging.basicConfig(
    format='\n%(message)s\n%(asctime)s.%(msecs)03d\n',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

# Logger named after the running module.
logger = logging.getLogger(name=__name__)

<br>
<br>

## Data

In [4]:
# Directory that is searched for CSV files, resolved against the current working directory.
source = os.path.join(os.getcwd(), 'warehouse', 'missing', 'disaggregates')

# glob.glob returns its matches in arbitrary order; sorting makes the listing, and
# therefore the file picked by paths[0], the same on every run and every machine.
paths = sorted(glob.glob(pathname=os.path.join(source, '*.csv')))

<br>

Selecting a sample

In [5]:
# Fail with an explicit message when the listing is empty, rather than with a
# bare IndexError from paths[0].
if not paths:
    raise FileNotFoundError(f'No CSV files were found in {source}')

# The first listed file is the sample.
path = paths[0]

# Every column is requested as a 32-bit integer.
frame = pd.read_csv(filepath_or_buffer=path, header=0, dtype=np.int32, encoding='utf-8')

# Record which file was read: its name without the extension.
logger.info(pathlib.Path(path).stem)


AO
2022-07-14 15:27:13.625



<br>

Take note of the sums of latitude, longitude, and co&ouml;rdinates

In [6]:
# Column-wise totals of the sample frame, shown as the cell's output.
frame.sum()

site_id           160
longitude           0
latitude            1
year                0
hk_prevalence       0
asc_prevalence      1
tt_prevalence       0
coordinates         1
dtype: int64

<br>
<br>

## Null Regression

### Reference: Co&ouml;rdinates

Model

In [7]:
# Explanatory fields of the regression; the intercept column is appended after them.
predictors = ['year', 'hk_prevalence', 'asc_prevalence', 'tt_prevalence']
design = sma.add_constant(frame[predictors], prepend=False)

# Binomial GLM of the coordinates field on the design matrix.
alg = sma.GLM(
    endog=frame[['coordinates']],
    exog=design,
    family=sma.families.Binomial(),
)
model = alg.fit()

<br>

Summary

In [8]:
# Build the fitted model's summary report; it is kept in `summary` because the
# estimates are read from its tables further down.
summary = model.summary()
summary

0,1,2,3
Dep. Variable:,coordinates,No. Observations:,160.0
Model:,GLM,Df Residuals:,158.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-6.0658
Date:,"Thu, 14 Jul 2022",Deviance:,12.132
Time:,15:27:13,Pearson chi2:,159.0
No. Iterations:,19,Pseudo R-squ. (CS):,7.861e-05
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
year,0,0,,,0,0
hk_prevalence,0,0,,,0,0
asc_prevalence,-15.5035,1.77e+04,-0.001,0.999,-3.48e+04,3.47e+04
tt_prevalence,0,0,,,0,0
const,-5.0626,1.003,-5.047,0.000,-7.029,-3.096


<br>

Estimates

In [9]:
# The second table of the summary holds the per-variable estimates.
coefficient_table = summary.tables[1]
estimates = coefficient_table.data

# First row: the table's header labels.
estimates[0]

['', 'coef', 'std err', 'z', 'P>|z|', '[0.025', '0.975]']

<br>

Structuring

In [10]:
# Column labels for the estimates table; the first column carries the variable names.
labels = ['variable', 'coefficient', 'S.E.', 'z', 'p_value', '0.025', '0.975']

# estimates[0] is the header row, so the data starts at estimates[1].
# The labels are supplied at construction: DataFrame.set_axis(..., inplace=True) was
# deprecated in pandas 1.5 and is rejected by pandas 2.x.
coefficients = pd.DataFrame(data=estimates[1:], columns=labels)

In [11]:
# Columns that hold numeric estimates.
floats = ['coefficient', 'S.E.', 'z', 'p_value', '0.025', '0.975']

# Cast through astype with a column -> dtype mapping and rebind the name.
# Assigning through .loc[:, floats] writes into the existing columns in place on
# recent pandas, which can leave them with their original dtype instead of float.
# NOTE: the values are those printed in the summary table, i.e. already rounded.
coefficients = coefficients.astype({name: float for name in floats})
coefficients

Unnamed: 0,variable,coefficient,S.E.,z,p_value,0.025,0.975
0,year,0.0,0.0,,,0.0,0.0
1,hk_prevalence,0.0,0.0,,,0.0,0.0
2,asc_prevalence,-15.5035,17700.0,-0.001,0.999,-34800.0,34700.0
3,tt_prevalence,0.0,0.0,,,0.0,0.0
4,const,-5.0626,1.003,-5.047,0.0,-7.029,-3.096
