# Examples from "Python for Data Analysis"

<https://wesmckinney.com/book/modeling>

# Patsy

In [None]:
import numpy as np
import pandas as pd
import patsy

In [None]:
# Small demo frame: a string key, an integer key, and two numeric value
# columns (one integer-valued, one float-valued), eight rows each.
data = pd.DataFrame({
    'key1': list('aabbabab'),
    'key2': [0, 1, 0, 1, 0, 1, 0, 0],
    'v1': list(range(1, 9)),
    'v2': [-1.0, 0.0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7],
})


In [None]:
# Build the design matrices for the formula 'v2 ~ key1' from `data`.
# patsy returns the left-hand side (response) first and the right-hand side
# (predictors) second, hence the `y, X` unpacking.
y, X = patsy.dmatrices('v2 ~ key1', data)

In [None]:
# Bare expression: the notebook echoes y as this cell's output.
y

In [None]:
# Bare expression: the notebook echoes X as this cell's output.
X

In [None]:
# Rebuild y and X with '+ 0' added to the formula; in patsy's formula syntax
# that term removes the intercept column from the predictor matrix.
y, X = patsy.dmatrices('v2 ~ key1 + 0', data)
# Show the resulting predictor matrix.
display(X)

# Statsmodels

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Fixed seed so every run of the notebook draws the same random numbers.
rng = np.random.default_rng(seed=12345)


def dnorm(mean, variance, size=1):
    """Draw normal samples with the given mean and variance from ``rng``.

    Args:
        mean: Mean of the distribution.
        variance: Variance (not standard deviation); its square root scales
            the standard-normal draws.
        size: Number of samples (int) or the output shape (tuple of ints).

    Returns:
        ndarray of shape ``(size,)`` for an int ``size``, otherwise of
        shape ``size``.
    """
    # Pass the shape as a single argument. Unpacking it with ``*size`` would
    # push any second dimension into ``standard_normal``'s ``dtype``
    # parameter and fail for multi-dimensional shapes. An int is accepted
    # directly and yields a 1-D array, so no tuple conversion is needed.
    return mean + np.sqrt(variance) * rng.standard_normal(size)

# Simulate N observations: three zero-mean normal predictor columns with
# variances 0.4, 0.6 and 0.2, then noise with variance 0.1. The draws happen
# in this order (columns left to right, then noise) from the shared generator.
N = 100
X = np.c_[tuple(dnorm(0, var, size=N) for var in (0.4, 0.6, 0.2))]
eps = dnorm(0, 0.1, size=N)

# Coefficients of the linear model; there is no intercept term.
beta = [0.1, 0.3, 0.5]

y = X.dot(beta) + eps

In [None]:
# Show the first five rows of the simulated predictor matrix.
display(X[:5])

In [None]:
# First five simulated responses, echoed as the cell output.
y[:5]

In [None]:
# add_constant returns a copy of X with a constant column added
# (statsmodels puts it first by default), i.e. an intercept column.
X_model = sm.add_constant(X)
# Show the first five rows to make the added column visible.
display(X_model[:5])

In [None]:
# Fit ordinary least squares of y on the raw predictors X.
# NOTE(review): X_model (with the constant column) is not used here, so the
# fit has no intercept term. That matches how y was simulated (X times beta
# plus noise, no intercept) -- confirm this is intended.
model = sm.OLS(y, X)
results = model.fit()
# Estimated coefficients, one per column of X.
display(results.params)

In [None]:
# Print the text report that statsmodels produces for the fitted model.
print(results.summary())

# scikit-learn

In [None]:
# Directory holding the Titanic CSV files; edit this one path if the data moves.
DATA_DIR = '/data/IFI8410/sess09/titanic'

# Training rows and test rows are loaded into separate frames.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Show the first four rows of the training data.
train.head(4)

In [None]:
# Count missing values in each column of the training data.
train.isna().sum()

In [None]:
# Count missing values in each column of the test data.
test.isna().sum()

I would like to use Age as a predictor, but it has missing data. There are a number of ways to do missing data imputation, but I will do a simple one and use the median of the training dataset to fill the nulls in both tables:

In [None]:
# The fill value comes from the training rows only.
impute_value = train['Age'].median()

# Apply the same fill value to both frames so they are treated consistently.
for frame in (train, test):
    frame['Age'] = frame['Age'].fillna(impute_value)

Now we need to specify our models. I add a column IsFemale as an encoded version of the 'Sex' column:

In [None]:
# Encode 'Sex' as an integer indicator: 1 where the value is 'female', else 0.
train['IsFemale'] = train['Sex'].eq('female').astype(int)
test['IsFemale'] = test['Sex'].eq('female').astype(int)

Then we decide on some model variables and create NumPy arrays:

In [None]:
# Feature columns for the model; the arrays below keep this column order.
predictors = ['Pclass', 'IsFemale', 'Age']

# Same column selection on both frames so train and test features line up.
X_train, X_test = (frame[predictors].to_numpy() for frame in (train, test))

# Target labels exist only for the training rows.
y_train = train['Survived'].to_numpy()
# y_test is not built: 'Survived' is not in the test data set.

# Preview the first five feature rows and labels.
display(X_train[:5])
display(y_train[:5])

I make no claims that this is a good model or that these features are engineered properly. We use the LogisticRegression model from scikit-learn and create a model instance:

In [None]:
# LogisticRegression is imported here and used again by a later cell
# (the cross-validation example), so this import must stay.
from sklearn.linear_model import LogisticRegression
# Estimator created with its default settings.
model = LogisticRegression()
# Fit on the training arrays. As the cell's last expression, the value
# returned by fit() is echoed by the notebook.
model.fit(X_train, y_train)

Now, we can form predictions for the test dataset using model.predict:
    

In [None]:
# Predict a label for every row of the test feature array.
y_predict = model.predict(X_test)

# Show the first ten predictions.
display(y_predict[:10])

In [None]:
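# Accuracy check left disabled: y_test is never defined in this notebook
# (the test file is noted above as lacking 'Survived').
# (y_test == y_predict).mean()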

In [None]:
# Cross-validated variant of logistic regression. Cs=10 is passed as an
# integer; presumably this requests ten candidate regularization strengths
# to choose among -- confirm against the scikit-learn documentation.
from sklearn.linear_model import LogisticRegressionCV
model_cv = LogisticRegressionCV(Cs=10)
# Last expression of the cell, so the value returned by fit() is echoed.
model_cv.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# Rebinds `model`: the estimator fitted in the earlier cell is replaced by a
# fresh, unfitted LogisticRegression with C=10.
model = LogisticRegression(C=10)

# Score the estimator on the training arrays with cv=4.
# NOTE(review): expected to yield one score per fold (four values) -- confirm.
scores = cross_val_score(model, X_train, y_train, cv=4)

display(scores)