In [30]:
import pandas as pd
import math
import numpy as np

# Creating the Dataframe

In [96]:
# Base truth table: every (X1, X2) combination of the levels a/b/c with its label y.
df = pd.DataFrame({'X1': ['a','a','a','b','b','b','c','c','c'], 'X2': ['a','b','c','a','b','c','a','b','c'],
                   'y': [0,0,1,0,1,1,1,1,1]})
print(df)
# Replicate the table six times. ignore_index=True renumbers the rows 0..53;
# without it the labels 0..8 repeat, and any code that mixes index labels with
# positional access (.iloc) silently looks only at the first nine rows.
df = pd.concat([df] * 6, ignore_index=True)

  X1 X2  y
0  a  a  0
1  a  b  0
2  a  c  1
3  b  a  0
4  b  b  1
5  b  c  1
6  c  a  1
7  c  b  1
8  c  c  1


In [97]:
# One-hot encode the categorical columns (X1, X2); the numeric 'y' column passes through.
# dtype=int pins the indicator columns to 0/1 integers: newer pandas versions
# default to bool, which would change the displayed tables below.
df_train = pd.get_dummies(df, dtype=int)

In [98]:
# Separate the target from the features without mutating in place:
# df_train.pop('y') raises KeyError when this cell is re-run, whereas taking the
# label from df and dropping with errors='ignore' is safe to execute repeatedly.
y = df['y']
df_train = df_train.drop(columns='y', errors='ignore')
print(df_train.head())
print(y.head())

   X1_a  X1_b  X1_c  X2_a  X2_b  X2_c
0     1     0     0     1     0     0
1     1     0     0     0     1     0
2     1     0     0     0     0     1
3     0     1     0     1     0     0
4     0     1     0     0     1     0
0    0
1    0
2    1
3    0
4    1
Name: y, dtype: int64


In [34]:
from  sklearn.linear_model import LogisticRegression as LR

# First model (all one-hot encoded columns kept)

In [91]:
# Logistic regression on the full one-hot matrix; verbose=1 makes the
# liblinear solver print its iteration log while fitting.
model = LR(
    verbose=1,
    solver='liblinear',
)

In [92]:
# Fit the first model on all six indicator columns.
model.fit(X=df_train, y=y)

[LibLinear]iter  1 act 1.588e+00 pre 1.538e+00 delta 1.254e+00 f 6.238e+00 |g| 2.784e+00 CG   2
iter  2 act 7.738e-03 pre 7.671e-03 delta 1.254e+00 f 4.651e+00 |g| 1.990e-01 CG   2
iter  3 act 1.534e-06 pre 1.534e-06 delta 1.254e+00 f 4.643e+00 |g| 2.846e-03 CG   2


In [93]:
# Predicted labels on the training rows, then the true labels, one array per line.
print(model.predict(df_train), y.to_numpy(), sep='\n')

[0 0 1 0 1 1 1 1 1]
[0 0 1 0 1 1 1 1 1]


In [38]:
# Learned weights (one per indicator column) followed by the intercept.
print(model.coef_[0].tolist(), model.intercept_[0], sep='\n')

[-1.593776170262741, -0.12262905777237422, 1.716382031083012, -1.5937761702627415, -0.12262905777237408, 1.716382031083012]
1.1639849995974119


In [90]:
# Accuracy of the first model on its own training data.
model.score(X=df_train, y=y)

1.0

In [40]:
def model_man(X):
    """Re-compute the fitted model's probability for one row by hand.

    Reads the coefficients and intercept from the global ``model`` and applies
    the logistic function to ``dot(X, coef) + intercept``.

    Parameters
    ----------
    X : sequence of numbers
        One feature row, in the same column order the model was fitted on.

    Returns
    -------
    float
        The sigmoid of the linear score.
    """
    weights = model.coef_[0].tolist()
    bias = model.intercept_[0]
    logit = np.dot(X, weights) + bias
    return 1 / (1 + math.exp(-logit))

In [41]:
def my_model_man(X):
    """Hand-written logistic model for the six one-hot columns.

    The fixed weights are 1, 2, 3 for the three X1 indicators and again
    1, 2, 3 for the three X2 indicators, with a bias of -3.5.

    Parameters
    ----------
    X : sequence of 6 numbers
        One feature row (X1_a, X1_b, X1_c, X2_a, X2_b, X2_c).

    Returns
    -------
    float
        The sigmoid of ``dot(X, weights) + bias``.
    """
    weights = [1, 2, 3, 1, 2, 3]
    bias = -3.5
    logit = np.dot(X, weights) + bias
    return 1 / (1 + math.exp(-logit))

In [42]:
def check_results_models(m, X=None, expected=None):
    """Classify every row with ``m`` and print a comparison with the labels.

    Prints the predicted labels, the expected labels, and a
    ``'<wrong>/<total> wrong!'`` summary line.

    Parameters
    ----------
    m : callable
        Maps one feature row to a probability; values >= 0.5 become class 1.
    X : pandas.DataFrame, optional
        Feature rows to evaluate. Defaults to the global ``df_train``.
    expected : sequence of int, optional
        True labels, one per row of ``X``. Defaults to the global ``y``.

    Returns
    -------
    None
    """
    if X is None:
        X = df_train
    if expected is None:
        expected = y.to_numpy().tolist()
    else:
        expected = list(expected)

    # Walk the rows by position. Feeding index *labels* to .iloc is wrong when
    # the labels repeat (as after pd.concat): only the first few rows would
    # ever be checked, over and over.
    results = [1 if m(X.iloc[pos]) >= 0.5 else 0 for pos in range(len(X))]

    print(results)
    print(expected)
    wrong = sum(1 for r, ex in zip(results, expected) if r != ex)
    print('{}/{} wrong!'.format(wrong, len(expected)))


In [43]:
# Check the hand-computed sigmoid that uses the coefficients fitted by sklearn.
print('model by sklearn')
check_results_models(m=model_man)

model by sklearn
[0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1]
[0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1]
0/54 wrong!


In [44]:
# Check the model whose weights were chosen by hand.
print('model by me')
check_results_models(m=my_model_man)

model by me
[0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1]
[0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1]
0/54 wrong!


# Second model, without the `a` one-hot columns (`a` becomes the reference level)

In [107]:
# Remove the indicator columns for level 'a' of each feature. 'y' is listed as
# well so the target is excluded if it is still present; errors='ignore' skips
# any listed column that is not in the frame.
df_train2 = df_train.drop(columns=['X1_a', 'X2_a', 'y'], errors='ignore')
df_train2.head(n=5)

Unnamed: 0,X1_b,X1_c,X2_b,X2_c
0,0,0,0,0
1,0,0,1,0
2,0,0,0,1
3,1,0,0,0
4,1,0,1,0


In [108]:
# Second logistic regression, fitted on the reduced indicator matrix.
model2 = LR(
    solver='liblinear',
    max_iter=100_000,
    verbose=1,
)
model2.fit(X=df_train2, y=y)

[LibLinear]

iter  1 act 1.435e+01 pre 1.312e+01 delta 2.832e+00 f 3.743e+01 |g| 1.616e+01 CG   3
iter  2 act 7.510e-01 pre 6.953e-01 delta 2.832e+00 f 2.308e+01 |g| 3.127e+00 CG   3
iter  3 act 1.234e-02 pre 1.219e-02 delta 2.832e+00 f 2.233e+01 |g| 4.051e-01 CG   2
iter  4 act 4.199e-04 pre 4.197e-04 delta 2.832e+00 f 2.232e+01 |g| 3.909e-02 CG   3


In [109]:
# Predicted labels on the training rows, then the true labels, one array per line.
print(model2.predict(df_train2), y.to_numpy(), sep='\n')

[0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0
 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1]
[0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0
 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1]


In [102]:
# Learned weights of the second model followed by its intercept.
print(model2.coef_[0].tolist(), model2.intercept_[0], sep='\n')

[-1.6804230658142656, -0.44057783122649463, -1.6804230658142656, -0.44057783122649463]
2.041055854787329


In [110]:
# Accuracy of the second model on its own training data.
model2.score(X=df_train2, y=y)

1.0

In [50]:
def model2_man(X):
    """Re-compute the second fitted model's probability for one row by hand.

    Reads the coefficients and intercept from the global ``model2`` and applies
    the logistic function to ``dot(X, coef) + intercept``.

    Parameters
    ----------
    X : sequence of numbers
        One feature row, in the same column order ``model2`` was fitted on.

    Returns
    -------
    float
        The sigmoid of the linear score.
    """
    weights = model2.coef_[0].tolist()
    bias = model2.intercept_[0]
    logit = np.dot(X, weights) + bias
    return 1 / (1 + math.exp(-logit))

In [51]:
def my_model2_man(X):
    """Hand-written logistic model for the reduced columns X1_b, X1_c, X2_b, X2_c.

    The weights are the six-column hand model (a=1, b=2, c=3 per feature,
    bias -3.5) re-expressed with level 'a' as the reference: each remaining
    weight is taken relative to 'a' (b: 2-1, c: 3-1) and the two dropped 'a'
    weights are folded into the bias (-3.5 + 1 + 1).

    The previous constants ([-2, -1, -2, -1], bias 6-3.5) describe a frame in
    which the 'c' columns were dropped instead; applied to the b/c columns they
    misclassify most rows (e.g. the all-zero row (a, a) scored 2.5 -> class 1).

    Parameters
    ----------
    X : sequence of 4 numbers
        One feature row (X1_b, X1_c, X2_b, X2_c).

    Returns
    -------
    float
        The sigmoid of ``dot(X, weights) + bias``.
    """
    b = [1, 2, 1, 2]
    b0 = -3.5 + 1 + 1
    val = 1 / (1 + math.exp(-(np.dot(X, b) + b0)))
    return val

In [52]:
def check_results_models2(m, X=None, expected=None):
    """Classify every reduced-feature row with ``m`` and print a comparison.

    Prints the predicted labels, the expected labels, and a
    ``'<wrong>/<total> wrong!'`` summary line.

    Parameters
    ----------
    m : callable
        Maps one feature row to a probability; values >= 0.5 become class 1.
    X : pandas.DataFrame, optional
        Feature rows to evaluate. Defaults to the global ``df_train2``.
    expected : sequence of int, optional
        True labels, one per row of ``X``. Defaults to the global ``y``.

    Returns
    -------
    None
    """
    if X is None:
        X = df_train2
    if expected is None:
        expected = y.to_numpy().tolist()
    else:
        expected = list(expected)

    # Walk the rows by position. Feeding index *labels* to .iloc is wrong when
    # the labels repeat (as after pd.concat): only the first few rows would
    # ever be checked, over and over.
    results = [1 if m(X.iloc[pos]) >= 0.5 else 0 for pos in range(len(X))]

    print(results)
    print(expected)
    wrong = sum(1 for r, ex in zip(results, expected) if r != ex)
    print('{}/{} wrong!'.format(wrong, len(expected)))


In [53]:
# Check the hand-computed sigmoid that uses the coefficients of the second sklearn model.
print('modelo by sklearn')
check_results_models2(m=model2_man)

modelo by sklearn
[0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1]
[0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1]
0/54 wrong!


In [54]:
# Check the reduced-column model whose weights were chosen by hand.
print('modelo by me')
check_results_models2(m=my_model2_man)

modelo by me
[0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1]
[0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1]
0/54 wrong!


# Validation (using the model without the redundant column)

We one-hot encode the X1 and X2 columns of the validation data and align the result with the training columns.

In [55]:
# Validation frame: only the levels 'a' and 'b' occur, so level 'c' is absent
# from both features.
dv = pd.DataFrame(
    [('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')],
    columns=['X1', 'X2'],
)
dv

Unnamed: 0,X1,X2
0,a,a
1,a,b
2,b,a
3,b,b


In [115]:
# Show the training columns again for comparison with the validation frame.
df_train2.head(n=5)

Unnamed: 0,X1_b,X1_c,X2_b,X2_c
0,0,0,0,0
1,0,0,1,0
2,0,0,0,1
3,1,0,0,0
4,1,0,1,0


In [116]:
# Encoding the validation frame on its own yields only the levels it contains.
dv_test = pd.get_dummies(dv)
print('before reindex', dv_test, sep='\n')

# Align with the training layout: keep exactly df_train2's columns, in its
# order, and create any column missing from the validation data filled with 0.
dv_test = dv_test.reindex(columns=df_train2.columns, fill_value=0)
print('after reindex', dv_test, sep='\n')


before reindex
   X1_a  X1_b  X2_a  X2_b
0     1     0     1     0
1     1     0     0     1
2     0     1     1     0
3     0     1     0     1
after reindex
   X1_b  X1_c  X2_b  X2_c
0     0     0     0     0
1     0     0     1     0
2     1     0     0     0
3     1     0     1     0


In [117]:
# Predict the validation rows with the reduced-column model.
model2.predict(X=dv_test)

array([0, 0, 0, 1])