# Example of Logistic Regression in Theano

In [1]:
import pandas as pd
import numpy as np
import theano
import theano.tensor as T

In [99]:
titanic = pd.read_csv('data/dati/titanic3.csv')

In [100]:
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,2113375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,9167,1,2,113781,1515500,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2,1,2,113781,1515500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,1515500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,1515500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [101]:
titanic.drop(['name', 'cabin', 'home.dest'], axis=1, inplace = True)

In [102]:
titanic.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,embarked,boat,body
0,1,1,female,29,0,0,24160,2113375,S,2.0,
1,1,1,male,9167,1,2,113781,1515500,S,11.0,
2,1,0,female,2,1,2,113781,1515500,S,,
3,1,0,male,30,1,2,113781,1515500,S,,135.0
4,1,0,female,25,1,2,113781,1515500,S,,


In [103]:
def conversion_cat_to_numeric(col, data):
    for name in col:
        column = pd.Categorical.from_array(data[name])
        data[name] = column.codes

In [104]:
conversion_cat_to_numeric(['sex', 'embarked'], titanic)

In [105]:
titanic.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,embarked,boat,body
0,1,1,0,29,0,0,24160,2113375,2,2.0,
1,1,1,1,9167,1,2,113781,1515500,2,11.0,
2,1,0,0,2,1,2,113781,1515500,2,,
3,1,0,1,30,1,2,113781,1515500,2,,135.0
4,1,0,0,25,1,2,113781,1515500,2,,


In [107]:
titanic.fillna(-999, inplace=True)

In [129]:
training_steps = 100000

In [109]:
def normalize(col, df):
    for i in col:
        df[i] = (df[i] - np.min(df[i]))/(np.max(df[i]) - np.min(df[i]))

In [110]:
titanic.dtypes

pclass        int64
survived      int64
sex            int8
age          object
sibsp         int64
parch         int64
ticket       object
fare         object
embarked       int8
boat         object
body        float64
dtype: object

In [111]:
titanic.drop('age', axis = 1, inplace=True)

In [112]:
titanic.drop(['ticket','fare','boat'], axis=1,inplace=True)

In [113]:
columns = titanic.columns.drop('survived')

In [114]:
normalize(columns, titanic)

In [115]:
X = titanic.drop('survived', axis = 1, inplace=False)
y_1 = titanic['survived']

In [116]:
rng = np.random

In [117]:
# Declare Theano symbolic variables
x = T.dmatrix("x")
y = T.dvector("y")

In [118]:
w = theano.shared(rng.randn(X.shape[1]), name="w")
b = theano.shared(0., name="b")

In [119]:
print("Initial weights:")
print(w.get_value())
print(b.get_value())

Initial weights:
[ 0.43110537 -1.54988228 -0.43010336 -0.08287    -1.91289806  0.90984403]
0.0


In [120]:
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))   # Probability that target = 1
prediction = p_1 > 0.5                    # The prediction thresholded
xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1) # Cross-entropy loss function
cost = xent.mean() + 0.01 * (w ** 2).sum()# The cost to minimize
gw, gb = T.grad(cost, [w, b])             # Compute the gradient of the cost
                                          # w.r.t weight vector w and
                                          # bias term b
                                          # (we shall return to this in a
                                          # following section of this tutorial)

In [121]:
train = theano.function(
          inputs=[x,y],
          outputs=[prediction, xent],
          updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)))
predict = theano.function(inputs=[x], outputs=prediction)

In [130]:
# Train
for i in range(training_steps):
    predicted, loss_function = train(X, y_1)

print("Final weights:")
print(w.get_value())
print(b.get_value())
print("target values:")
print(y_1)
print("prediction:")
print(predict(X))

Final weights:
[-0.96066539 -1.59627811 -0.15427345  0.02932547 -0.42129523 -0.7521374 ]
1.50842073742
target values:
0       1
1       1
2       0
3       0
4       0
5       1
6       1
7       0
8       1
9       0
10      0
11      1
12      1
13      1
14      1
15      0
16      0
17      1
18      1
19      0
20      1
21      1
22      1
23      1
24      1
25      0
26      1
27      1
28      1
29      1
       ..
1279    0
1280    0
1281    0
1282    0
1283    0
1284    0
1285    0
1286    1
1287    0
1288    0
1289    0
1290    1
1291    0
1292    0
1293    0
1294    0
1295    0
1296    0
1297    0
1298    0
1299    0
1300    1
1301    0
1302    0
1303    0
1304    0
1305    0
1306    0
1307    0
1308    0
Name: survived, dtype: int64
prediction:
[1 0 1 ..., 0 0 0]


In [131]:
from sklearn.metrics import accuracy_score

In [132]:
accuracy_score(y_1, predict(X))

0.78838808250572956

In [133]:
from sklearn.ensemble import ExtraTreesClassifier

In [134]:
extc = ExtraTreesClassifier()

In [136]:
extc.fit(X, y_1)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [137]:
extc.score(X,y_1)

0.82658517952635602

In [138]:
import xgboost as xgb

In [140]:
xgbc = xgb.XGBClassifier()

In [141]:
xgbc.fit(X,y_1)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [142]:
xgbc.score(X,y_1)

0.81359816653934303