In [1]:
# データ加工・処理・分析ライブラリ
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

In [35]:
# データを取得
import requests, zipfile
import io
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
res = requests.get(url).content

# 取得したデータをDataFrameオブジェクトとして読み込み
adult_raw = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)

# データの列にラベルを設定
adult_raw.columns =['age','workclass','fnlwgt','education','education-num','marital-status',
                             'occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week',
                             'native-country','flg-50K']


In [37]:
# first check whether the data contains the null
adult_raw.isnull().sum()
# some data manipulation. first get the dummy variable of the y variable
adult_raw['fin_flg'] = adult_raw['flg-50K'].map(lambda x: 1 if x ==' >50K' else 0)

In [40]:
adult=adult_raw[['age','fnlwgt','education-num','capital-gain','capital-loss','flg-50K','fin_flg']]
adult.head()

In [53]:
# now we can start the model. we first get the X and
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import confusion_matrix
y=adult['fin_flg']
x=adult.drop(['fin_flg','flg-50K'],axis=1)
# scaling the X
x=preprocessing.scale(x, axis=0)
# split the data
x_train, x_verify, y_train, y_verify=train_test_split(x,y,test_size=0.5, random_state=0)
# now get the model fromt logistic
model = LogisticRegression()
model.fit(x_train, y_train)
# you can calculate the possibility for new sample 
print(model.predict_proba(x_verify))
# then you can plot the score
print( model.score(x_train,y_train),model.score(x_verify,y_verify) )
# you can also do the cross validation
cross_val_score(model, x,y, cv=3, scoring='accuracy')
# but since this is an classification problem, you need to consider the recall and precision
# first you can use cross_val_predict to the prediction on each test fold. 
y_predict = cross_val_predict(model, x,y, cv=3)

# using the y_predict and y, compute the confusion matrix, trough which you can work out the recall and precision. 
confusion_matrix(y,y_predict)
# of course you can directly comput recall and precision using precision_score and recall_score
print(recall_score(y,y_predict), precision_score(y,y_predict))
# you can also use cross_val_score to calculate the precision  and recall 
# which means that, for each devide, you train the model, compare the prediction and test test, and get the two scores.
cross_val_score(model,x,y,cv=3,scoring='precision')
cross_val_score(model,x,y,cv=3,scoring='recall')

[[0.89628393 0.10371607]
 [0.76290336 0.23709664]
 [0.82684487 0.17315513]
 ...
 [0.63634739 0.36365261]
 [0.90316553 0.09683447]
 [0.52317209 0.47682791]]
0.8106265356265356 0.8101467968797985
0.3558219614845045 0.7110091743119266


array([0.34697781, 0.35501148, 0.36548029])

If you want to deal with multiple classes, use softmax.
 ScikitLearn’s LogisticRegression uses one-versus-the-rest by default when you train it on more than two classes, but you can set the multi_class hyperparameter to "multinomial" to switch it to Softmax Regression. You must also specify a solver that supports Softmax Regression, such as the "lbfgs" solver (see Scikit-Learn’s documentation for more details). It also applies ℓ2 regularization by default, which you can control using the hyperparameter C:


In [76]:
from sklearn.datasets import load_iris
iris=load_iris()

In [82]:
X = iris["data"][:, (2, 3)] # petal length, petal width
y = iris["target"]
#X= preprocessing.scale(X,axis=0)
mult_logistic=LogisticRegression(multi_class="multinomial",solver="lbfgs", C=10)
mult_logistic.fit(X,y)
print( mult_logistic.predict_proba([[5, 2]]) )
# but how to calculate the recall, precision ???

[[6.38014896e-07 5.74929995e-02 9.42506362e-01]]
