In [1]:
#-*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from pandas import DataFrame, Series

%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
phone_brand_device_model = pd.read_csv('data/phone_brand_device_model.csv')
gender_age_train = pd.read_csv('data/gender_age_train.csv')
label_categories = pd.read_csv('data/label_categories.csv')
events = pd.read_csv('data/events.csv')
app_labels = pd.read_csv('data/app_labels.csv')
app_events = pd.read_csv('data/app_events.csv', dtype = {'event_id' : np.int8, 'app_id' : np.int64, 
                                                   'is_installed' : np.int8, 'is_active' : np.int8})
gender_age_test = pd.read_csv('data/gender_age_test.csv')

In [3]:
print('train_deviceid counts : {}' .format(len(gender_age_train.device_id.unique())))
print('test_deviceid  counts : {}' .format(len(gender_age_test.device_id.unique())))
print('events event_id counts : {}' .format(len(events.event_id.unique())))
print('events device_id counts : {}' .format(len(events.device_id.unique())))
print('app_events event_id counts : {}' .format(len(app_events.event_id.unique())))
print('app_events app_id counts : {}' .format(len(app_events.app_id.unique())))

train_deviceid counts : 74645
test_deviceid  counts : 112071
events event_id counts : 3252950
events device_id counts : 60865
app_events event_id counts : 256
app_events app_id counts : 19237


## 브랜드명 전환

In [4]:
english_phone_brands_mapping = {
    "三星": "samsung",
    "天语": "Ktouch",
    "海信": "hisense",
    "联想": "lenovo",
    "欧比": "obi",
    "爱派尔": "ipair",
    "努比亚": "nubia",
    "优米": "youmi",
    "朵唯": "dowe",
    "黑米": "heymi",
    "锤子": "hammer",
    "酷比魔方": "koobee",
    "美图": "meitu",
    "尼比鲁": "nibilu",
    "一加": "oneplus",
    "优购": "yougo",
    "诺基亚": "nokia",
    "糖葫芦": "candy",
    "中国移动": "ccmc",
    "语信": "yuxin",
    "基伍": "kiwu",
    "青橙": "greeno",
    "华硕": "asus",
    "夏新": "panosonic",
    "维图": "weitu",
    "艾优尼": "aiyouni",
    "摩托罗拉": "moto",
    "乡米": "xiangmi",
    "米奇": "micky",
    "大可乐": "bigcola",
    "沃普丰": "wpf",
    "神舟": "hasse",
    "摩乐": "mole",
    "飞秒": "fs",
    "米歌": "mige",
    "富可视": "fks",
    "德赛": "desci",
    "梦米": "mengmi",
    "乐视": "lshi",
    "小杨树": "smallt",
    "纽曼": "newman",
    "邦华": "banghua",
    "E派": "epai",
    "易派": "epai",
    "普耐尔": "pner",
    "欧新": "ouxin",
    "西米": "ximi",
    "海尔": "haier",
    "波导": "bodao",
    "糯米": "nuomi",
    "唯米": "weimi",
    "酷珀": "kupo",
    "谷歌": "google",
    "昂达": "ada",
    "聆韵": "lingyun",
    "小米": "Xiaomi",
    "华为": "Huawei",
    "魅族": "Meizu",
    "中兴": "ZTE",
    "酷派": "Coolpad",
    "金立": "Gionee",
    "SUGAR": "SUGAR",
    "OPPO": "OPPO",
    "vivo": "vivo",
    "HTC": "HTC",
    "LG": "LG",
    "ZUK": "ZUK",
    "TCL": "TCL",
    "LOGO": "LOGO",
    "SUGAR": "SUGAR",
    "Lovme": "Lovme",
    "PPTV": "PPTV",
    "ZOYE": "ZOYE",
    "MIL": "MIL",
    "索尼" : "Sony",
    "欧博信" : "Opssom",
    "奇酷" : "Qiku",
    "酷比" : "CUBE",
    "康佳" : "Konka",
    "亿通" : "Yitong",
    "金星数码" : "JXD",
    "至尊宝" : "Monkey King",
    "百立丰" : "Hundred Li Feng",
    "贝尔丰" : "Bifer",
    "百加" : "Bacardi",
    "诺亚信" : "Noain",
    "广信" : "Kingsun",
    "世纪天元" : "Ctyon",
    "青葱" : "Cong",
    "果米" : "Taobao",
    "斐讯" : "Phicomm",
    "长虹" : "Changhong",
    "欧奇" : "Oukimobile",
    "先锋" : "XFPLAY",
    "台电" : "Teclast",
    "大Q" : "Daq",
    "蓝魔" : "Ramos",
    "奥克斯" : "AUX"
}

phone_brand_device_model.phone_brand = phone_brand_device_model.phone_brand.map(pd.Series(english_phone_brands_mapping), na_action='ignore')

### time saperate year / month / day / hour

In [5]:
events['timestamp'].replace('-', '', regex = True, inplace = True)
events['timestamp'].replace(':', '', regex = True, inplace = True)
events['timestamp'].replace(' ', '', regex = True, inplace = True)

In [6]:
events['year'] = events['timestamp'].map(lambda x: x[0:4])
events['month'] = events['timestamp'].map(lambda x: x[4:6])
events['day'] = events['timestamp'].map(lambda x: x[6:8])
events['time'] = events['timestamp'].map(lambda x: x[8:10])

In [7]:
events.drop(['timestamp'], axis = 1, inplace = True)
events.drop(['year'], axis = 1, inplace = True)

### longitude / latitude mean값으로 변경
* dvice_id에 따른 지역 편차 줄임

In [8]:
event_mean = events.groupby('device_id', as_index = False).agg({'longitude' : 'mean', 
                                                   'latitude' : 'mean'})

In [9]:
event_mean.head()

Unnamed: 0,device_id,longitude,latitude
0,-9222956879900151005,90.592,18.552
1,-9222661944218806987,0.0,0.0
2,-9222399302879214035,0.0,0.0
3,-9221825537663503111,112.300808,33.859091
4,-9221767098072603291,0.0,0.0


In [10]:
events_tude_mean = pd.merge(events, event_mean, on = 'device_id')

In [11]:
events_tude_mean = events_tude_mean.drop('longitude_x', 1)
events_tude_mean = events_tude_mean.drop('latitude_x', 1)

In [12]:
events_tude_mean.sort_values(by = 'device_id').head()

Unnamed: 0,event_id,device_id,month,day,time,longitude_y,latitude_y
1510824,661623,-9222956879900151005,5,7,11,90.592,18.552
1510850,2085017,-9222956879900151005,5,6,15,90.592,18.552
1510849,2085015,-9222956879900151005,5,6,15,90.592,18.552
1510848,2084996,-9222956879900151005,5,6,15,90.592,18.552
1510847,2068832,-9222956879900151005,5,7,12,90.592,18.552


### 명목형 feature  - label encoding (brand name)

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
le = LabelEncoder()

In [13]:
phone_brand_device_model.phone_brand.dropna().max()

'yuxin'

In [15]:
# 누락된 phone_brand data가 있었다!!!!!!

phone_brand_device_model.phone_brand.fillna(phone_brand_device_model.phone_brand.dropna().max(), inplace = True)


In [16]:
phone_brand_device_model['brand'] = le.fit_transform(phone_brand_device_model['phone_brand'])

In [17]:
phone_brand_device_model.device_model.max()

'黄金斗士青春版'

In [18]:
phone_brand_device_model.device_model.fillna(phone_brand_device_model.device_model.dropna().max(), inplace = True)

In [19]:
phone_brand_device_model['device'] = le.fit_transform(phone_brand_device_model['device_model'])

In [20]:
phone_brand_device_model.head()

Unnamed: 0,device_id,phone_brand,device_model,brand,device
0,-8890648629457979026,Xiaomi,红米,37,1517
1,1277779817574759137,Xiaomi,MI 2,37,749
2,5137427614288105724,samsung,Galaxy S4,85,560
3,3669464369358936369,SUGAR,时尚手机,31,1503
4,-5019277647504317457,samsung,Galaxy Note 2,85,536


In [21]:
phone_brand_label = phone_brand_device_model.copy()

In [22]:
phone_brand_label = phone_brand_label.drop('phone_brand', 1)
phone_brand_label = phone_brand_label.drop('device_model', 1)

In [23]:
phone_brand_label.head()

Unnamed: 0,device_id,brand,device
0,-8890648629457979026,37,1517
1,1277779817574759137,37,749
2,5137427614288105724,85,560
3,3669464369358936369,31,1503
4,-5019277647504317457,85,536


In [32]:
# merge1 : train + event

age_even = pd.merge(gender_age_train, events_tude_mean, how = 'left', on = 'device_id')

In [34]:
age_even.shape

(1266931, 10)

In [33]:
age_even.isnull().sum()

device_id          0
gender             0
age                0
group              0
event_id       51336
month          51336
day            51336
time           51336
longitude_y    51336
latitude_y     51336
dtype: int64

In [35]:
age_even.head()

Unnamed: 0,device_id,gender,age,group,event_id,month,day,time,longitude_y,latitude_y
0,-8076087639492063270,M,35,M32-38,,,,,,
1,-2897161552818060146,M,35,M32-38,,,,,,
2,-8260683887967679142,M,35,M32-38,2479656.0,5.0,1.0,14.0,0.0,0.0
3,-4938849341048082022,M,30,M29-31,,,,,,
4,245133531816851882,M,30,M29-31,,,,,,


In [None]:
# merge 2 : train + event + app_evnets

In [36]:
age_app = pd.merge(age_even, app_events, how = 'left', on = 'event_id')

In [37]:
age_app.head()

Unnamed: 0,device_id,gender,age,group,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active
0,-8076087639492063270,M,35,M32-38,,,,,,,,,
1,-2897161552818060146,M,35,M32-38,,,,,,,,,
2,-8260683887967679142,M,35,M32-38,2479660.0,5.0,1.0,14.0,0.0,0.0,,,
3,-4938849341048082022,M,30,M29-31,,,,,,,,,
4,245133531816851882,M,30,M29-31,,,,,,,,,


In [None]:
# merge 2 : train + event + app_evnets + app_label

In [38]:
age_label = pd.merge(age_app, app_labels, how='left', on='app_id')

In [39]:
age_label.head()

Unnamed: 0,device_id,gender,age,group,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active,label_id
0,-8076087639492063270,M,35,M32-38,,,,,,,,,,
1,-2897161552818060146,M,35,M32-38,,,,,,,,,,
2,-8260683887967679142,M,35,M32-38,2479660.0,5.0,1.0,14.0,0.0,0.0,,,,
3,-4938849341048082022,M,30,M29-31,,,,,,,,,,
4,245133531816851882,M,30,M29-31,,,,,,,,,,


In [43]:
age_model = pd.merge(age_label, phone_brand_label, how = 'left', on = 'device_id')

In [44]:
age_model.head()

Unnamed: 0,device_id,gender,age,group,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active,label_id,brand,device
0,-8076087639492063270,M,35,M32-38,,,,,,,,,,,37,749
1,-2897161552818060146,M,35,M32-38,,,,,,,,,,,37,749
2,-8260683887967679142,M,35,M32-38,2479660.0,5.0,1.0,14.0,0.0,0.0,,,,,37,749
3,-4938849341048082022,M,30,M29-31,,,,,,,,,,,37,1524
4,245133531816851882,M,30,M29-31,,,,,,,,,,,37,753


In [45]:
age_model.shape

(7318396, 16)

In [None]:
# test 에도 붙여 보자

In [51]:
test_e = pd.merge(gender_age_test, events_tude_mean, how = 'left', on = 'device_id')

In [52]:
test_e.head()

Unnamed: 0,device_id,event_id,month,day,time,longitude_y,latitude_y
0,1002079943728939269,460577.0,5,3,21,0.0,0.0
1,1002079943728939269,755837.0,5,5,22,0.0,0.0
2,1002079943728939269,1171252.0,5,2,8,0.0,0.0
3,1002079943728939269,1805074.0,5,1,16,0.0,0.0
4,1002079943728939269,2145937.0,5,5,8,0.0,0.0


In [53]:
test_app = pd.merge(test_e, app_events, how = 'left', on = 'event_id')

In [54]:
test_app.head()

Unnamed: 0,device_id,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active
0,1002079943728939269,460577.0,5,3,21,0.0,0.0,,,
1,1002079943728939269,755837.0,5,5,22,0.0,0.0,,,
2,1002079943728939269,1171250.0,5,2,8,0.0,0.0,,,
3,1002079943728939269,1805070.0,5,1,16,0.0,0.0,,,
4,1002079943728939269,2145940.0,5,5,8,0.0,0.0,,,


In [55]:
test_app.shape

(12039969, 10)

In [56]:
test_app.app_id.isnull().sum()

2021620

In [57]:
test_app_label = pd.merge(test_app, app_labels, how='left', on = 'app_id')

In [58]:
test_app_label.head()

Unnamed: 0,device_id,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active,label_id
0,1002079943728939269,460577.0,5,3,21,0.0,0.0,,,,
1,1002079943728939269,755837.0,5,5,22,0.0,0.0,,,,
2,1002079943728939269,1171250.0,5,2,8,0.0,0.0,,,,
3,1002079943728939269,1805070.0,5,1,16,0.0,0.0,,,,
4,1002079943728939269,2145940.0,5,5,8,0.0,0.0,,,,


In [59]:
test_app_label.app_id.isnull().sum()

2021620

In [60]:
test_app_label.shape

(12413635, 11)

In [63]:
test_model = pd.merge(test_app_label, phone_brand_label, how = 'left', on = 'device_id')

In [67]:
test_model.head()

Unnamed: 0,device_id,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active,label_id,brand,device
0,1002079943728939269,460577.0,5,3,21,0.0,0.0,,,,,37,1482
1,1002079943728939269,755837.0,5,5,22,0.0,0.0,,,,,37,1482
2,1002079943728939269,1171250.0,5,2,8,0.0,0.0,,,,,37,1482
3,1002079943728939269,1805070.0,5,1,16,0.0,0.0,,,,,37,1482
4,1002079943728939269,2145940.0,5,5,8,0.0,0.0,,,,,37,1482


In [64]:
test_model.shape

(12427350, 13)

In [68]:
test_model.month.isnull().sum()

77047

In [None]:
# 필요한 col만 남기고 삭제

In [69]:
age_model.head()

Unnamed: 0,device_id,gender,age,group,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active,label_id,brand,device
0,-8076087639492063270,M,35,M32-38,,,,,,,,,,,37,749
1,-2897161552818060146,M,35,M32-38,,,,,,,,,,,37,749
2,-8260683887967679142,M,35,M32-38,2479660.0,5.0,1.0,14.0,0.0,0.0,,,,,37,749
3,-4938849341048082022,M,30,M29-31,,,,,,,,,,,37,1524
4,245133531816851882,M,30,M29-31,,,,,,,,,,,37,753


In [74]:
train_f = age_model.drop(['gender', 'age', 'event_id', 'is_installed', 'is_active'], 1)

In [75]:
train_f.tail()

Unnamed: 0,device_id,group,month,day,time,longitude_y,latitude_y,app_id,label_id,brand,device
7318391,89181010588227347,M23-26,5,5,19,48.209412,10.958824,,,37,752
7318392,89181010588227347,M23-26,5,6,17,48.209412,10.958824,,,37,752
7318393,89181010588227347,M23-26,5,4,14,48.209412,10.958824,,,37,752
7318394,89181010588227347,M23-26,5,4,14,48.209412,10.958824,,,37,752
7318395,89181010588227347,M23-26,5,6,17,48.209412,10.958824,,,37,752


In [81]:
X = train_f.drop('group', 1)

In [82]:
X.head()

Unnamed: 0,device_id,month,day,time,longitude_y,latitude_y,app_id,label_id,brand,device
0,-8076087639492063270,,,,,,,,37,749
1,-2897161552818060146,,,,,,,,37,749
2,-8260683887967679142,5.0,1.0,14.0,0.0,0.0,,,37,749
3,-4938849341048082022,,,,,,,,37,1524
4,245133531816851882,,,,,,,,37,753


In [105]:
y = age_model.group_label

In [106]:
y.head()

0    10
1    10
2    10
3     9
4     9
Name: group_label, dtype: int64

In [None]:
# nan 값 치환

In [99]:
X.fillna(-1, inplace = True)

In [111]:
test_model.fillna(-1, inplace = True)

In [112]:
test_model.head()

Unnamed: 0,device_id,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active,label_id,brand,device
0,1002079943728939269,460577.0,5,3,21,0.0,0.0,-1.0,-1.0,-1.0,-1.0,37,1482
1,1002079943728939269,755837.0,5,5,22,0.0,0.0,-1.0,-1.0,-1.0,-1.0,37,1482
2,1002079943728939269,1171252.0,5,2,8,0.0,0.0,-1.0,-1.0,-1.0,-1.0,37,1482
3,1002079943728939269,1805074.0,5,1,16,0.0,0.0,-1.0,-1.0,-1.0,-1.0,37,1482
4,1002079943728939269,2145937.0,5,5,8,0.0,0.0,-1.0,-1.0,-1.0,-1.0,37,1482


In [100]:
X.head()

Unnamed: 0,device_id,month,day,time,longitude_y,latitude_y,app_id,label_id,brand,device
0,-8076087639492063270,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,37,749
1,-2897161552818060146,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,37,749
2,-8260683887967679142,5,1,14,0.0,0.0,-1.0,-1.0,37,749
3,-4938849341048082022,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,37,1524
4,245133531816851882,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,37,753


In [None]:
# y label encoding

In [103]:
age_model['group_label'] = le.fit_transform(age_model['group'])

In [104]:
age_model.head()

Unnamed: 0,device_id,gender,age,group,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active,label_id,brand,device,group_label
0,-8076087639492063270,M,35,M32-38,,,,,,,,,,,37,749,10
1,-2897161552818060146,M,35,M32-38,,,,,,,,,,,37,749,10
2,-8260683887967679142,M,35,M32-38,2479660.0,5.0,1.0,14.0,0.0,0.0,,,,,37,749,10
3,-4938849341048082022,M,30,M29-31,,,,,,,,,,,37,1524,9
4,245133531816851882,M,30,M29-31,,,,,,,,,,,37,753,9


In [None]:
# Logistic regression

In [76]:
from sklearn.linear_model import LogisticRegression

In [86]:
from sklearn.model_selection import KFold, cross_val_score

In [85]:
cv = KFold(n_splits = 5, shuffle = True, random_state = 0)
for train_index, test_index in cv.split(X):
    print('test index : ', test_index)
    print('train index : ', train_index)
    print('-' * 80)

test index :  [      0       6       7 ..., 7318382 7318384 7318394]
train index :  [      1       2       3 ..., 7318392 7318393 7318395]
--------------------------------------------------------------------------------
test index :  [      1       4       8 ..., 7318375 7318379 7318392]
train index :  [      0       2       3 ..., 7318393 7318394 7318395]
--------------------------------------------------------------------------------
test index :  [      2       9      10 ..., 7318373 7318386 7318390]
train index :  [      0       1       3 ..., 7318393 7318394 7318395]
--------------------------------------------------------------------------------
test index :  [      5      12      13 ..., 7318387 7318388 7318395]
train index :  [      0       1       2 ..., 7318392 7318393 7318394]
--------------------------------------------------------------------------------
test index :  [      3      18      20 ..., 7318389 7318391 7318393]
train index :  [      0       1       2 ..., 731839

In [77]:
lr = LogisticRegression()

In [110]:
cross_val_score(lr, X, y, scoring = 'log_loss', cv = cv, n_jobs = -1)


array([-2.46793359, -2.46795482, -2.46793862, -2.46791191, -2.46799055])

In [114]:
from sklearn.model_selection import train_test_split

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [116]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [118]:
from sklearn.metrics import accuracy_score

In [119]:
y_pred = lr.predict(X_test)

In [121]:
accuracy_score(y_test, y_pred)

0.12140802438129886

In [133]:
y_pred_df = lr.predict_proba(X_test)

In [134]:
y_pred_df

array([[ 0.07519272,  0.0826096 ,  0.07577287, ...,  0.07873953,
         0.08543209,  0.08993332],
       [ 0.08759845,  0.07453989,  0.09198139, ...,  0.09368393,
         0.08397436,  0.09108098],
       [ 0.08714   ,  0.08517791,  0.08613267, ...,  0.08421172,
         0.08202304,  0.07835666],
       ..., 
       [ 0.08380962,  0.08312036,  0.08390138, ...,  0.08381715,
         0.08326781,  0.08326684],
       [ 0.07825476,  0.08537585,  0.07739926, ...,  0.07841014,
         0.08414273,  0.08443378],
       [ 0.08198979,  0.08079544,  0.08328494, ...,  0.08463858,
         0.08421695,  0.08745206]])

In [136]:
result = pd.DataFrame(y_pred_df, columns = le.classes_)
device_id = X_test["device_id"].values
result['device_id'] = device_id
result = result.set_index('device_id')

In [137]:
result.head()

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
8975547711407443166,0.075193,0.08261,0.075773,0.081467,0.082517,0.099131,0.081003,0.084038,0.084164,0.07874,0.085432,0.089933
-7543605604869881675,0.087598,0.07454,0.091981,0.080593,0.074637,0.062352,0.083672,0.088031,0.087856,0.093684,0.083974,0.091081
-4047091239451547677,0.08714,0.085178,0.086133,0.084786,0.085212,0.078178,0.084537,0.082144,0.082102,0.084212,0.082023,0.078357
-1059933011524840053,0.082829,0.080026,0.084487,0.081961,0.080039,0.079382,0.082945,0.085256,0.085227,0.085837,0.084183,0.08783
1476664663289716375,0.080084,0.07946,0.082084,0.081207,0.079444,0.08419,0.082134,0.085682,0.085692,0.084553,0.084962,0.090508


In [None]:
# 그래프.....ㅡㅡaaa

In [122]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

In [129]:
def plot_decision_regions(X, y, classifier, test_idx = None, resolution = 0.02):
    markers = ('1', '2', '3', '4', '5', '6', '7', '8', '9', 'o', 's', 'x')
    colors = ('#FF0000', '#FF5E00', '#FFBB00', '#FFE400', '#F15F5F', '#FF00DD',
              '#00D8FF', '#0054FF',  '#0100FF', '#5F00FF', '#6799FF', '#3DB7CC')
    cmap = ListedColormap(colors[:len(np.unique(y))])
    
    x1_min, x1_max = X[:, 14].min() - 1, X[:, 14].max() + 1
    x2_min, x2_max = X[:, 15].min() - 1, X[:, 15].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                          np.arange(x2_min, x2_max, resolution))
    
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    
    X_test, y_test = X[test_idx, :], y[test_idx]
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y = X[y == cl, 1],
                   alpha = 0.8, c = cmap(idx), marker = markers[idx], label = cl)
    if test_idx:
        X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[: 0], X_test[:, 1], c = '', alpha = 1.0, linewithd = 1, marker = 'o', s = 55, label = 'test set')

In [130]:
plot_decision_regions(X_test, y_test, classifier = lr)
plt.show()

TypeError: unhashable type: 'slice'

In [113]:
y_pred = lr.predict_proba(test_model)

NotFittedError: Call fit before prediction