# 웹사이트 카테고리 분류
* 목표 : 시각화 및 NLP 등 여러가지 기법을 사용해서 디테일한 카테고리 분류
## 왜 함?
* 웹 사이트 이용자들의 혼동을 줄여 이용자들의 편의 증가
* 각 카테고리 별 새로운 웹 사이트 모둠 개발 가능
* 오분류 제거
## 지표
- F1-Score로 성능을 확인한다.
-------

### 라이브러리

In [34]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

### 데이터셋 확인

In [35]:
def load_website_data(csv_path=Path('./datasets/website_classification.csv')):
    """Load the website-classification dataset into a DataFrame.

    Args:
        csv_path: Location of the CSV file (str or Path). Defaults to the
            dataset path used by this notebook, so existing no-argument
            calls behave exactly as before.

    Returns:
        pandas.DataFrame holding the raw CSV contents.
    """
    return pd.read_csv(Path(csv_path))

# Load the dataset once; the following cells work on this frame.
website_df = load_website_data()

# Preview the first rows.
website_df.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


In [36]:
# Print column names, non-null counts and dtypes of the loaded frame.
website_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408 entries, 0 to 1407
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            1408 non-null   int64 
 1   website_url           1408 non-null   object
 2   cleaned_website_text  1408 non-null   object
 3   Category              1408 non-null   object
dtypes: int64(1), object(3)
memory usage: 44.1+ KB


### 카테고리 종류 확인

In [37]:
# Count how many rows belong to each category.
website_df['Category'].value_counts()

Education                          114
Business/Corporate                 109
Travel                             107
Streaming Services                 105
Sports                             104
E-Commerce                         102
Games                               98
News                                96
Health and Fitness                  96
Photography                         93
Computers and Technology            93
Food                                92
Law and Government                  84
Social Networking and Messaging     83
Forums                              16
Adult                               16
Name: Category, dtype: int64

### URL의 www. 와 .com 사이 문자열(도메인 이름)을 추출해 카테고리와의 연관성 확인

In [38]:
# Peek at the raw URLs before extracting the host part.
website_df['website_url'].head()

0       https://www.booking.com/index.html?aid=1743217
1                     https://travelsites.com/expedia/
2                 https://travelsites.com/tripadvisor/
3                https://www.momondo.in/?ispredir=true
4    https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...
Name: website_url, dtype: object

### https://, http://, www. 유무와 관계없이
* 도메인만 추출

In [39]:
from urllib.parse import urlparse

In [40]:
def _extract_netloc(url):
    """Return the host part of *url*, tolerating a missing scheme."""
    netloc = urlparse(url).netloc
    # urlparse only recognises a host when it is introduced by '//', so a
    # scheme-less URL such as 'www.example.com/path' yields an empty netloc;
    # retry with the '//' prefix in that case.
    return netloc or urlparse('//' + url).netloc


# Reduce every URL to its host (e.g. 'www.booking.com').
url_split = website_df['website_url'].apply(_extract_netloc)

url_split.head()

0     www.booking.com
1     travelsites.com
2     travelsites.com
3      www.momondo.in
4    www.ebookers.com
Name: website_url, dtype: object

### domain 필요 없는 부분 제거

In [41]:
# Strip only a leading, literal "www." from each host. The pattern is
# anchored and the dot escaped so that the result does not depend on the
# pandas default for `regex`, and so that "www" followed by an arbitrary
# character in the middle of a host is left untouched.
url_split = url_split.str.replace(r'^www\.', '', regex=True)

url_split.head()

  url_split = url_split.str.replace('www.', '')


0        booking.com
1    travelsites.com
2    travelsites.com
3         momondo.in
4       ebookers.com
Name: website_url, dtype: object

In [42]:
# Remove the literal ".com" label. The dot is escaped (an unescaped "." in a
# regex matches any character, e.g. the "e" in "telecom"), and the lookahead
# requires the label to end there, so hosts such as "x.community" are kept.
url_split2 = url_split.str.replace(r'\.com(?=[.:]|$)', '', regex=True)
url_split2.head()

  url_split2 = url_split.str.replace('.com', '')


0        booking
1    travelsites
2    travelsites
3     momondo.in
4       ebookers
Name: website_url, dtype: object

In [43]:
# Remove the literal ".org" label. The dot is escaped (an unescaped "." in a
# regex matches any character) and the lookahead requires the label to end
# there, so hosts such as "x.organic" are kept.
url_split3 = url_split2.str.replace(r'\.org(?=[.:]|$)', '', regex=True)
url_split3.head()

  url_split3 = url_split2.str.replace('.org', '')


0        booking
1    travelsites
2    travelsites
3     momondo.in
4       ebookers
Name: website_url, dtype: object

In [44]:
# List the current column names.
website_df.columns

Index(['Unnamed: 0', 'website_url', 'cleaned_website_text', 'Category'], dtype='object')

In [45]:
# Put the cleaned domain in the first column, then discard the two columns
# that are no longer needed.
website_df.insert(loc=0, column='Domain', value=url_split3)
website_df.drop(columns=['Unnamed: 0', 'website_url'], inplace=True)
website_df.head()

Unnamed: 0,Domain,cleaned_website_text,Category
0,booking,official site good hotel accommodation big sav...,Travel
1,travelsites,expedia hotel book sites like use vacation wor...,Travel
2,travelsites,tripadvisor hotel book sites like previously d...,Travel
3,momondo.in,cheap flights search compare flights momondo f...,Travel
4,ebookers,bot create free account create free account si...,Travel


### 문자열 유사도 (도메인과 클래스 이름의 유사도)

In [46]:
# Education

import difflib

to_match = 'education'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_education = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_education[0:5]

['0.25', '0.2', '0.2', '0.21052631578947367', '0.23529411764705882']

In [47]:
# Business

import difflib

to_match = 'business'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_business = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_business[0:5]

['0.4',
 '0.42105263157894735',
 '0.42105263157894735',
 '0.2222222222222222',
 '0.25']

In [48]:
# Corporate

import difflib

to_match = 'corporate'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_corporate = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_corporate[0:5]

['0.25', '0.4', '0.4', '0.21052631578947367', '0.23529411764705882']

In [49]:
# Travel

import difflib

to_match = 'travel'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_travel = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_travel[0:5]

['0.0',
 '0.7058823529411765',
 '0.7058823529411765',
 '0.0',
 '0.14285714285714285']

In [50]:
# Sports

import difflib

to_match = 'sport'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_sports = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_sports[0:5]

['0.16666666666666666',
 '0.25',
 '0.25',
 '0.13333333333333333',
 '0.15384615384615385']

In [51]:
# Games

import difflib

to_match = 'game'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_games = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_games[0:5]

['0.18181818181818182',
 '0.26666666666666666',
 '0.26666666666666666',
 '0.14285714285714285',
 '0.16666666666666666']

In [52]:
# News

import difflib

to_match = 'news'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_news = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_news[0:5]

['0.18181818181818182',
 '0.26666666666666666',
 '0.26666666666666666',
 '0.14285714285714285',
 '0.3333333333333333']

In [53]:
# Health

import difflib

to_match = 'health'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_health = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_health[0:5]

['0.0',
 '0.35294117647058826',
 '0.35294117647058826',
 '0.0',
 '0.14285714285714285']

In [54]:
# Fitness

import difflib

to_match = 'fitness'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_fitness = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_fitness[0:5]

['0.2857142857142857',
 '0.4444444444444444',
 '0.4444444444444444',
 '0.23529411764705882',
 '0.26666666666666666']

In [55]:
# Commerce (sales)

import difflib

to_match = 'commerce'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_commerce = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_commerce[0:5]

['0.13333333333333333',
 '0.21052631578947367',
 '0.21052631578947367',
 '0.2222222222222222',
 '0.375']

In [56]:
# Photo

import difflib

to_match = 'photo'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_photo = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_photo[0:5]

['0.3333333333333333',
 '0.125',
 '0.125',
 '0.26666666666666666',
 '0.3076923076923077']

In [57]:
# Computer

import difflib

to_match = 'computer'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_computer = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_computer[0:5]

['0.13333333333333333',
 '0.21052631578947367',
 '0.21052631578947367',
 '0.2222222222222222',
 '0.375']

In [58]:
# Technology

import difflib

to_match = 'tech'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_tech = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_tech[0:5]

['0.0',
 '0.26666666666666666',
 '0.26666666666666666',
 '0.0',
 '0.16666666666666666']

In [59]:
# Food

import difflib

to_match = 'food'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_food = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_food[0:5]

['0.36363636363636365',
 '0.0',
 '0.0',
 '0.42857142857142855',
 '0.3333333333333333']

In [60]:
# Law

import difflib

to_match = 'law'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_law = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_law[0:5]

['0.0', '0.14285714285714285', '0.14285714285714285', '0.0', '0.0']

In [61]:
# Government (institutions)

import difflib

to_match = 'government'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_government = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_government[0:5]

['0.11764705882352941',
 '0.2857142857142857',
 '0.2857142857142857',
 '0.3',
 '0.3333333333333333']

In [62]:
# Messaging, chat

import difflib

to_match = 'message'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_message = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_message[0:5]

['0.14285714285714285',
 '0.2222222222222222',
 '0.2222222222222222',
 '0.11764705882352941',
 '0.26666666666666666']

In [63]:
# Forum

import difflib

to_match = 'forum'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_forum = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_forum[0:5]

['0.16666666666666666',
 '0.125',
 '0.125',
 '0.26666666666666666',
 '0.3076923076923077']

In [64]:
# Adult

import difflib

to_match = 'adult'
candidates = website_df['Domain']

# SequenceMatcher ratio of the keyword against every domain, kept as strings.
similarity_adult = [
    str(difflib.SequenceMatcher(None, to_match, domain).ratio())
    for domain in candidates
]

similarity_adult[0:5]

['0.0', '0.375', '0.375', '0.13333333333333333', '0.0']

### 유사도 포함한 데이터 프레임

In [65]:
# Every column is inserted at position 0, so the frame ends up with the
# columns in the reverse of the order listed here (Adult first, Education
# last among the similarity columns).
similarity_columns = [
    ('word_sim_Education', similarity_education),
    ('word_sim_Business', similarity_business),
    ('word_sim_Corporate', similarity_corporate),
    ('word_sim_Travel', similarity_travel),
    ('word_sim_Sports', similarity_sports),
    ('word_sim_Games', similarity_games),
    ('word_sim_News', similarity_news),
    ('word_sim_Health', similarity_health),
    ('word_sim_Fitness', similarity_fitness),
    ('word_sim_Commerce', similarity_commerce),
    ('word_sim_Photo', similarity_photo),
    ('word_sim_Computer', similarity_computer),
    ('word_sim_Tech', similarity_tech),
    ('word_sim_Food', similarity_food),
    ('word_sim_Law', similarity_law),
    ('word_sim_Government', similarity_government),
    ('word_sim_Message', similarity_message),
    ('word_sim_Forum', similarity_forum),
    ('word_sim_Adult', similarity_adult),
]
for column_name, scores in similarity_columns:
    website_df.insert(0, column_name, scores)

website_df.head()

Unnamed: 0,word_sim_Adult,word_sim_Forum,word_sim_Message,word_sim_Government,word_sim_Law,word_sim_Food,word_sim_Tech,word_sim_Computer,word_sim_Photo,word_sim_Commerce,...,word_sim_News,word_sim_Games,word_sim_Sports,word_sim_Travel,word_sim_Corporate,word_sim_Business,word_sim_Education,Domain,cleaned_website_text,Category
0,0.0,0.1666666666666666,0.1428571428571428,0.1176470588235294,0.0,0.3636363636363636,0.0,0.1333333333333333,0.3333333333333333,0.1333333333333333,...,0.1818181818181818,0.1818181818181818,0.1666666666666666,0.0,0.25,0.4,0.25,booking,official site good hotel accommodation big sav...,Travel
1,0.375,0.125,0.2222222222222222,0.2857142857142857,0.1428571428571428,0.0,0.2666666666666666,0.2105263157894736,0.125,0.2105263157894736,...,0.2666666666666666,0.2666666666666666,0.25,0.7058823529411765,0.4,0.4210526315789473,0.2,travelsites,expedia hotel book sites like use vacation wor...,Travel
2,0.375,0.125,0.2222222222222222,0.2857142857142857,0.1428571428571428,0.0,0.2666666666666666,0.2105263157894736,0.125,0.2105263157894736,...,0.2666666666666666,0.2666666666666666,0.25,0.7058823529411765,0.4,0.4210526315789473,0.2,travelsites,tripadvisor hotel book sites like previously d...,Travel
3,0.1333333333333333,0.2666666666666666,0.1176470588235294,0.3,0.0,0.4285714285714285,0.0,0.2222222222222222,0.2666666666666666,0.2222222222222222,...,0.1428571428571428,0.1428571428571428,0.1333333333333333,0.0,0.2105263157894736,0.2222222222222222,0.2105263157894736,momondo.in,cheap flights search compare flights momondo f...,Travel
4,0.0,0.3076923076923077,0.2666666666666666,0.3333333333333333,0.0,0.3333333333333333,0.1666666666666666,0.375,0.3076923076923077,0.375,...,0.3333333333333333,0.1666666666666666,0.1538461538461538,0.1428571428571428,0.2352941176470588,0.25,0.2352941176470588,ebookers,bot create free account create free account si...,Travel


### 유사도를 float로 변환하고 소수점 셋째 자리까지 반올림

In [66]:
# Similarity columns currently hold stringified ratios; cast them all to
# float in a single assignment.
type_list = [
    'word_sim_Education',
    'word_sim_Business',
    'word_sim_Corporate',
    'word_sim_Travel',
    'word_sim_Sports',
    'word_sim_Games',
    'word_sim_News',
    'word_sim_Health',
    'word_sim_Fitness',
    'word_sim_Commerce',
    'word_sim_Photo',
    'word_sim_Computer',
    'word_sim_Tech',
    'word_sim_Food',
    'word_sim_Law',
    'word_sim_Government',
    'word_sim_Message',
    'word_sim_Forum',
    'word_sim_Adult',
]
website_df[type_list] = website_df[type_list].astype(float)

website_df.dtypes

word_sim_Adult          float64
word_sim_Forum          float64
word_sim_Message        float64
word_sim_Government     float64
word_sim_Law            float64
word_sim_Food           float64
word_sim_Tech           float64
word_sim_Computer       float64
word_sim_Photo          float64
word_sim_Commerce       float64
word_sim_Fitness        float64
word_sim_Health         float64
word_sim_News           float64
word_sim_Games          float64
word_sim_Sports         float64
word_sim_Travel         float64
word_sim_Corporate      float64
word_sim_Business       float64
word_sim_Education      float64
Domain                   object
cleaned_website_text     object
Category                 object
dtype: object

In [67]:
# Round the numeric columns to three decimal places.
website_df = website_df.round(3)
website_df.head()

Unnamed: 0,word_sim_Adult,word_sim_Forum,word_sim_Message,word_sim_Government,word_sim_Law,word_sim_Food,word_sim_Tech,word_sim_Computer,word_sim_Photo,word_sim_Commerce,...,word_sim_News,word_sim_Games,word_sim_Sports,word_sim_Travel,word_sim_Corporate,word_sim_Business,word_sim_Education,Domain,cleaned_website_text,Category
0,0.0,0.167,0.143,0.118,0.0,0.364,0.0,0.133,0.333,0.133,...,0.182,0.182,0.167,0.0,0.25,0.4,0.25,booking,official site good hotel accommodation big sav...,Travel
1,0.375,0.125,0.222,0.286,0.143,0.0,0.267,0.211,0.125,0.211,...,0.267,0.267,0.25,0.706,0.4,0.421,0.2,travelsites,expedia hotel book sites like use vacation wor...,Travel
2,0.375,0.125,0.222,0.286,0.143,0.0,0.267,0.211,0.125,0.211,...,0.267,0.267,0.25,0.706,0.4,0.421,0.2,travelsites,tripadvisor hotel book sites like previously d...,Travel
3,0.133,0.267,0.118,0.3,0.0,0.429,0.0,0.222,0.267,0.222,...,0.143,0.143,0.133,0.0,0.211,0.222,0.211,momondo.in,cheap flights search compare flights momondo f...,Travel
4,0.0,0.308,0.267,0.333,0.0,0.333,0.167,0.375,0.308,0.375,...,0.333,0.167,0.154,0.143,0.235,0.25,0.235,ebookers,bot create free account create free account si...,Travel


In [68]:
# Encode each category as an integer id, numbered in order of first appearance.
website_df['category_id'] = pd.factorize(website_df['Category'])[0]

# One row per (Category, category_id) pair.
category_id_df = website_df.loc[:, ['Category', 'category_id']].drop_duplicates()

# Lookup tables in both directions.
category_to_id = dict(category_id_df.to_numpy())
id_to_category = dict(category_id_df.loc[:, ['category_id', 'Category']].to_numpy())

website_df.head()

Unnamed: 0,word_sim_Adult,word_sim_Forum,word_sim_Message,word_sim_Government,word_sim_Law,word_sim_Food,word_sim_Tech,word_sim_Computer,word_sim_Photo,word_sim_Commerce,...,word_sim_Games,word_sim_Sports,word_sim_Travel,word_sim_Corporate,word_sim_Business,word_sim_Education,Domain,cleaned_website_text,Category,category_id
0,0.0,0.167,0.143,0.118,0.0,0.364,0.0,0.133,0.333,0.133,...,0.182,0.167,0.0,0.25,0.4,0.25,booking,official site good hotel accommodation big sav...,Travel,0
1,0.375,0.125,0.222,0.286,0.143,0.0,0.267,0.211,0.125,0.211,...,0.267,0.25,0.706,0.4,0.421,0.2,travelsites,expedia hotel book sites like use vacation wor...,Travel,0
2,0.375,0.125,0.222,0.286,0.143,0.0,0.267,0.211,0.125,0.211,...,0.267,0.25,0.706,0.4,0.421,0.2,travelsites,tripadvisor hotel book sites like previously d...,Travel,0
3,0.133,0.267,0.118,0.3,0.0,0.429,0.0,0.222,0.267,0.222,...,0.143,0.133,0.0,0.211,0.222,0.211,momondo.in,cheap flights search compare flights momondo f...,Travel,0
4,0.0,0.308,0.267,0.333,0.0,0.333,0.167,0.375,0.308,0.375,...,0.167,0.154,0.143,0.235,0.25,0.235,ebookers,bot create free account create free account si...,Travel,0


In [69]:
# Show the Category -> category_id lookup table built above.
category_id_df

Unnamed: 0,Category,category_id
0,Travel,0
36,Social Networking and Messaging,1
43,News,2
107,Streaming Services,3
213,Sports,4
398,Photography,5
586,Law and Government,6
670,Health and Fitness,7
768,Games,8
810,E-Commerce,9


In [70]:
# Share of rows per category_id, rounded to four decimals.
website_df['category_id'].value_counts(normalize=True).round(4)

12    0.0810
14    0.0774
0     0.0760
3     0.0746
4     0.0739
9     0.0724
8     0.0696
2     0.0682
7     0.0682
5     0.0661
13    0.0661
11    0.0653
6     0.0597
1     0.0589
10    0.0114
15    0.0114
Name: category_id, dtype: float64

In [71]:
# Keep only the similarity features and the category_id target.
website_df.drop(columns=['Domain', 'cleaned_website_text', 'Category'], inplace=True)

website_df.head()

Unnamed: 0,word_sim_Adult,word_sim_Forum,word_sim_Message,word_sim_Government,word_sim_Law,word_sim_Food,word_sim_Tech,word_sim_Computer,word_sim_Photo,word_sim_Commerce,word_sim_Fitness,word_sim_Health,word_sim_News,word_sim_Games,word_sim_Sports,word_sim_Travel,word_sim_Corporate,word_sim_Business,word_sim_Education,category_id
0,0.0,0.167,0.143,0.118,0.0,0.364,0.0,0.133,0.333,0.133,0.286,0.0,0.182,0.182,0.167,0.0,0.25,0.4,0.25,0
1,0.375,0.125,0.222,0.286,0.143,0.0,0.267,0.211,0.125,0.211,0.444,0.353,0.267,0.267,0.25,0.706,0.4,0.421,0.2,0
2,0.375,0.125,0.222,0.286,0.143,0.0,0.267,0.211,0.125,0.211,0.444,0.353,0.267,0.267,0.25,0.706,0.4,0.421,0.2,0
3,0.133,0.267,0.118,0.3,0.0,0.429,0.0,0.222,0.267,0.222,0.235,0.0,0.143,0.143,0.133,0.0,0.211,0.222,0.211,0
4,0.0,0.308,0.267,0.333,0.0,0.333,0.167,0.375,0.308,0.375,0.267,0.143,0.333,0.167,0.154,0.143,0.235,0.25,0.235,0


## 여러 그래프로 유사도 특징 확인하기

In [72]:
from sklearn.model_selection import train_test_split

def get_train_test_dataset(df=None):
    """Split *df* into stratified train and test partitions.

    Every column except the last is treated as a feature and the last
    column as the target. 30% of the rows go to the test set; the split is
    stratified on the target and seeded with ``random_state=0``.

    Args:
        df: Frame whose last column holds the target labels.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features, target = df.iloc[:, :-1], df.iloc[:, -1]
    return tuple(
        train_test_split(
            features,
            target,
            test_size=0.3,
            random_state=0,
            stratify=target,
        )
    )

# Split the feature frame into the train/test partitions used by the cells below.
X_train, X_test, y_train, y_test = get_train_test_dataset(website_df)

In [73]:
# Percentage of each label in the training partition.
print('학습 데이터 레이블 값 비율')
print(y_train.value_counts() / len(y_train) * 100, '\n')

# Percentage of each label in the test partition.
print('테스트 데이터 레이블 값 비율')
print(y_test.value_counts() / len(y_test) * 100)

학습 데이터 레이블 값 비율
12    8.121827
14    7.715736
0     7.614213
3     7.512690
4     7.411168
9     7.208122
8     7.005076
7     6.802030
2     6.802030
5     6.598985
13    6.598985
11    6.497462
6     5.989848
1     5.888325
15    1.116751
10    1.116751
Name: category_id, dtype: float64 

테스트 데이터 레이블 값 비율
12    8.037825
14    7.801418
0     7.565012
3     7.328605
9     7.328605
4     7.328605
7     6.855792
2     6.855792
8     6.855792
13    6.619385
5     6.619385
11    6.619385
6     5.910165
1     5.910165
15    1.182033
10    1.182033
Name: category_id, dtype: float64


In [74]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print macro-averaged precision, recall and F1 for the predictions.

    Args:
        y_test: True labels.
        pred: Predicted labels.
        pred_proba: Accepted but not used by this function.
    """
    report_template = '정밀도: {0:.4f}, 재현률: {1:.4f}, F1: {2:.4f}'
    macro_scores = [
        metric(y_test, pred, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    ]
    print(report_template.format(*macro_scores))

In [75]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression baseline on the similarity features.
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train, y_train)

# lr_pred: predicted category_id for every row of X_test.
lr_pred = lr_clf.predict(X_test)
# Keep the probabilities for every class. Slicing column 1 is the
# binary-classification idiom; with 16 categories it would keep only the
# probability of category_id 1.
lr_pred_proba = lr_clf.predict_proba(X_test)

get_clf_eval(y_test, lr_pred, lr_pred_proba)

정밀도: 0.2067, 재현률: 0.2417, F1: 0.2073


  _warn_prf(average, modifier, msg_start, len(result))
