# Read Data

In [1]:
#!pip install plotly_express
#!pip install pandas 

In [2]:
# import libraries
import pandas as pd 
import plotly_express as px 
import json

In [3]:
# first dataset
try:
    url1 = 'datasets/tinder.csv'
    tinder = pd.read_csv(url1, on_bad_lines='skip')
    print('local')
except:
    print('remote')
    url1 = 'https://raw.githubusercontent.com/dataPracticum/code_pudding_v2/main/Tinder.csv'
    tinder = pd.read_csv(url1, on_bad_lines='skip', names=['id', 'usage', 'university', 'answer', 'matches', 'percentage', 'relationship'])

local


In [4]:
tinder.head()

Unnamed: 0,id,usage,university,answer,matches,percentage,relationship
0,292890.897,web,"Meridian, Idaho",No,0.0,0.0,0
1,292887.987,web,"Meridian, Idaho",No,0.0,0.0,0
2,292894.0656,gender,"Meridian, Idaho",No,499.173606,0.225255,0
3,292887.118,web,"Meridian, Idaho",No,0.0,0.0,0
4,292893.6561,gender,"Meridian, Idaho",No,455.925963,0.21136,0


In [5]:
# function to convert to json
def convert_json(df):
    df_json = df.to_json()
    data = json.loads(df_json)
    new_df = json.dumps(data, indent=2)
    return print(new_df)

In [6]:
# convert_json(tinder)

# Introduction

This mini dataset consists of questions answered by millennials from various universities based on their usage of Tinder app

## Tinder

In [7]:
# look at data
tinder.head()

Unnamed: 0,id,usage,university,answer,matches,percentage,relationship
0,292890.897,web,"Meridian, Idaho",No,0.0,0.0,0
1,292887.987,web,"Meridian, Idaho",No,0.0,0.0,0
2,292894.0656,gender,"Meridian, Idaho",No,499.173606,0.225255,0
3,292887.118,web,"Meridian, Idaho",No,0.0,0.0,0
4,292893.6561,gender,"Meridian, Idaho",No,455.925963,0.21136,0


In [8]:
# missing values
tinder.isna().sum()

id              0
usage           0
university      0
answer          0
matches         0
percentage      0
relationship    0
dtype: int64

In [9]:
# looking for duplicates
tinder[tinder.duplicated()]

Unnamed: 0,id,usage,university,answer,matches,percentage,relationship
1295,292889.9062,web,"Meridian, Idaho",No,0.0,0.0,0
1636,292887.7623,web,"Meridian, Idaho",No,0.0,0.0,0


Different id's so they are not actually duplicates

In [10]:
# values of answer
tinder.answer.value_counts()

No                    1571
Yes                    164
I don't use Tinder     161
Name: answer, dtype: int64

In [11]:
# tinder columns
tinder.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1896 entries, 0 to 1895
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            1896 non-null   float64
 1   usage         1896 non-null   object 
 2   university    1896 non-null   object 
 3   answer        1896 non-null   object 
 4   matches       1896 non-null   float64
 5   percentage    1896 non-null   float64
 6   relationship  1896 non-null   int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 103.8+ KB


Tinder is a relatively small dataset with 1,896 values. Target value is relationship, where it determines whether the individual found a relationship using the app. We can use this data to run a ML model to predict wether an individual will get a relationship from Tinder. 

Usage indicates the medium of usage, and Tinder has a mobile-based, web-based app as well as a separate feature called Tinder U. Not sure what Gender represents.

### EDA

In [12]:
# correlation
tinder.corr()

Unnamed: 0,id,matches,percentage,relationship
id,1.0,-0.42979,0.164302,0.167766
matches,-0.42979,1.0,0.4122,0.186139
percentage,0.164302,0.4122,1.0,0.311429
relationship,0.167766,0.186139,0.311429,1.0


In [13]:
# summary statistics
tinder.describe()

Unnamed: 0,id,matches,percentage,relationship
count,1896.0,1896.0,1896.0,1896.0
mean,292999.39475,446.881267,0.254098,0.350738
std,227.809806,494.110578,0.220809,0.477327
min,292881.0,0.0,0.0,0.0
25%,292888.196925,0.0,0.0,0.0
50%,292893.9044,400.439633,0.257873,0.0
75%,292902.56875,845.560798,0.396787,1.0
max,293785.0,1969.0,1.0,1.0


In [14]:
# Skew
tinder.skew()

  tinder.skew()


id              2.044382
matches         0.716367
percentage      0.816940
relationship    0.626066
dtype: float64

In [15]:
# answers to question: do you use tinder?
tinder.answer.value_counts()

No                    1571
Yes                    164
I don't use Tinder     161
Name: answer, dtype: int64

In [16]:
# compile no answers
tinder.answer.replace('I don\'t use Tinder', 'No', inplace=True)

In [17]:
# answers to question: do you use tinder?
tinder.answer.value_counts()

No     1732
Yes     164
Name: answer, dtype: int64

In [18]:
# use tinder and found relationship
tinder[tinder.answer=='Yes'].relationship.value_counts()

1    86
0    78
Name: relationship, dtype: int64

In [19]:
# relationship found using tinder 
print(86/(86+78) * 100, '%')

52.4390243902439 %


In [20]:
# dont use tinder
tinder[tinder.answer=='No'].relationship.value_counts()

0    1153
1     579
Name: relationship, dtype: int64

In [21]:
# relationship found not using tinder
print(579/(1153+579) * 100, '%')

33.429561200923786 %


In [22]:
# dataframe comparing relationship success
data = {'category': ['Tinder', 'Not Tinder'], 'percentage':[52.44, 33.43]}
comparison = pd.DataFrame(data)

In [23]:
# convert_json(comparison)

##### Bar 1

In [24]:
px.bar(comparison, x='category', y='percentage', color='category', title='Relationship Success Rates')

In [25]:
convert_json(px.bar(comparison, x='category', y='percentage', color='category', title='Relationship Success Rates'))

{
  "data": [
    {
      "alignmentgroup": "True",
      "hovertemplate": "category=%{x}<br>percentage=%{y}<extra></extra>",
      "legendgroup": "Tinder",
      "marker": {
        "color": "#636efa",
        "pattern": {
          "shape": ""
        }
      },
      "name": "Tinder",
      "offsetgroup": "Tinder",
      "orientation": "v",
      "showlegend": true,
      "textposition": "auto",
      "x": [
        "Tinder"
      ],
      "xaxis": "x",
      "y": [
        52.44
      ],
      "yaxis": "y",
      "type": "bar"
    },
    {
      "alignmentgroup": "True",
      "hovertemplate": "category=%{x}<br>percentage=%{y}<extra></extra>",
      "legendgroup": "Not Tinder",
      "marker": {
        "color": "#EF553B",
        "pattern": {
          "shape": ""
        }
      },
      "name": "Not Tinder",
      "offsetgroup": "Not Tinder",
      "orientation": "v",
      "showlegend": true,
      "textposition": "auto",
      "x": [
        "Not Tinder"
      ],
      "xaxis"

In [26]:
# university names
tinder.university.value_counts()

Meridian, Idaho                       1421
Westport, CT                            14
University of Pennsylvania              12
University of Mississippi               10
University of Washington                10
                                      ... 
Boston College                           1
Southern Methodist University            1
Mississippi State University             1
Washington University in St. Louis       1
North Carolina State                     1
Name: university, Length: 148, dtype: int64

In [27]:
# medium of usage
tinder.usage.value_counts()

gender        488
mobile        483
university    464
web           461
Name: usage, dtype: int64

In [28]:
# target values
tinder.relationship.value_counts()

0    1231
1     665
Name: relationship, dtype: int64

##### Fig 2

In [29]:
convert_json(tinder.answer.value_counts())

{
  "No": 1732,
  "Yes": 164
}


In [30]:
px.pie(tinder.answer, names='answer', title='Total Percentage of Tinder Users', template='plotly_white', height=600)

In [31]:
# convert_json(px.pie(tinder.answer, names='answer', title='Total Percentage of Tinder Users', template='plotly_white', height=600))

More people do not use Tinder

In [32]:
convert_json(tinder.relationship.value_counts())

{
  "0": 1231,
  "1": 665
}


In [33]:
px.pie(tinder.relationship, names='relationship', title='Total Percentage of Relationships', template='plotly_white', height=600)

More people are not ina relationship

In [34]:
px.pie(tinder[tinder.answer=='Yes'].relationship, names='relationship', title='Relationships Found Using Tinder', template='plotly_white', height=600)

A little more than half of the people that use tinder have found a relationship using the app

In [35]:
px.pie(tinder[tinder.answer=='No'].relationship, names='relationship', title='Found a Relationship Without Tinder', template='plotly_white', height=600,)


A third of People not using Tinder have found a relationship

In [36]:
columns = ['usage', 'answer', 'matches', 'percentage',
       'relationship']
for column in columns:
    px.histogram(tinder[column], title='Distribution of '+ str.upper(column), labels={'value': str.upper(column)}, template='plotly_white', color_discrete_sequence=['red']).show()

In [37]:
# number of unique universities
tinder.university.nunique()

148

There are 148 different universities in the dataset

In [38]:
tinder.university.value_counts().nlargest(20)

Meridian, Idaho                         1421
Westport, CT                              14
University of Pennsylvania                12
University of Mississippi                 10
University of Washington                  10
University of Southern California          9
Washington State University                8
University of Tampa                        8
Wesleyan University                        8
University of Wisconsin, Eau Claire        8
University of California, Santa Cruz       8
University of California, Berkeley         8
Williams College                           7
Vassar College                             7
Vanderbilt University                      7
University of Buffalo                      7
University of Oregon                       7
Union College                              7
Whatsgoodly University                     6
University of Delaware                     6
Name: university, dtype: int64

In [39]:
px.bar(tinder.university.value_counts().nlargest(20), title='Distribution of Most Common Universities', labels={'value': 'Universities'}, template='plotly_white', color_discrete_sequence=['red']).show()

Most common university is Meridian in Ohio. The other universities make up a small percentage, roughly 15%  of the rest of the schools.

In [40]:
columns = ['matches', 'percentage']
for column in columns:
    px.box(tinder[column], title='Distribution of '+ str.upper(column), labels={'value': str.upper(column)}, template='plotly_white', color_discrete_sequence=['red']).show()

Average matches are around 400, with the max at 1969. The minimm amount of matches is 0, while the upper quartile is 846. Most people get matches on .25% of swipes, while the interquartile range is 0-0.4%.  

In [41]:
px.imshow(tinder.corr(), text_auto=True, title='Tinder Correlations', template='plotly_white', aspect='equal', height=600, color_continuous_scale='hot')

## Statistical Testing 

In [42]:
# filter dataset for only tinder users
tinder_users = tinder.query("answer=='Yes'")

In [43]:
tinder_users.head()

Unnamed: 0,id,usage,university,answer,matches,percentage,relationship
7,293223.0,university,University of Iowa,Yes,1.0,0.333,0
19,292943.0117,university,"University of California, Davis",Yes,8.497078,0.193545,1
30,293163.0,university,Cal Poly San Luis Obispo,Yes,8.0,0.32,0
37,293272.2777,university,"University of California, Santa Cruz",Yes,4.361147,0.323162,1
45,293661.0,university,Morehead State University,Yes,0.0,0.0,0


In [44]:
tinder_relat = tinder_users.relationship.to_list()

In [45]:
non_users = tinder.query("answer=='No'")

In [46]:
non_users.head()

Unnamed: 0,id,usage,university,answer,matches,percentage,relationship
0,292890.897,web,"Meridian, Idaho",No,0.0,0.0,0
1,292887.987,web,"Meridian, Idaho",No,0.0,0.0,0
2,292894.0656,gender,"Meridian, Idaho",No,499.173606,0.225255,0
3,292887.118,web,"Meridian, Idaho",No,0.0,0.0,0
4,292893.6561,gender,"Meridian, Idaho",No,455.925963,0.21136,0


In [47]:
non_relat = non_users.relationship.to_list()

Null Hypothesis

    * Null Hypothesis that there is no difference in relationship status of Tinder users and non Users

In [48]:
from scipy import stats as st
import numpy as np

alpha = 0.05  # critical statistical significance level
# if the p-value is less than alpha, we reject the hypothesis

results = st.ttest_ind(tinder_relat, non_relat)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the samples are different")
else:
    print("We can't reject the null hypothesis")

p-value:  1.018892115051285e-06
We reject the null hypothesis, the samples are different


# Looking for Love in All the Wrong Places?

* Are you looking for love? What are your chances of finding it on tinder, and what are your chances of finding it by yourself? 
* Can we predict your chances on tinder based on your school and average stats?


## ML Predicting Relationship

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder

In [50]:
tinder.columns

Index(['id', 'usage', 'university', 'answer', 'matches', 'percentage',
       'relationship'],
      dtype='object')

In [51]:
# target balance of tinder users
y3.value_counts()

NameError: name 'y3' is not defined

Target values are balanced

In [None]:
# encoding categorical columns
encoder = OrdinalEncoder()
columns_to_encode = ['usage', 'university', 'answer']
data_ordinal = pd.DataFrame(encoder.fit_transform(tinder[columns_to_encode]), columns=columns_to_encode)
tinder.update(data_ordinal)


In [None]:
# label encoding tinder users dataset
encoder = OrdinalEncoder()
columns_to_encode = ['usage', 'university', 'answer']
data_ordinal3 = pd.DataFrame(encoder.fit_transform(tinder_users[columns_to_encode]), columns=columns_to_encode)
tinder_users.update(data_ordinal3)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# full dataset features and target
X = tinder.drop(columns=['relationship', 'id'])
y = tinder.relationship

In [None]:
# tinder dataset
X3 = tinder_users.drop(columns=['relationship', 'id'])
y3 = tinder_users.relationship

In [None]:
X.head()

Unnamed: 0,usage,university,answer,matches,percentage
0,3.0,46.0,0.0,0.0,0.0
1,3.0,46.0,0.0,0.0,0.0
2,0.0,46.0,0.0,499.173606,0.225255
3,3.0,46.0,0.0,0.0,0.0
4,0.0,46.0,0.0,455.925963,0.21136


In [None]:
X3.head()

In [169]:
y.value_counts()

0    1231
1     665
Name: relationship, dtype: int64

#### Usage Map

In [170]:
# What is your medium of usage?
usage = pd.DataFrame({'usage':['Gender', 'Mobile', 'University', 'Web'], 'values': [0, 1 , 2 ,3]})
usage

Unnamed: 0,usage,values
0,Gender,0
1,Mobile,1
2,University,2
3,Web,3


In [171]:
# convert_json(usage)

In [172]:
# show  map of coded values
tinder.usage.value_counts()

0.0    488
1.0    483
2.0    464
3.0    461
Name: usage, dtype: int64

In [173]:
# label encoded values
X.usage.value_counts()

0.0    488
1.0    483
2.0    464
3.0    461
Name: usage, dtype: int64

#### University Map

In [174]:
# What University do you attend?
uni = pd.DataFrame({'university':['Meridian, Idaho', 'Westport, CT', 'University of Pennsylvania', 'University of Mississippi', 'University of Washington', 'University of Southern California', 'Washington State University', 'University of Tampa', 'Wesleyan University', 'University of Wisconsin, Eau Claire', 'University of California, Santa Cruz', 'University of California, Berkeley', 'Williams College', 'Vassar College', 'Vanderbilt University', 'University of Buffalo', 'University of Oregon', 'Union College', 'Whatsgoodly University', 'University of Delaware']
, 'values': [46.0, 144.0, 123.0, 117.0, 133.0, 127.0, 140.0, 129.0, 143.0, 134.0, 97.0, 92.0, 146.0, 139.0, 138.0, 91.0, 122.0, 86.0, 145.0, 104.0]})
uni

Unnamed: 0,university,values
0,"Meridian, Idaho",46.0
1,"Westport, CT",144.0
2,University of Pennsylvania,123.0
3,University of Mississippi,117.0
4,University of Washington,133.0
5,University of Southern California,127.0
6,Washington State University,140.0
7,University of Tampa,129.0
8,Wesleyan University,143.0
9,"University of Wisconsin, Eau Claire",134.0


In [175]:
# convert_json(uni)

In [176]:
# show  map of coded values
tinder.university.value_counts().nlargest(20)

46.0     1421
144.0      14
123.0      12
117.0      10
133.0      10
127.0       9
140.0       8
129.0       8
143.0       8
134.0       8
97.0        8
92.0        8
146.0       7
139.0       7
138.0       7
91.0        7
122.0       7
86.0        7
145.0       6
104.0       6
Name: university, dtype: int64

In [177]:
# coded values
X.university.value_counts().nlargest(20)

46.0     1421
144.0      14
123.0      12
117.0      10
133.0      10
127.0       9
140.0       8
129.0       8
143.0       8
134.0       8
97.0        8
92.0        8
146.0       7
139.0       7
138.0       7
91.0        7
122.0       7
86.0        7
145.0       6
104.0       6
Name: university, dtype: int64

May be easier to assume Meridian, as it is the most prevalent, or at least one of the top 20 universities.

##### Answer Map

In [178]:
# Do you use Tinder?
answer = pd.DataFrame({'answer':['No', 'Yes'], 'values': [0, 1 ]})
answer

Unnamed: 0,answer,values
0,No,0
1,Yes,1


In [179]:
# convert_json(answer)

In [180]:
# map of coded values
tinder.answer.value_counts()

0.0    1732
1.0     164
Name: answer, dtype: int64

In [181]:
# coded values
X.answer.value_counts()

0.0    1732
1.0     164
Name: answer, dtype: int64

In [182]:
# new data to make predictions
new_data = pd.DataFrame(index=['usage', 'university', 'answer', 'matches', 'percentage'], data=[3, 46, 1, 400, 0.25])
new_data = new_data.T

In [183]:
new_data

Unnamed: 0,usage,university,answer,matches,percentage
0,3.0,46.0,1.0,400.0,0.25


In [184]:
# Classifier pipeline
pipe_lr = Pipeline([('scalar1', StandardScaler()), ('lr_classifier', LogisticRegression(random_state=19, max_iter=1000))])
pipe_dt = Pipeline([('scalar1', StandardScaler()), ('dt_classifier', DecisionTreeClassifier(random_state=19))])
pipe_rf = Pipeline([('scalar1', StandardScaler()), ('rf_classifier', RandomForestClassifier(random_state=19))])
pipe_sv = Pipeline([('scalar1', StandardScaler()), ('svm_classifier', svm.SVC(random_state=19))])
pipe_nb = Pipeline([('scalar1', StandardScaler()), ('nb_classifier', GaussianNB())])
pipe_kn = Pipeline([('scalar1', StandardScaler()), ('knn_classifier', KNeighborsClassifier(n_neighbors=3))])

pipelines = [pipe_lr, pipe_dt, pipe_rf, pipe_sv, pipe_nb, pipe_kn]

best_accuracy = 0
best_classifier = 0
best_pipeline = ""

pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'Random Forest', 3: 'SVM', 4: 'Naive-Bayes', 5: 'KNN'}

# Use cross-validation to evaluate the models
for i, model in enumerate(pipelines):
    scores = cross_val_score(model, X, y, cv=5, scoring='f1')
    print('{} Cross-Validation Accuracy: {:.2f}'.format(pipe_dict[i], scores.mean()))
    if scores.mean() > best_accuracy:
        best_accuracy = scores.mean()
        best_pipeline = model
        best_classifier = i

# Print the best classifier
print('\nClassifier with the best accuracy: {}'.format(pipe_dict[best_classifier]))

Logistic Regression Cross-Validation Accuracy: 0.29
Decision Tree Cross-Validation Accuracy: 0.83
Random Forest Cross-Validation Accuracy: 0.85
SVM Cross-Validation Accuracy: 0.80
Naive-Bayes Cross-Validation Accuracy: 0.50
KNN Cross-Validation Accuracy: 0.85

Classifier with the best accuracy: KNN


In [201]:
# XGB

import xgboost as xgb

X2 = tinder.drop(columns=['relationship', 'id'])
y2 = tinder.relationship

lbl = LabelEncoder()
X2.usage = lbl.fit_transform(X2.usage.astype(str))
X2.university = lbl.fit_transform(X2.university.astype(str))
X2.answer = lbl.fit_transform(X2.answer.astype(str))
X2.matches = lbl.fit_transform(X2.matches.astype(int))
X2.percentage = lbl.fit_transform(X2.percentage.astype(float))

# Create a XGBM 
xgbc = xgb.XGBClassifier(random_state=19)

xgbc.fit(X2_train, y2_train)

scores = cross_val_score(xgbc, X2, y2, cv=5, scoring='f1') 
final_score = sum(scores) / len(scores)
print('Average model evaluation score:', final_score)

Average model evaluation score: 0.8527231284039409


In [186]:
# new data to make predictions with categorical columns
new_data2 = pd.DataFrame(index=['usage', 'university', 'answer', 'matches', 'percentage'], data=['web','Meridian, Idaho' , 'Yes', 400, 0.25])
new_data2 = new_data2.T

In [187]:
# change column dtypes
new_data2.matches = new_data2.matches.astype('float')
new_data2.percentage = new_data2.percentage.astype('float')
new_data2.answer = new_data2.answer.astype('category')
new_data2.university = new_data2.university.astype('category')
new_data2.usage = new_data2.usage.astype('category')

In [188]:
# checking implementation
new_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   usage       1 non-null      category
 1   university  1 non-null      category
 2   answer      1 non-null      category
 3   matches     1 non-null      float64 
 4   percentage  1 non-null      float64 
dtypes: category(3), float64(2)
memory usage: 495.0 bytes


In [189]:
# probabilities of 0- no relationship, 1 - relationship
xgbc.predict_proba(new_data)

array([[0.98916876, 0.01083125]], dtype=float32)

In [190]:
# overall prediction baased on probabilities
xgbc.predict(new_data)

array([0])

In [203]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
}

lbl = LabelEncoder()
X.usage = lbl.fit_transform(X2.usage.astype(str))
X.university = lbl.fit_transform(X2.university.astype(str))
X.answer = lbl.fit_transform(X2.answer.astype(str))
X.matches = lbl.fit_transform(X2.matches.astype(int))
X.percentage = lbl.fit_transform(X2.percentage.astype(float))

# Create the XGBoost model
xgb = XGBClassifier()

# Create the GridSearchCV object
grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring='f1', verbose=3, n_jobs=-1)


# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='f1') 
final_score = sum(scores) / len(scores)
print('Average model evaluation score:', final_score)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Average model evaluation score: 0.8596379122502107


In [204]:
# find the best parameters
grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

In [None]:
# prediction of relationship status based on new data
grid_search.best_estimator_.predict(new_data)

array([0])

Prediction is single based on the new data

In [192]:
# catboost

from catboost import CatBoostClassifier, Pool


catb = CatBoostClassifier(task_type='GPU', loss_function='Logloss', eval_metric='AUC', iterations=1500, random_seed=19)

catb.fit(X, y, verbose=100, use_best_model=True, plot=True)

scores = cross_val_score(catb, X, y, cv=2, scoring='f1') 
final_score = sum(scores) / len(scores)
print('Average model evaluation score:', final_score)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

You should provide test set for use best model. use_best_model parameter has been switched to false value.
Default metric period is 5 because AUC is/are not implemented for GPU


Learning rate set to 0.022632
0:	total: 48.8ms	remaining: 1m 13s
100:	total: 4.69s	remaining: 1m 5s
200:	total: 9.4s	remaining: 1m
300:	total: 14.6s	remaining: 58.3s
400:	total: 19.6s	remaining: 53.7s
500:	total: 24.6s	remaining: 49.1s
600:	total: 29.5s	remaining: 44.2s
700:	total: 34.6s	remaining: 39.4s
800:	total: 40s	remaining: 34.9s
900:	total: 45.7s	remaining: 30.4s
1000:	total: 51s	remaining: 25.4s
1100:	total: 56.6s	remaining: 20.5s
1200:	total: 1m 2s	remaining: 15.6s
1300:	total: 1m 8s	remaining: 10.5s
1400:	total: 1m 13s	remaining: 5.22s
1499:	total: 1m 18s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


Learning rate set to 0.023511
0:	total: 52.9ms	remaining: 1m 19s
1:	total: 104ms	remaining: 1m 17s
2:	total: 156ms	remaining: 1m 17s
3:	total: 208ms	remaining: 1m 17s
4:	total: 260ms	remaining: 1m 17s
5:	total: 314ms	remaining: 1m 18s
6:	total: 351ms	remaining: 1m 14s
7:	total: 399ms	remaining: 1m 14s
8:	total: 450ms	remaining: 1m 14s
9:	total: 506ms	remaining: 1m 15s
10:	total: 564ms	remaining: 1m 16s
11:	total: 613ms	remaining: 1m 16s
12:	total: 667ms	remaining: 1m 16s
13:	total: 717ms	remaining: 1m 16s
14:	total: 766ms	remaining: 1m 15s
15:	total: 822ms	remaining: 1m 16s
16:	total: 873ms	remaining: 1m 16s
17:	total: 928ms	remaining: 1m 16s
18:	total: 985ms	remaining: 1m 16s
19:	total: 1.04s	remaining: 1m 17s
20:	total: 1.1s	remaining: 1m 17s
21:	total: 1.15s	remaining: 1m 17s
22:	total: 1.19s	remaining: 1m 16s
23:	total: 1.24s	remaining: 1m 16s
24:	total: 1.29s	remaining: 1m 16s
25:	total: 1.34s	remaining: 1m 15s
26:	total: 1.38s	remaining: 1m 15s
27:	total: 1.43s	remaining: 1m 14s


Default metric period is 5 because AUC is/are not implemented for GPU


3:	total: 209ms	remaining: 1m 18s
4:	total: 260ms	remaining: 1m 17s
5:	total: 308ms	remaining: 1m 16s
6:	total: 352ms	remaining: 1m 15s
7:	total: 396ms	remaining: 1m 13s
8:	total: 447ms	remaining: 1m 14s
9:	total: 495ms	remaining: 1m 13s
10:	total: 544ms	remaining: 1m 13s
11:	total: 593ms	remaining: 1m 13s
12:	total: 639ms	remaining: 1m 13s
13:	total: 688ms	remaining: 1m 12s
14:	total: 734ms	remaining: 1m 12s
15:	total: 785ms	remaining: 1m 12s
16:	total: 831ms	remaining: 1m 12s
17:	total: 880ms	remaining: 1m 12s
18:	total: 937ms	remaining: 1m 13s
19:	total: 986ms	remaining: 1m 12s
20:	total: 1.02s	remaining: 1m 11s
21:	total: 1.06s	remaining: 1m 11s
22:	total: 1.11s	remaining: 1m 11s
23:	total: 1.16s	remaining: 1m 11s
24:	total: 1.21s	remaining: 1m 11s
25:	total: 1.26s	remaining: 1m 11s
26:	total: 1.31s	remaining: 1m 11s
27:	total: 1.35s	remaining: 1m 11s
28:	total: 1.4s	remaining: 1m 10s
29:	total: 1.44s	remaining: 1m 10s
30:	total: 1.48s	remaining: 1m 10s
31:	total: 1.53s	remaining: 