# Classification of bizbuysell data

### With multinomial Naive Bayes

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import joblib

### Load labeled bizbuysell dataset

In [3]:
# Read the labeled listings from the parquet file into a DataFrame.
df_in = pd.read_parquet('data/bizwiz_value_score.parquet')

In [4]:
# Preview the first rows to check the columns that were loaded.
df_in.head()

Unnamed: 0,id,pptitle,ppdesc,COUNTY_NAME,STATE_NAME,bizwiz_value_score,bizwiz_class,bizwiz_label
1,1990890.0,high exposur may land new jersey,amaz opportun price sell readi busi locat hear...,Atlantic,New Jersey,4.062859,3,high
3,2039720.0,duplex short term long term pinella counti flo...,rare largoseminol area duplex central locat pi...,Pinellas,Florida,3.46421,2,medium
4,1576680.0,major price reduct resttavern 13000sq ft build...,price reduc 850 000 make offercurr owner want ...,Baltimore (city),Maryland,2.505054,1,low
5,2087638.0,busi real estat casco michigan,• busi real estat sale• liquorshopp first floo...,St. Clair,Michigan,2.973314,1,low
7,2051958.0,turnkey restaur near lake wister state park hi...,commerci real estat sale welcom 409 us highway...,Le Flore,Oklahoma,3.712142,2,medium


### Create binary labels and test/train datasets

In [5]:
def two_class(x):
    """Collapse a ``bizwiz_class`` value into a binary label.

    Returns 1 when ``x`` equals 3 or 4, and 0 for every other value.
    """
    return 1 if x in (3, 4) else 0
# Derive the binary target column by mapping every bizwiz_class value
# through two_class (1 for classes 3/4, 0 otherwise).
df_in['class'] = df_in['bizwiz_class'].map(two_class)

In [6]:
# Build one free-text document per listing from the title, description and
# location columns.  Missing values are replaced with '' first: otherwise a
# single NaN field turns the whole concatenated document into NaN, which
# CountVectorizer rejects as an invalid document.
text_fields = df_in[['pptitle', 'ppdesc', 'COUNTY_NAME', 'STATE_NAME']].fillna('')
X = (
    text_fields['pptitle'] + ' ' + text_fields['ppdesc'] + ' '
    + text_fields['COUNTY_NAME'] + ' ' + text_fields['STATE_NAME']
)
# Target: the binary label stored in the 'class' column.
y = df_in['class']
print(X.shape)
print(y.shape)

(27336,)
(27336,)


In [7]:
# Hold out part of the data for evaluation.  random_state pins the shuffle so
# the split is reproducible; the test size is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(20502,)
(6834,)
(20502,)
(6834,)


### Vectorize the data

In [8]:
# Bag-of-words settings:
#   stop_words='english' -> drop scikit-learn's built-in English stop words
#   ngram_range=(1, 2)   -> use 1-grams and 2-grams as features
#   max_df=0.5           -> ignore terms found in more than 50% of the documents
#   min_df=2             -> keep only terms found in at least 2 documents
# NOTE(review): the previewed text looks stemmed ("amaz opportun"), so the
# built-in stop-word list may match fewer tokens than expected — confirm.
vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_df=0.5,
    min_df=2,
)

In [9]:
# Learn the vocabulary from the training text only and encode that text as a
# document-term matrix.
X_train_dtm = vect.fit_transform(X_train)

In [10]:
# Encode the test text with the vocabulary already fitted on the training set
# (transform only, no refitting).
X_test_dtm = vect.transform(X_test)

### Train MultinomialNB Model

In [11]:
# Multinomial Naive Bayes classifier, constructed with its default parameters.
nb = MultinomialNB()

In [12]:
# Fit the classifier on the training document-term matrix; the %time magic
# reports how long the fit takes.
%time nb.fit(X_train_dtm, y_train)

CPU times: user 37.3 ms, sys: 2.34 ms, total: 39.7 ms
Wall time: 38 ms


MultinomialNB()

### Make predictions with model and look at performance

In [13]:
# Predict a class label for every document in the held-out test set.
y_pred_class = nb.predict(X_test_dtm)

In [14]:
# Print per-class precision, recall and F1 for the test-set predictions.
print(metrics.classification_report(y_test, y_pred_class))

              precision    recall  f1-score   support

           0       0.69      0.83      0.75      3398
           1       0.79      0.64      0.71      3436

    accuracy                           0.73      6834
   macro avg       0.74      0.73      0.73      6834
weighted avg       0.74      0.73      0.73      6834



### Save the model

In [15]:
# Persist both fitted artifacts.  The classifier alone cannot score raw text:
# its inputs are the columns of the document-term matrix produced by `vect`,
# so the fitted vectorizer (and its vocabulary) is saved alongside the model.
joblib.dump(vect, 'data/vectorizer.nb.joblib')
# Dumped last so the cell still displays the model path; the model file name
# and contents are unchanged for existing consumers.
joblib.dump(nb, 'data/model.nb.joblib')

['data/model.nb.joblib']