# 20.5 8 Machine Learning (DL) Prototype 3

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
import seaborn as sns
import joblib

### best performing models from comparison testing

In [3]:
# Name of the estimator used for the text (TF-IDF) model below.
linear_model = 'SGDRegressor'

In [4]:
# Name of the estimator used for the numeric-feature model, and the columns it is trained on.
feature_model = 'GradientBoostingRegressor'
feature_columns = [
    'cash_flow',
    'gross_revenue',
]

### load processed bizbuysell dataset

In [5]:
# Read the preprocessed listings table (path is relative to the notebook's working directory).
df_in = pd.read_parquet('data/bizwiz_value_score_2.parquet')

In [6]:
# Print the column list with non-null counts and dtypes for the loaded frame.
df_in.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12143 entries, 1427 to 36567
Data columns (total 21 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     12143 non-null  float64
 1   pptitle                                12143 non-null  object 
 2   ppdesc                                 12143 non-null  object 
 3   ppdetails                              12143 non-null  object 
 4   ppfinancials                           12143 non-null  object 
 5   pcategories                            12143 non-null  object 
 6   COUNTY_NAME                            12143 non-null  object 
 7   STATE_NAME                             12143 non-null  object 
 8   price                                  12143 non-null  float64
 9   cash_flow                              12143 non-null  float64
 10  gross_revenue                          12143 non-null  float64
 11  esta

# text model

### vectorize text

In [7]:
# TF-IDF over unigrams and bigrams with English stop words removed.
# max_df=0.5 drops terms found in more than half of the documents;
# min_df=20 drops terms found in fewer than 20 documents.
vect_text = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_df=0.5,
    min_df=20,
)

In [8]:
# Join the four free-text columns into one space-separated document per row.
text = (
    df_in['pptitle'] + ' '
    + df_in['ppdesc'] + ' '
    + df_in['ppdetails'] + ' '
    + df_in['ppfinancials']
)

In [9]:
# Learn the TF-IDF vocabulary from `text` and build the document-term matrix; %time reports the cost.
# NOTE(review): the vectorizer is fit on every row before the train/test split below,
# so its vocabulary and IDF weights also see the rows that end up in the test set.
%time X_text = vect_text.fit_transform(text)

CPU times: user 4.84 s, sys: 29.3 ms, total: 4.87 s
Wall time: 4.87 s


In [10]:
# (number of documents, number of TF-IDF terms kept)
X_text.shape

(12143, 13646)

### train/test split

In [11]:
# Split the TF-IDF matrix and the `price` target into train and test parts; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_text,
    df_in['price'],
    random_state=1,
)

In [12]:
# Fit the linear text model and score it on the held-out rows.
# random_state is pinned (same seed as the split) because SGDRegressor shuffles
# the training data between epochs; without a seed the fitted coefficients,
# the saved model and the R^2 below change on every run.
regr = SGDRegressor(random_state=1)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
# R^2 of the text model on the test split.
tr2 = r2_score(y_test, y_pred)

In [13]:
# Test-set R^2 of the text (SGDRegressor) model.
tr2

0.324204487090319

# feature model

In [14]:
# Keep only the numeric columns listed in `feature_columns` as inputs for the feature model.
df_f = df_in[feature_columns]

In [15]:
# Same seed as the text split, applied to the numeric features and the `price` target.
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    df_f,
    df_in['price'],
    random_state=1,
)

In [16]:
# Fit the gradient-boosted feature model and score it on the held-out rows.
# random_state is pinned (same seed as the split) so the trees built at each
# boosting stage, the saved model and the R^2 below are repeatable across runs.
regrf = GradientBoostingRegressor(random_state=1)
regrf.fit(Xf_train, yf_train)
yf_pred = regrf.predict(Xf_test)
# R^2 of the feature model on the test split.
fr2 = r2_score(yf_test, yf_pred)

In [17]:
# Test-set R^2 of the feature (GradientBoostingRegressor) model.
fr2

0.6016297217430493

# save models

In [18]:
# Persist the fitted vectorizer and both regressors together as one compressed tuple.
joblib.dump(
    (vect_text, regr, regrf),
    'data/model.final.joblib',
    compress=True,
)

['data/model.final.joblib']

### grid search SGDRegressor

In [19]:
# Hyper-parameter grid for SGDRegressor.
# 'squared_error' is the current name of the least-squares loss: the old
# 'squared_loss' spelling was deprecated in scikit-learn 1.0 and removed in 1.2,
# so a grid containing it fails on current releases.
# (On scikit-learn < 1.0 the old spelling is the one that is accepted.)
param_grid = {
    'alpha': 10.0 ** -np.arange(1, 3),  # -> [0.1, 0.01]
    'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'learning_rate': ['constant', 'optimal', 'invscaling'],
    'early_stopping': [True],
#     'verbose':[1],
}

In [20]:
# Unfitted estimator for the (currently disabled) grid search over `param_grid`.
model = SGDRegressor()

In [21]:
# Grid search over `param_grid` is left disabled (commented out) in this notebook;
# uncomment the lines below to run it on the text training split.
# clf = GridSearchCV(model, param_grid)
# clf.fit(X_train, y_train)
# print("Best score: " + str(clf.best_score_))

### grid search GradientBoostingRegressor

In [22]:
# Hyper-parameter grid for GradientBoostingRegressor (4 * 3 * 3 * 3 = 108 combinations).
param_grid_f = dict(
    n_estimators=[10, 50, 100, 500],
    learning_rate=[0.001, 0.1, 1.0],
    subsample=[0.5, 0.7, 1.0],
    max_depth=[3, 7, 9],
    # verbose=[1],
)

In [23]:
# Unfitted estimator for the (currently disabled) grid search over `param_grid_f`.
model_f = GradientBoostingRegressor()

In [24]:
# Grid search over `param_grid_f` is left disabled (commented out) in this notebook;
# uncomment the lines below to run it on the feature training split.
# clf_f = GridSearchCV(model_f, param_grid_f)
# clf_f.fit(Xf_train, yf_train)
# print("Best score: " + str(clf_f.best_score_))