In [22]:
import pandas  as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import warnings
import xgboost
print(xgboost.__version__)

warnings.filterwarnings('ignore')

2.0.3


In [2]:
df = pd.read_csv('cleaned_data.csv')

In [3]:
df.head()

Unnamed: 0,Review,Rating,Sentiment,lenght_of_reviews
0,nice hotel expensive parking got good deal sta...,4,Positive,593
1,ok nothing special charge diamond member hilto...,2,Negative,1689
2,nice room experience hotel monaco seattle good...,3,Negative,1427
3,unique great stay wonderful time hotel monaco ...,5,Positive,600
4,great stay great stay went seahawk game awesom...,5,Positive,1281


##model building##

In [4]:

df['Rating'].value_counts()

Rating
5    9054
4    6039
3    2184
2    1793
1    1421
Name: count, dtype: int64

In [5]:
X = df['Review']
y = df['Sentiment']

In [6]:
le = LabelEncoder()

y = le.fit_transform(y)

In [7]:
tfidf = TfidfVectorizer()

X = tfidf.fit_transform(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [9]:
y_train.shape , X_train.shape

((14343,), (14343, 43257))

In [10]:
def data_model():
    model = {
        'LogisticRegression': LogisticRegression(),
        'XGBClassifier': XGBClassifier()
    }
    return model

In [11]:
models = data_model ()

for name, model in models.items():
    model.fit(X_train, y_train)

In [12]:
for name,model in models.items():
    prediction = model.predict(X_test)
    print(f'Evalution:{name}')
    print(f'Accuracy:{accuracy_score(y_test,prediction)}')
    print(f'Classification Report:\n {classification_report(y_test,prediction)}')
    print(f'Confusion Matrix:\n {confusion_matrix(y_test,prediction)}')
    print('\n')

Evalution:LogisticRegression
Accuracy:0.8911841249186727
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.69      0.77      1626
           1       0.90      0.96      0.93      4522

    accuracy                           0.89      6148
   macro avg       0.88      0.83      0.85      6148
weighted avg       0.89      0.89      0.89      6148

Confusion Matrix:
 [[1120  506]
 [ 163 4359]]


Evalution:XGBClassifier
Accuracy:0.874430709173715
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.68      0.74      1626
           1       0.89      0.94      0.92      4522

    accuracy                           0.87      6148
   macro avg       0.85      0.81      0.83      6148
weighted avg       0.87      0.87      0.87      6148

Confusion Matrix:
 [[1112  514]
 [ 258 4264]]




In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
log = LogisticRegression()

In [15]:
param_grid = {
    'solver': ['liblinear','newton-cg','lbfgs'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2']

}

In [16]:
grid_search = GridSearchCV(log,param_grid,cv= 3,n_jobs= 1, verbose= 2)

grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 42 candidates, totalling 126 fits
[CV] END ..............C=0.001, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..............C=0.001, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ..............C=0.001, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ..............C=0.001, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ..............C=0.001, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ..............C=0.001, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ..................C=0.001, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ..................C=0.001, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ..................C=0.001, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ..............C=0.001, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ..............C=0.001, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ..............C=0.001, penalty=l2, sol

In [17]:
best_param = grid_search.best_params_
print(f'this is the best parameters.:{best_param}')

this is the best parameters.:{'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}


In [18]:
best_model1 = LogisticRegression(**best_param)
best_model1.fit(X_train,y_train)

In [19]:
predictions = best_model1.predict(X_test)
print('Best Evalution On logistic Regression ')
print(f'Accuracy:{accuracy_score(y_test,predictions)}')
print(f'Classification Report:\n {classification_report(y_test,predictions)}')
print(f'Confusion Matrix:\n {confusion_matrix(y_test,predictions)}')


Best Evalution On logistic Regression 
Accuracy:0.8898828887443071
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.74      0.78      1626
           1       0.91      0.94      0.93      4522

    accuracy                           0.89      6148
   macro avg       0.87      0.84      0.85      6148
weighted avg       0.89      0.89      0.89      6148

Confusion Matrix:
 [[1205  421]
 [ 256 4266]]


In [21]:
XG = XGBClassifier()
import xgboost
print(xgboost.__version__)



2.0.3


In [23]:

param_grid1 = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 300],
    'colsample_bytree': [0.4, 0.6],
    'colsample_bylevel': [0.4],
    'colsample_bynode': [0.4],
    'gamma': [0.0, 0.2],
}


In [24]:
grid_search1 = GridSearchCV(XG,param_grid1,cv =3 ,n_jobs=1,verbose= 2, scoring= 'accuracy')

grid_search1.fit(X_train,y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END colsample_bylevel=0.4, colsample_bynode=0.4, colsample_bytree=0.4, gamma=0.0, learning_rate=0.01, n_estimators=100; total time=  24.7s
[CV] END colsample_bylevel=0.4, colsample_bynode=0.4, colsample_bytree=0.4, gamma=0.0, learning_rate=0.01, n_estimators=100; total time=  28.4s
[CV] END colsample_bylevel=0.4, colsample_bynode=0.4, colsample_bytree=0.4, gamma=0.0, learning_rate=0.01, n_estimators=100; total time=  46.5s
[CV] END colsample_bylevel=0.4, colsample_bynode=0.4, colsample_bytree=0.4, gamma=0.0, learning_rate=0.01, n_estimators=300; total time= 2.9min
[CV] END colsample_bylevel=0.4, colsample_bynode=0.4, colsample_bytree=0.4, gamma=0.0, learning_rate=0.01, n_estimators=300; total time= 2.2min
[CV] END colsample_bylevel=0.4, colsample_bynode=0.4, colsample_bytree=0.4, gamma=0.0, learning_rate=0.01, n_estimators=300; total time= 1.8min
[CV] END colsample_bylevel=0.4, colsample_bynode=0.4, colsample_bytree=0.4,

In [25]:
best_param1 = grid_search1.best_params_

print(f'this is the best parameters.:{best_param1}')

this is the best parameters.:{'colsample_bylevel': 0.4, 'colsample_bynode': 0.4, 'colsample_bytree': 0.4, 'gamma': 0.0, 'learning_rate': 0.2, 'n_estimators': 300}


In [26]:
best_model2 = XGBClassifier(**best_param1)

best_model2.fit(X_train,y_train)

In [27]:
pred = best_model2.predict(X_test)

print('Best Evalution On XGBClassifier ')

print(f'Accuracy:{accuracy_score(y_test,pred)}')

print(f'Classification Report:\n {classification_report(y_test,pred)}')

print(f'Confusion Matrix:\n {confusion_matrix(y_test,pred)}')


Best Evalution On XGBClassifier 
Accuracy:0.8851659076122316
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.70      0.76      1626
           1       0.90      0.95      0.92      4522

    accuracy                           0.89      6148
   macro avg       0.87      0.83      0.84      6148
weighted avg       0.88      0.89      0.88      6148

Confusion Matrix:
 [[1142  484]
 [ 222 4300]]


At the end of our model traning , Logistic Regression proved to perform better.

In [28]:
import pickle 

In [29]:
with open ('model.pkl','wb') as file:
    pickle.dump(best_model1,file)

In [30]:
with open('tfidf.pkl','wb') as file:
    pickle.dump(tfidf,file)

In [31]:
with open('le.pkl','wb')as file:
    pickle.dump(le,file)

In [37]:
def model_test(text):
    vector = tfidf.transform([text]).toarray()
    prediction = best_model1.predict(vector)
    return le.inverse_transform(prediction)[0] 

model_test('this hotel is dirty')

'Negative'