In [345]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTEN
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import re

In [346]:
data = pd.read_csv('D:\Hutson\learning-materials\AI_ML\AIMLDLCV_advance\Class\Datasets\European Restaurant Reviews.csv')
data.drop(['Review Date'], axis=1, inplace=True)
data.head()

Unnamed: 0,Country,Restaurant Name,Sentiment,Review Title,Review
0,France,The Frog at Bercy Village,Negative,Rude manager,The manager became agressive when I said the c...
1,France,The Frog at Bercy Village,Negative,A big disappointment,"I ordered a beef fillet ask to be done medium,..."
2,France,The Frog at Bercy Village,Negative,Pretty Place with Bland Food,"This is an attractive venue with welcoming, al..."
3,France,The Frog at Bercy Village,Negative,Great service and wine but inedible food,Sadly I used the high TripAdvisor rating too ...
4,France,The Frog at Bercy Village,Negative,Avoid- Worst meal in Rome - possibly ever,From the start this meal was bad- especially g...


In [347]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1502 entries, 0 to 1501
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Country          1502 non-null   object
 1   Restaurant Name  1502 non-null   object
 2   Sentiment        1502 non-null   object
 3   Review Title     1502 non-null   object
 4   Review           1502 non-null   object
dtypes: object(5)
memory usage: 58.8+ KB


In [348]:
for i in data.columns:
    print(f'{i}: {data[i].nunique()}')

Country: 7
Restaurant Name: 7
Sentiment: 2
Review Title: 1343
Review: 1426


In [349]:
target = 'Sentiment'
x = data.drop(target, axis=1)
y = data[target]

In [350]:
sm = SMOTEN(random_state=42, sampling_strategy={
    "Negative": 1000
})

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(y_train.value_counts())
print('_________')
x_train, y_train = sm.fit_resample(x_train, y_train)
print(y_train.value_counts())

Sentiment
Positive    985
Negative    216
Name: count, dtype: int64
_________
Sentiment
Negative    1000
Positive     985
Name: count, dtype: int64


In [351]:
preprocessor = ColumnTransformer(transformers=[
    ('Review', TfidfVectorizer(), 'Review'),
    ('Country', OneHotEncoder(), ['Country']),
    ('Restaurant_Name', OneHotEncoder(), ['Restaurant Name']),
    ('Review_Title', TfidfVectorizer(), 'Review Title')
])


In [352]:
cls = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectPercentile(chi2, percentile=90)),
    ('classifier', LogisticRegression(solver='sag', penalty='l2', max_iter=100,C=100))
])

In [353]:
# from lazypredict.Supervised import LazyClassifier
# clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
# models,predictions = clf.fit(x_train, x_test, y_train, y_test)
# print(models)


In [354]:
params = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__max_iter': [100,  400, 500],
    'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    }

model = RandomizedSearchCV(cls, param_distributions=params, cv=5, scoring='accuracy', n_jobs=-1, verbose=1, n_iter=10)

cls.fit(x_train, y_train)
y_pred = cls.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.95      0.82      0.88        49
    Positive       0.97      0.99      0.98       252

    accuracy                           0.96       301
   macro avg       0.96      0.90      0.93       301
weighted avg       0.96      0.96      0.96       301



In [355]:
accuracy_score(y_test, y_pred)

0.9634551495016611

In [356]:
print(model.best_params_)
print(model.best_score_)

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'