In [1]:
import numpy as np

In [None]:
pip install datasets

In [3]:
from datasets import load_dataset

In [4]:
ds = load_dataset('yelp_review_full')

In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [6]:
ds['train'][20]

{'label': 3,
 'text': "A great townie bar with tasty food and an interesting clientele. I went to check this place out on the way home from the airport one Friday night and it didn't disappoint. It is refreshing to walk into a townie bar and not feel like the music stops and everyone in the place is staring at you - I'm guessing the mixed crowd of older hockey fans, young men in collared shirts, and thirtysomethings have probably seen it all during their time at this place. \\n\\nThe staff was top notch - the orders were somewhat overwhelming as they appeared short-staffed for the night, but my waitress tried to keep a positive attitude for my entire visit. The other waiter was wearing a hooded cardigan, and I wanted to steal it from him due to my difficulty in finding such a quality article of clothing.\\n\\nWe ordered a white pizza - large in size, engulfed in cheese, full of garlic flavor, flavorful hot sausage. An overall delicious pizza, aside from 2 things: 1, way too much grease

In [7]:
ds['train'][0]['text']

"dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."

In [8]:
ds['train'].features

{'label': ClassLabel(names=['1 star', '2 star', '3 stars', '4 stars', '5 stars'], id=None),
 'text': Value(dtype='string', id=None)}

In [9]:
import pandas as pd

ds_train = pd.DataFrame(ds['train'])
ds_test = pd.DataFrame(ds['test'])

In [10]:
ds_train.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [11]:
ds_test.head()

Unnamed: 0,label,text
0,0,I got 'new' tires from them and within two wee...
1,0,Don't waste your time. We had two different p...
2,0,All I can say is the worst! We were the only 2...
3,0,I have been to this restaurant twice and was d...
4,0,Food was NOT GOOD at all! My husband & I ate h...


In [12]:
ds_train['label'].value_counts()

label
4    130000
1    130000
3    130000
0    130000
2    130000
Name: count, dtype: int64

In [13]:
ds_test['label'].value_counts()

label
0    10000
2    10000
1    10000
3    10000
4    10000
Name: count, dtype: int64

In [14]:
from datasets import Dataset, DatasetDict

# assign the splits
train = Dataset.from_pandas(ds_train)
test = Dataset.from_pandas(ds_test)

# reconstruct both datasets into a Dataset Dict object
new_ds = DatasetDict(
    {
         'train': train,
       'test': test
    }
)

# view the resulting dataset dict object
new_ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

### **Manual Application**
In this project task, we will manually apply the tokenization and text representation methods on our project dataset.


In [15]:
print("The shape of train set is :",new_ds['train'].shape)
print("The shape of test set is :",new_ds['test'].shape)

The shape of train set is : (650000, 2)
The shape of test set is : (50000, 2)


Dataset is equally divided among different classes of reviews

## Step 1: Choose the Method

Choosing TFIDF because it is less sparse and better for text analysis

## Step 2: Preprocessing

### Removing Puncuation

In [16]:
import string

punctuation = string.punctuation

def remove_punctuation(review):
    return "".join([char for char in review if char not in punctuation])

### Text to lower case

In [17]:
def lower_case(review):
    return review.lower() if isinstance(review, str) else review

### Stop word removal

In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Harmanpreet
[nltk_data]     Kaur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [20]:
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

## Step 3: Tokenization

In [21]:
import re

def tokenization(review):
    return re.findall(r'\b\w+\b', review)

### Processing function

In [22]:
def preprocessing(review):
    review = remove_punctuation(review)
    review = lower_case(review)
    tokens = tokenization(review)
    tokens = remove_stopwords(tokens)
    return ' '.join(tokens)  # Return as a single string

#### Sample check for our preprocessing function

In [23]:
check = preprocessing(new_ds['train'][0]['text'])

check

'dr goldberg offers everything look general practitioner hes nice easy talk without patronizing hes always time seeing patients hes affiliated topnotch hospital nyu parents explained important case something happens need surgery get referrals see specialists without see first really need im sitting trying think complaints im really drawing blank'

# Apply preprocessing to the dataset

In [24]:
new_ds = new_ds.map(lambda x: {'text': preprocessing(x['text'])})

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

## Step 4: Vectorization

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

In [26]:
# Convert back to pandas DataFrame for further processing

train_df = new_ds['train'].to_pandas()
test_df = new_ds['test'].to_pandas()

In [27]:
train_df.head()

Unnamed: 0,label,text
0,4,dr goldberg offers everything look general pra...
1,1,unfortunately frustration dr goldbergs patient...
2,3,going dr goldberg 10 years think one 1st patie...
3,3,got letter mail last week said dr goldberg mov...
4,0,dont know dr goldberg like moving arizona let ...


In [28]:
test_df.head()

Unnamed: 0,label,text
0,0,got new tires within two weeks got flat took c...
1,0,dont waste time two different people come hous...
2,0,say worst 2 people place lunch place freezing ...
3,0,restaurant twice disappointed times wont go ba...
4,0,food good husband ate couple weeks ago first t...


### Sampling datasets for faster computation

In [29]:
train_df_small = train_df.sample(frac=0.1, random_state=42)
test_df_small = test_df.sample(frac=0.1, random_state=42)

In [30]:
# Fit and transform the train data
X_train = vectorizer.fit_transform(train_df_small['text'])

In [31]:
# Transform the test data (only transform)
X_test = vectorizer.transform(test_df_small['text'])

In [32]:
#labels
y_train = train_df_small['label']
y_test = test_df_small['label']

## ML Training and Testing
### Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix

In [34]:
model = LogisticRegression(multi_class='auto', max_iter=1000)

In [35]:
model.fit(X_train, y_train)

In [36]:
y_pred = model.predict(X_test)

In [37]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_mat)

Accuracy: 54.96%
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.72      0.69      1015
           1       0.49      0.48      0.49      1018
           2       0.46      0.43      0.44      1004
           3       0.48      0.47      0.47      1012
           4       0.64      0.66      0.65       951

    accuracy                           0.55      5000
   macro avg       0.55      0.55      0.55      5000
weighted avg       0.55      0.55      0.55      5000

Confusion Matrix:
[[727 215  46  17  10]
 [256 489 206  50  17]
 [ 67 227 432 220  58]
 [ 17  48 205 473 269]
 [ 24  12  54 234 627]]


### Random Forest

In [54]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
rfc = RandomForestClassifier(random_state=42)

In [56]:
rfc.fit(X_train, y_train)

In [57]:
y_pred_rfc = rfc.predict(X_test)

In [58]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred_rfc)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred_rfc))

# Confusion Matrix
conf_mat = confusion_matrix(y_test, y_pred_rfc)
print("Confusion Matrix:")
print(conf_mat)

Accuracy: 49.06%
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.78      0.65      1015
           1       0.43      0.33      0.38      1018
           2       0.41      0.37      0.39      1004
           3       0.43      0.37      0.40      1012
           4       0.57      0.61      0.59       951

    accuracy                           0.49      5000
   macro avg       0.48      0.49      0.48      5000
weighted avg       0.48      0.49      0.48      5000

Confusion Matrix:
[[790 132  45  27  21]
 [354 338 198  79  49]
 [145 226 370 191  72]
 [ 55  65 219 376 297]
 [ 80  20  69 203 579]]


### MultiClass Naive Bayes

In [59]:
from sklearn.naive_bayes import MultinomialNB

In [60]:
mnb = MultinomialNB()

In [61]:
mnb.fit(X_train, y_train)

In [62]:
mnb_predict = mnb.predict(X_test)

In [63]:
# Accuracy
accuracy = accuracy_score(y_test, mnb_predict)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Classification Report
print("Classification Report:")
print(classification_report(y_test, mnb_predict))

# Confusion Matrix
conf_mat = confusion_matrix(y_test, mnb_predict)
print("Confusion Matrix:")
print(conf_mat)

Accuracy: 48.98%
Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.66      0.62      1015
           1       0.41      0.51      0.46      1018
           2       0.43      0.45      0.44      1004
           3       0.44      0.50      0.47      1012
           4       0.73      0.32      0.45       951

    accuracy                           0.49      5000
   macro avg       0.52      0.49      0.49      5000
weighted avg       0.52      0.49      0.49      5000

Confusion Matrix:
[[670 309  28   6   2]
 [239 518 216  41   4]
 [ 94 273 451 172  14]
 [ 48 101 266 504  93]
 [ 92  50  92 411 306]]
