In [2]:
import numpy as np

In [None]:
pip install datasets

In [4]:
from datasets import load_dataset

In [None]:
# Load the 'yelp_review_full' dataset through `load_dataset`; the returned
# object is indexed by split name ('train' / 'test') in the cells below.
ds = load_dataset('yelp_review_full')

In [6]:
# Display the loaded object: its splits, feature columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [7]:
# Look at one raw training example (row 20): a dict with an integer `label`
# and the review `text`.
ds['train'][20]

{'label': 3,
 'text': "A great townie bar with tasty food and an interesting clientele. I went to check this place out on the way home from the airport one Friday night and it didn't disappoint. It is refreshing to walk into a townie bar and not feel like the music stops and everyone in the place is staring at you - I'm guessing the mixed crowd of older hockey fans, young men in collared shirts, and thirtysomethings have probably seen it all during their time at this place. \\n\\nThe staff was top notch - the orders were somewhat overwhelming as they appeared short-staffed for the night, but my waitress tried to keep a positive attitude for my entire visit. The other waiter was wearing a hooded cardigan, and I wanted to steal it from him due to my difficulty in finding such a quality article of clothing.\\n\\nWe ordered a white pizza - large in size, engulfed in cheese, full of garlic flavor, flavorful hot sausage. An overall delicious pizza, aside from 2 things: 1, way too much grease

In [8]:
# Pull out only the review text of the first training example.
ds['train'][0]['text']

"dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."

In [9]:
# Inspect the schema of the training split: `label` is a ClassLabel with the
# five names '1 star' ... '5 stars', and `text` is a plain string column.
ds['train'].features

{'label': ClassLabel(names=['1 star', '2 star', '3 stars', '4 stars', '5 stars'], id=None),
 'text': Value(dtype='string', id=None)}

In [10]:
import pandas as pd

# Convert each split to a pandas DataFrame with the Dataset's own exporter.
# `Dataset.to_pandas()` converts the underlying table in one step, whereas
# `pd.DataFrame(dataset)` has to iterate the dataset one Python dict per row,
# which is much slower on the 650k-row training split. The resulting frames
# have the same `label` / `text` columns.
ds_train = ds['train'].to_pandas()
ds_test = ds['test'].to_pandas()

In [11]:
# Preview the first rows of the training frame (columns: label, text).
ds_train.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [12]:
# Preview the first rows of the test frame (columns: label, text).
ds_test.head()

Unnamed: 0,label,text
0,0,I got 'new' tires from them and within two wee...
1,0,Don't waste your time. We had two different p...
2,0,All I can say is the worst! We were the only 2...
3,0,I have been to this restaurant twice and was d...
4,0,Food was NOT GOOD at all! My husband & I ate h...


In [13]:
# Check the class balance: number of training rows per label value.
ds_train['label'].value_counts()

label
4    130000
1    130000
3    130000
0    130000
2    130000
Name: count, dtype: int64

In [14]:
# Same class-balance check for the test split.
ds_test['label'].value_counts()

label
0    10000
2    10000
1    10000
3    10000
4    10000
Name: count, dtype: int64

In [15]:
from datasets import Dataset, DatasetDict

# Rebuild Hugging Face datasets from the pandas frames.
# - features=...: reuse the schema of the originally loaded split so `label`
#   stays a ClassLabel (with its star-rating names) instead of degrading to a
#   plain integer column after the round trip through pandas.
# - preserve_index=False: never materialise the DataFrame index as an extra
#   `__index_level_0__` column, whatever index the frame carries.
train = Dataset.from_pandas(ds_train, features=ds['train'].features, preserve_index=False)
test = Dataset.from_pandas(ds_test, features=ds['test'].features, preserve_index=False)

# Bundle both splits into a single DatasetDict, mirroring the layout of `ds`.
new_ds = DatasetDict(
    {
        'train': train,
        'test': test,
    }
)

# Display the rebuilt DatasetDict.
new_ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

## Vectorization

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer created with no arguments (library defaults). It is fitted
# on the sampled training text and then reused for the test text further down.
vectorizer = TfidfVectorizer()

In [34]:
# Materialise both splits of `new_ds` as pandas DataFrames for the steps below.
# NOTE(review): the previews that follow show cleaned text (lower-cased,
# stop words removed), but no visible cell transforms `new_ds`, and execution
# counts jump from 15 to 33 — presumably a preprocessing cell was removed.
# Restore it so that Restart & Run All reproduces these outputs.
train_df, test_df = (new_ds[split].to_pandas() for split in ('train', 'test'))

In [35]:
# Preview the training frame obtained from `new_ds`.
train_df.head()

Unnamed: 0,label,text
0,4,dr goldberg offer everything look general prac...
1,1,unfortunately frustration dr goldberg patient ...
2,3,go dr goldberg 10 year think one 1st patient s...
3,3,get letter mail last week say dr goldberg move...
4,0,know dr goldberg like move arizona let tell st...


In [36]:
# Preview the test frame obtained from `new_ds`.
test_df.head()

Unnamed: 0,label,text
0,0,get new tire within two week get flat take car...
1,0,waste time two different people come house giv...
2,0,say worst 2 people place lunch place freeze lo...
3,0,restaurant twice disappoint time go back first...
4,0,food good husband eat couple week ago first ti...


#### Saving dataframes for later notebooks

In [37]:
# Persist both frames as CSV files in the working directory, without the row
# index, so that other notebooks can load them.
train_df.to_csv(path_or_buf='train_df.csv', index=False)
test_df.to_csv(path_or_buf='test_df.csv', index=False)

### Sampling datasets for faster computation

In [38]:
# Work on a reproducible random sample of each split to keep experiments fast.
# The fraction and the seed are named so they can be tuned in one place
# (set SAMPLE_FRAC = 1.0 to use every row).
SAMPLE_FRAC = 0.1
SAMPLE_SEED = 42

train_df_small = train_df.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
test_df_small = test_df.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)

In [39]:
# Fit the vectorizer on the sampled training text only, and encode that same
# text as the training feature matrix in one call.
X_train = vectorizer.fit_transform(train_df_small['text'])

In [40]:
# Encode the sampled test text with the already-fitted vectorizer (transform
# only, no refit) so that nothing is learned from the test data.
X_test = vectorizer.transform(test_df_small['text'])

In [41]:
# Target labels, taken from the same sampled frames that produced
# X_train / X_test so rows and labels stay aligned.
y_train, y_test = train_df_small['label'], test_df_small['label']