## First look at Hugging Face Datasets

Imports

In [1]:
from datasets import list_datasets
from datasets import load_dataset
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from transformers import AutoModel
import torch
import numpy as np
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
import pandas as pd


In [2]:
# Query the Hub for the available datasets, community ones included.
# NOTE(review): with_details=False presumably returns plain identifiers rather
# than metadata objects, and newer `datasets` releases are reported to
# deprecate this helper -- confirm against the installed version.
all_datasets = list_datasets(with_community_datasets = True, with_details = False )

  all_datasets = list_datasets(with_community_datasets = True, with_details = False )


In [None]:
# Echo the value of all_datasets as the cell output.
all_datasets

In [None]:
# Report how many entries were returned and show the first ten of them.
print(f"There are {len(all_datasets)} datasets currently available on the HF Hub.")
print(f"The first 10 are {all_datasets[:10]}")

Load the emotion dataset

In [None]:

# Load the "emotion" dataset; the result is indexed by split name further
# down (e.g. emotions["train"]).
emotions = load_dataset("emotion")

In [None]:
# Display the loaded dataset object as the cell output.
emotions

In [None]:
# Keep a handle on the training split for the inspection cells below.
train_ds = emotions["train"]

In [None]:
# NOTE(review): with Jupyter's default settings only the last expression of a
# cell is rendered, so the bare `train_ds` line produces no output here.
train_ds
# List the column names of the training split.
train_ds.column_names

In [None]:
# Print a slice holding the first five training examples.
print(train_ds[:5])

In [None]:
# Switch the dataset's output format to pandas so that slicing a split
# yields a DataFrame.
emotions.set_format(type="pandas")
# Materialize the whole training split.
df = emotions["train"][:]
# Preview the first rows.
df.head()

In [None]:
def label_int2str(row):
    """Translate an integer class id into its label name.

    The lookup goes through the ``label`` feature of the training split, so
    ``row`` is expected to be one value from the ``label`` column.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)


# Add a readable label column next to the integer ids and preview the result.
df["label_name"] = df["label"].apply(label_int2str)
df.head()

In [None]:
# Horizontal bar chart of the number of examples per class, with the counts
# sorted in ascending order.
df["label_name"].value_counts(ascending=True).plot.barh()
plt.title("Frequency of Classes")
plt.show()

In [None]:
# Count the whitespace-separated words in each text.
df["Words Per Tweet"] = df["text"].str.split().apply(len)
# Box plot of that count, one box per class name; outliers are hidden
# (showfliers=False) and no grid is drawn.
df.boxplot("Words Per Tweet", by="label_name", grid=False, showfliers=False, color="black")
# Blank out the figure super-title and the x-axis label.
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# Restore the dataset's default output format (it was switched to pandas
# earlier for the DataFrame-based exploration).
emotions.reset_format()

In [None]:

# Name of the pretrained checkpoint; the tokenizer is loaded from it.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Run the tokenizer on a sample sentence and inspect the resulting encoding.
text = "Tokenizing text is a core task of NLP."
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Map the encoded input ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

In [None]:
# Join the token strings back into a single string.
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
# Show the tokenizer's vocabulary size.
tokenizer.vocab_size

In [None]:
# Show the maximum sequence length the tokenizer reports for this checkpoint.
tokenizer.model_max_length

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch``.

    Padding and truncation are both enabled, and the tokenizer's output is
    returned unchanged.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)


# Smoke test on the first two training examples.
print(tokenize(emotions["train"][:2]))

In [None]:
# Apply tokenize to every split in batched mode.
# NOTE(review): per the `datasets` docs, batch_size=None hands each split to
# the function as one single batch, so padding=True pads to the longest text
# of the whole split -- confirm this is the intended behaviour.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

In [None]:
# List the columns of the training split after tokenization.
print(emotions_encoded["train"].column_names)

In [None]:

# Checkpoint to load the pretrained model from.
model_ckpt = "distilbert-base-uncased"
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model and move its weights to the selected device.
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [None]:
# Show which device was selected.
print(device)

In [None]:
# NOTE(review): AutoTokenizer is already imported in the first cell and
# `tokenizer` was built earlier from the same checkpoint name; this cell only
# re-creates it (useful when running the notebook from this point on).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Encode a short test sentence; return_tensors="pt" requests PyTorch tensors
# so the result can be fed to the model directly.
text = "this is a test"
inputs = tokenizer(text,return_tensors="pt")
print(f"Input tensor shape: {inputs['input_ids'].size()}")


In [None]:
# Move every input tensor to the device the model lives on.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Inference only, so run the forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)

In [None]:
# Show the shape of the last hidden state tensor.
outputs.last_hidden_state.size()

In [None]:
# Show the shape after keeping only index 0 along the second axis.
outputs.last_hidden_state[:,0].size()

In [None]:
def extract_hidden_states(batch):
    """Compute one hidden-state vector per example in ``batch``.

    Only the entries of ``batch`` whose key appears in
    ``tokenizer.model_input_names`` are passed to the model; all other
    entries are ignored. The model's last hidden state is sliced at index 0
    of the second axis (presumably the first token of each sequence) and
    returned as a NumPy array under the key ``"hidden_state"``.
    """
    # Move the model inputs to the selected device (GPU or CPU).
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Feature extraction only: run the forward pass without gradient tracking.
    with torch.no_grad():
        model_output = model(**model_inputs)

    # Bring the result back to the CPU before converting it to NumPy.
    first_position = model_output.last_hidden_state[:, 0]
    return {"hidden_state": first_position.cpu().numpy()}

In [None]:
# Have the encoded dataset return torch tensors, restricted to the columns
# listed here.
emotions_encoded.set_format("torch", columns=["input_ids","attention_mask","label"])

In [None]:
# Run extract_hidden_states over every split in batched mode; the returned
# "hidden_state" entry becomes a new column.
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)

In [None]:
# Show the first training example of the original (un-encoded) dataset.
emotions["train"][0]

In [None]:
# Show a one-row slice of the training split after hidden-state extraction.
emotions_hidden["train"][:1]

In [None]:

# Turn the extracted hidden states into feature matrices and the class ids
# into target vectors, for the training and validation splits.
X_train = np.array(emotions_hidden["train"]["hidden_state"])
X_valid = np.array(emotions_hidden["validation"]["hidden_state"])
# The training labels must come from the same split as X_train. The previous
# code read them from a split called "valid", which is not a split name used
# anywhere else in this notebook (the splits are "train" and "validation").
y_train = np.array(emotions_hidden["train"]["label"])
y_valid = np.array(emotions_hidden["validation"]["label"])

# Show both feature-matrix shapes as the cell output.
X_train.shape, X_valid.shape

In [None]:

# Rescale every feature to the [0, 1] interval.
X_scaled = MinMaxScaler().fit_transform(X_train)

# Configure a 2-D UMAP projection with cosine distance, then fit it.
mapper = UMAP(n_components=2, metric="cosine")
mapper.fit(X_scaled)

# Put the 2-D embedding into a DataFrame and attach the training labels.
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
df_emb["label"] = y_train
df_emb.head()