In [1]:
!pip install transformers evaluate datasets > /dev/null

In [2]:
from datasets import load_dataset

In [3]:
# Build a DatasetDict from two plain-text files using the generic "text" loader:
# one file backs the "train" split, the other the "test" split.
my_set = load_dataset(
    "text",
    data_files=dict(
        train=["/content/linux_play.txt"],
        test=["/content/vim_play.txt"],
    ),
)

Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-0a5f1a0b1e89beb1/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-0a5f1a0b1e89beb1/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
my_set

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 277
    })
    test: Dataset({
        features: ['text'],
        num_rows: 62
    })
})

In [9]:
my_set['train'][0:5]['text']

['BEGIN;',
 'Linux Playbook',
 '',
 'The command and scenarios has to be executed inside the',
 'Kali Docker image. The docker image is called linux_playg.']

In [None]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "bert-base-cased" checkpoint. The
# tokenization cell that maps over `my_set` needs this `tokenizer` name to be
# defined, so this cell must be run first.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [10]:
def tokenize_batch(examples):
    """Run the tokenizer over the "text" column of a batch of examples."""
    return tokenizer(examples["text"])


# batched=True hands the function many rows per call rather than one at a time.
dataset = my_set.map(tokenize_batch, batched=True)

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 277
    })
    test: Dataset({
        features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 62
    })
})

In [17]:
# Read the CSV through the generic "csv" loader. With a single unnamed file the
# rows end up in one "train" split.
space_dataset = load_dataset(path="csv", data_files="/content/space_titanic.csv")



  0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
space_dataset

DatasetDict({
    train: Dataset({
        features: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported'],
        num_rows: 8693
    })
})

In [None]:
# Peek at ten passengers drawn after a seeded shuffle; slicing with [:] shows
# the selected rows as a dict of columns.
shuffled_train = space_dataset['train'].shuffle(seed=4)
shuffled_train.select(range(10))[:]

In [18]:
# Rename the label column 'Transported' to 'Target'.
# The rename is guarded so the cell can be re-run safely: once the column has
# been renamed, 'Transported' no longer exists and an unguarded second call
# would raise instead of being a no-op.
if all('Transported' in columns for columns in space_dataset.column_names.values()):
    space_dataset = space_dataset.rename_column(original_column_name='Transported',
                                                new_column_name='Target')

In [19]:
space_dataset

DatasetDict({
    train: Dataset({
        features: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Target'],
        num_rows: 8693
    })
})

In [20]:
space_dataset.unique("Target")

{'train': [False, True]}

In [21]:
space_dataset.unique("CryoSleep")

{'train': [False, True, None]}

In [24]:
space_dataset['train'][0]

{'PassengerId': '0001_01',
 'HomePlanet': 'Europa',
 'CryoSleep': False,
 'Cabin': 'B/0/P',
 'Destination': 'TRAPPIST-1e',
 'Age': 39.0,
 'VIP': False,
 'RoomService': 0.0,
 'FoodCourt': 0.0,
 'ShoppingMall': 0.0,
 'Spa': 0.0,
 'VRDeck': 0.0,
 'Name': 'Maham Ofracculy',
 'Target': False}

In [27]:
def first_name_get(name_value):
  """Extract the first name from a row's 'Name' field.

  Args:
    name_value: Row mapping with a 'Name' key holding a string or None.

  Returns:
    A dict {"firstName": <text before the first space>}. The value is None
    when 'Name' is None, so the function can also be mapped over rows whose
    name is missing instead of raising AttributeError.
  """
  full_name = name_value['Name']
  if full_name is None:
    return {"firstName": None}
  # Only the leading token is needed, so stop splitting after the first space.
  return {"firstName": full_name.split(' ', 1)[0]}

In [36]:
def name_length(name_value):
  """Compute the character length of a row's 'Name' field.

  Args:
    name_value: Row mapping with a 'Name' key holding a string or None.

  Returns:
    A dict {"nameLength": len(Name)}. The value is None when 'Name' is None,
    so a missing name yields a missing length instead of a TypeError from len().
  """
  full_name = name_value['Name']
  if full_name is None:
    return {"nameLength": None}
  return {"nameLength": len(full_name)}

In [30]:
def filter_name_none(name_value):
  """Row predicate: True when the row's 'Name' is present, False when it is None.

  Despite the name, rows for which this returns True are the ones that are kept
  when it is passed to a dataset filter.
  """
  has_name = name_value['Name'] is not None
  return has_name

In [31]:
# Keep only the rows whose 'Name' is not None; the result is stored under a new
# name so the unfiltered `space_dataset` stays available.
space_filter = space_dataset.filter(filter_name_none)

Filter:   0%|          | 0/8693 [00:00<?, ? examples/s]

In [32]:
space_filter

DatasetDict({
    train: Dataset({
        features: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Target'],
        num_rows: 8493
    })
})

In [35]:
# Add a "firstName" column by applying first_name_get to every row.
space_filter = space_filter.map(first_name_get)



In [38]:
# Add a "nameLength" column by applying name_length to every row.
space_filter = space_filter.map(name_length)



In [46]:
# Keep only passengers whose full name is shorter than the limit below.
SHORT_NAME_LIMIT = 10
hextra = space_filter.filter(lambda row: row['nameLength'] < SHORT_NAME_LIMIT)



In [47]:
hextra

DatasetDict({
    train: Dataset({
        features: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Target', 'firstName', 'nameLength'],
        num_rows: 456
    })
})

In [50]:
# Preview an 80/20 train/test split of the short-name subset, seeded so the
# split is repeatable. The result is only displayed here; it is not assigned to
# a variable.
hextra['train'].train_test_split(train_size=0.8,
                                 seed=4)

DatasetDict({
    train: Dataset({
        features: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Target', 'firstName', 'nameLength'],
        num_rows: 364
    })
    test: Dataset({
        features: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Target', 'firstName', 'nameLength'],
        num_rows: 92
    })
})

TypeError: ignored