In [None]:
!nvidia-smi

Mon Oct 28 16:52:39 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Installation

In [None]:
!pip install transformers



## HuggingFace tasks

In [None]:

from transformers import pipeline
#---------------------------------------------------#
#                     NLP TASKS                     #
#---------------------------------------------------#
# Every pipeline(...) call below that omits `model=` falls back to the task's
# default checkpoint and downloads it on first use. In the recorded run each
# pipeline was also placed on CPU because no `device` argument is passed.
# The bare triple-quoted strings are section notes only; they are evaluated
# and discarded.

'''
1. Text Classification: Assigning a category to a piece of text.
Sentiment Analysis
Topic Classification
Spam Detection '''

# Recorded default: distilbert/distilbert-base-uncased-finetuned-sst-2-english
classifier = pipeline("text-classification")

'''
2. Token Classification: Assigning labels to individual tokens in a sequence.
Named Entity Recognition (NER)
Part-of-Speech Tagging
'''

# Recorded default: dbmdz/bert-large-cased-finetuned-conll03-english (1.33 GB download)
token_classifier = pipeline("token-classification")

'''
3. Question Answering: Extracting an answer from a given context based on a question.
'''
# Recorded default: distilbert/distilbert-base-cased-distilled-squad
question_answerer = pipeline("question-answering")

'''
4. Text Generation: Generating text based on a given prompt.
Language Modeling
Story Generation

'''

# Recorded default: openai-community/gpt2
text_generator = pipeline("text-generation")

'''
5. Summarization: Condensing long documents into shorter summaries.
'''

# Recorded default: sshleifer/distilbart-cnn-12-6 (1.22 GB download)
summarizer = pipeline("summarization")

'''
Translation: Translating text from one language to another.
'''

# The only call in this section with an explicit checkpoint (English -> French).
translator = pipeline("translation",
                      model="Helsinki-NLP/opus-mt-en-fr")

'''
6. Text2Text Generation: General-purpose text transformation, including summarization and translation.
'''

# Recorded default: google-t5/t5-base
text2text_generator = pipeline("text2text-generation")

'''
7. Fill-Mask: Predicting the masked token in a sequence.
'''

# Recorded default: distilbert/distilroberta-base
fill_mask = pipeline("fill-mask")

'''
8. Feature Extraction: Extracting hidden states or features from text.
'''

# Recorded default: distilbert/distilbert-base-cased
feature_extractor = pipeline("feature-extraction")

'''
9. Sentence Similarity: Measuring the similarity between two sentences.
'''
# "sentence-similarity" is not a task known to transformers.pipeline: the
# original call raised KeyError("Unknown task sentence-similarity ...") and
# aborted the rest of this cell. Sentence embeddings are obtained through the
# supported "feature-extraction" task with a sentence-embedding checkpoint;
# similarity between two sentences is then the cosine similarity of their
# (mean-pooled) embeddings.
sentence_similarity = pipeline("feature-extraction",
                               model="sentence-transformers/all-MiniLM-L6-v2")

#---------------------------------------------------#
#             Computer Vision TASKS                 #
#---------------------------------------------------#
# No `model=` is given for any pipeline in this section, so each one uses the
# task's default checkpoint.

'''
1. Image Classification: Classifying the main content of an image.

'''

image_classifier = pipeline("image-classification")

'''
2. Object Detection: Identifying objects within an image and their bounding boxes.
'''

object_detector = pipeline("object-detection")

'''
3. Image Segmentation: Segmenting different parts of an image into classes.
'''

image_segmenter = pipeline("image-segmentation")

# Note only: no pipeline is created for image generation (no such task appears
# in the list of available tasks printed by the recorded KeyError).
'''
4. Image Generation: Generating images from textual descriptions (using DALL-E or similar models).
'''

#---------------------------------------------------#
#             Speech Processing TASKS               #
#---------------------------------------------------#

'''
1. Automatic Speech Recognition (ASR): Converting spoken language into text.
'''

# No `model=` given, so the task's default checkpoint is used.
speech_recognizer = pipeline("automatic-speech-recognition")

# Notes only: no pipeline is created for these two tasks in this cell.
'''
2. Speech Translation: Translating spoken language from one language to another.
3. Audio Classification: Classifying audio signals into predefined categories.
'''

#---------------------------------------------------#
#                   Multimodal TASKS                #
#---------------------------------------------------#

'''
1. Image Captioning: Generating a textual description of an image.
'''
# Captioning goes through the "image-to-text" task identifier.
image_captioner = pipeline("image-to-text")
# Note only: no VQA pipeline is created in this cell.
'''
2. Visual Question Answering (VQA): Answering questions about the content of an image.
'''

#---------------------------------------------------#
#                     Other TASKS                   #
#---------------------------------------------------#
'''
1. Table Question Answering: Answering questions based on tabular data.
'''
table_qa = pipeline("table-question-answering")

'''
2. Document Question Answering: Extracting answers from documents like PDFs.

'''
doc_qa = pipeline("document-question-answering")
# Note only: no pipeline is created for time series forecasting.
# NOTE(review): this string is the last expression of the cell, so it is what
# the cell would display as its output when the cell runs to completion.
'''
3. Time Series Forecasting: Predicting future values in time series data (not directly supported in the main Transformers library but available through extensions).
'''

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
No model was supplied, defaulted to google-t5/t5-base and revision 686f1db (https://huggingface.co/google-t5/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
No model was supplied, defaulted to distilbert/distilroberta-base and revision ec58a5b (https://huggingface.co/distilbert/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
No model was supplied, defaulted to distilbert/distilbert-base-cased and revision 935ac13 (https://huggingface.co/distilbert/distilbert-base-cased).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


KeyError: "Unknown task sentence-similarity, available tasks are ['audio-classification', 'automatic-speech-recognition', 'depth-estimation', 'document-question-answering', 'feature-extraction', 'fill-mask', 'image-classification', 'image-feature-extraction', 'image-segmentation', 'image-to-image', 'image-to-text', 'mask-generation', 'ner', 'object-detection', 'question-answering', 'sentiment-analysis', 'summarization', 'table-question-answering', 'text-classification', 'text-generation', 'text-to-audio', 'text-to-speech', 'text2text-generation', 'token-classification', 'translation', 'video-classification', 'visual-question-answering', 'vqa', 'zero-shot-audio-classification', 'zero-shot-classification', 'zero-shot-image-classification', 'zero-shot-object-detection', 'translation_XX_to_YY']"

## Sentiment Analysis

In [None]:
from transformers import pipeline

# No `model=` is given; the recorded run defaulted to
# distilbert/distilbert-base-uncased-finetuned-sst-2-english, loaded on CPU.
classifier = pipeline("sentiment-analysis")
result = classifier(" I was so not happy with the last Mission Impossible Movie")
# The pipeline returns a list holding one {'label': ..., 'score': ...} dict.
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'NEGATIVE', 'score': 0.9997795224189758}]


In [None]:
# One-off call: builds a default sentiment pipeline and applies it immediately.
# The returned list is the last expression, so it is shown as the cell output.
pipeline(task = "sentiment-analysis")("I was confused with the Barbie Movie")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'NEGATIVE', 'score': 0.9992005228996277}]

In [None]:
 # The review is written as adjacent string literals inside the call's
 # parentheses. The previous backslash continuations sat *inside* one string
 # literal, so the indentation of every continued line became part of the text
 # sent to the model; the apostrophes were also mis-encoded.
 pipeline(task="sentiment-analysis")(
     "Working on this project has been an incredible experience; "
     "although it’s challenging at times, "
     "I feel like I’m learning a lot and making meaningful progress each day, "
     "which keeps me motivated and excited"
 )

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'POSITIVE', 'score': 0.9998701810836792}]

In [None]:
# Demonstrates choosing the checkpoint explicitly via `model=`.
# The checkpoint must be fine-tuned for sentiment classification:
# facebook/bart-large is a base model, so its classification head was randomly
# initialised on load ("You should probably TRAIN this model ...") and the
# recorded prediction (LABEL_0, score ~0.47) carried no meaning.
# The review is written as adjacent string literals so that no indentation
# whitespace ends up inside the text, and the apostrophes are restored.
pipeline(task="sentiment-analysis", model="siebert/sentiment-roberta-large-english")(
    "Working on this project has been an incredible experience; "
    "although it’s challenging at times, "
    "I feel like I’m learning a lot and making meaningful progress each day, "
    "which keeps me motivated and excited"
)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'LABEL_0', 'score': 0.46523335576057434}]

## Text Generation

In [None]:
from transformers  import pipeline
# Explicit checkpoint (distilgpt2) instead of the task default.
text_generator = pipeline("text-generation", model="distilbert/distilgpt2")
# Two continuations are requested (num_return_sequences=2); the result is a
# list with one {'generated_text': ...} dict per continuation.
generated_text = text_generator(" Today is a rainy day in London", truncation = True, num_return_sequences=2)

# Only the first of the two returned continuations is printed.
print(" Generated_text: \n", generated_text[0]['generated_text'])


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 Generated_text: 
  Today is a rainy day in London today, but as you know it's not nearly as bad today as in 2016; it has been the most rainy the country has seen during the last century.


But what's to remember about these days


## Question Answering

In [None]:
from transformers import pipeline

# No `model=` is given; the recorded run defaulted to
# distilbert/distilbert-base-cased-distilled-squad.
qa_model = pipeline("question-answering")
question = "What is my job?"
context = "I am developing AI models with Python"
# Extractive QA: the recorded result is a dict with 'score', the 'start'/'end'
# character offsets into `context`, and the extracted 'answer' span.
qa_model(question = question, context=context)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'score': 0.82672119140625,
 'start': 5,
 'end': 25,
 'answer': 'developing AI models'}

# Tokenization

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification


In [None]:
# --- Model 1: DistilBERT fine-tuned on SST-2, loaded via the model-specific classes.
model_name1 = "distilbert-base-uncased-finetuned-sst-2-english"
mytokenizer1 = DistilBertTokenizer.from_pretrained(model_name1)
mymodel1 = DistilBertForSequenceClassification.from_pretrained(model_name1)

# Hand the objects loaded above to the pipeline. Previously the pipeline was
# built with no arguments, so it silently re-resolved a default checkpoint and
# `mymodel1` / `mytokenizer1` were never used.
classifier = pipeline("sentiment-analysis", model=mymodel1, tokenizer=mytokenizer1)
res = classifier(" I was so not happy with the Barbie Movie")
print(res)

# --- Model 2: multilingual BERT review-rating model, loaded via the Auto* classes.
# The recorded output for this model is a star-rating label ('2 stars').
model_name2 = "nlptown/bert-base-multilingual-uncased-sentiment"
mymodel2 = AutoModelForSequenceClassification.from_pretrained(model_name2)
mytokenizer2 = AutoTokenizer.from_pretrained(model_name2)


# `classifier` is deliberately rebound: same input sentence, different model.
classifier = pipeline("sentiment-analysis", model=mymodel2, tokenizer=mytokenizer2)
res = classifier(" I was so not happy with the Barbie Movie")
print(res)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'NEGATIVE', 'score': 0.9998108744621277}]


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': '2 stars', 'score': 0.5099301934242249}]


In [None]:
from transformers import AutoTokenizer

# Load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Example text
text = " I was so not happy with the Barbie Movie"

# Tokenize a sentence: text -> list of token strings.
# In the recorded output the tokens are lower-cased and no special tokens are added.
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Convert tokens to input IDs (one vocabulary id per token, still no special tokens)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Input IDs:", input_ids)

# Encode the text (tokenization + converting to input IDs).
# Calling the tokenizer directly also returns token_type_ids and attention_mask,
# and the recorded input_ids gain 101 / 102 at the ends (BERT's [CLS] / [SEP]).
encoded_input = tokenizer(text)
print("Encoded Input:", encoded_input)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokens: ['i', 'was', 'so', 'not', 'happy', 'with', 'the', 'barbie', 'movie']
Input IDs: [1045, 2001, 2061, 2025, 3407, 2007, 1996, 22635, 3185]
Encoded Input: {'input_ids': [101, 1045, 2001, 2061, 2025, 3407, 2007, 1996, 22635, 3185, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}




In [None]:
# Decode the ids back into text. `tokenizer` and `input_ids` come from the
# previous cell; `input_ids` holds no special tokens, and the recorded result
# is the lower-cased sentence without the leading space.
decoded_text = tokenizer.decode(input_ids)
print("Decoded Text:", decoded_text)

Decoded Text: i was so not happy with the barbie movie


# Fine Tuning IMDB Dataset
## Step 1: Install Necessary Libraries

In [None]:
 !pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m472.7/472.7 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-

## Step 2: Load and Prepare the Dataset


In [None]:
from datasets import load_dataset
# Download the IMDB dataset. The recorded run yields a DatasetDict with
# 'train' and 'test' (25,000 rows each) and 'unsupervised' (50,000 rows).
dataset = load_dataset('imdb')

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## Step 3: Preprocess the Data

Tokenize the dataset using the tokenizer associated with the pre-trained model.

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for "bert-base-uncased", the same checkpoint name the
# model is loaded from in Step 5, so token ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Batch tokenization callback, meant to be passed to `dataset.map(..., batched=True)`.
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: mapping that provides the raw texts under the "text" key.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts when
        asked to truncate and to pad every sequence to the maximum length.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Apply the tokenizer to every split in batches. The recorded run maps all
# three splits (including 'unsupervised') and adds the 'input_ids',
# 'token_type_ids' and 'attention_mask' columns next to 'text' and 'label'.
tokenized_datasets = dataset.map(preprocess_function, batched=True)



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

## Step 4: Setup the Training Arguments

Specify the hyperparameters and training settings.

In [None]:
from transformers import TrainingArguments

# Hyperparameters and run settings for the fine-tuning job.
# `eval_strategy` replaces the deprecated `evaluation_strategy` keyword; the
# recorded repr of these arguments lists both names, so the installed
# transformers version already accepts the new one.
training_args = TrainingArguments(
    output_dir='./results',                # Output directory passed to the Trainer
    eval_strategy='epoch',                 # Evaluate at the end of each epoch
    learning_rate=2e-5,                    # Learning rate for the optimizer
    num_train_epochs=3,                    # Total number of training epochs
    per_device_train_batch_size=16,        # Batch size for training
    per_device_eval_batch_size=16,         # Batch size for evaluation
    weight_decay=0.01,                     # Weight decay for regularization
    run_name='my_unique_run_name'          # Specify a unique run name for WandB
)

# Display the training arguments
print(training_args)


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object=False,
evaluation_strategy=epoch,
fp



## Step 5: Initialize the Model
Load the pre-trained model and define the training procedure

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer

# Load the pre-trained model
# num_labels=2 requests a two-class classification head. The recorded load
# warns that classifier.weight / classifier.bias are newly initialised, i.e.
# the head is untrained until fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define the training procedure
# Trains on the tokenized 'train' split and evaluates on the tokenized 'test'
# split. No `compute_metrics` is passed, and the recorded evaluation reports
# only the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step 6: Train the Model
Fine-tune the pre-trained model on your own dataset (here, IMDB).

In [None]:
# Train the model
# In the recorded run this call first prompted for a Weights & Biases API key,
# and training was stopped manually (KeyboardInterrupt) after the first epoch
# had been reported.
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.2095,0.205636


KeyboardInterrupt: 

## Step 7: Evaluate the Model
Assess the model's performance on a validation set

In [None]:
# Evaluate the model
# Runs evaluation on the `eval_dataset` given to the Trainer. The recorded
# result contains only 'eval_loss'.
results = trainer.evaluate()
print(results)

Epoch,Training Loss,Validation Loss
1,0.1376,0.226193


{'eval_loss': 0.22619327902793884}


## Step 8: Save the Fine-tuned model
Saving fine-tuned model for later use.

In [None]:
# Write the model and the tokenizer into the same directory.
# `model` is the object that was handed to the Trainer, so it is saved in
# whatever state training left it (the recorded training run was interrupted).
model.save_pretrained("./fine_tuned_model")
# Last expression of the cell: the tuple of written tokenizer file paths is
# shown as the cell output.
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

# ArXiv Project

In [None]:
!pip install arxiv
import arxiv
import pandas as pd

Collecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.1.3-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m81.3/81.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=fdeb6467c0a6d6687949e0a0bb66a2a98afdc6f75d074fb0ab6ff6f018e14aad
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce6052

In [None]:
# Query to fetch AI-related papers ("artificial" was misspelled before, so
# that term could not match as intended).
# NOTE(review): the arXiv query syntax uses double quotes for multi-word
# phrases; consider '"artificial intelligence"' / '"machine learning"' if
# phrase matching is what is wanted here.
query = 'ai OR artificial intelligence OR machine learning'
search = arxiv.Search(query=query, max_results=10, sort_by=arxiv.SortCriterion.SubmittedDate)

# Fetch the papers through a Client: Search.results() is deprecated in
# arxiv 2.x (the recorded run emitted a warning pointing at that loop).
client = arxiv.Client()

# One record per paper. 'abstract' and 'summary' both carry result.summary;
# both keys are kept so the resulting columns stay unchanged.
papers = [
    {
        'title': result.title,
        'abstract': result.summary,
        'authors': result.authors,
        'summary': result.summary,
        'pdf_url': result.pdf_url,
        'categories': result.categories,
    }
    for result in client.results(search)
]

# Converting to DataFrame
df = pd.DataFrame(papers)
# Show full cell contents (abstracts are long) instead of truncating them.
pd.set_option('display.max_colwidth', None)
df.head(10)

  for result in search.results():


Unnamed: 0,title,abstract,authors,summary,pdf_url,categories
0,The Potential and Value of AI Chatbot in Personalized Cognitive Training,"In recent years, the rapid aging of the global population has led to an\nincrease in cognitive disorders, such as Alzheimer's disease, presenting\nsignificant public health challenges. Although no effective treatments\ncurrently exist to reverse Alzheimer's, prevention and early intervention,\nincluding cognitive training, are critical. This report explores the potential\nof AI chatbots in enhancing personalized cognitive training. We introduce ReMe,\na web-based framework designed to create AI chatbots that facilitate cognitive\ntraining research, specifically targeting episodic memory tasks derived from\npersonal life logs. By leveraging large language models, ReMe provides enhanced\nuser-friendly, interactive, and personalized training experiences. Case studies\ndemonstrate ReMe's effectiveness in engaging users through life recall and\nopen-ended language puzzles, highlighting its potential to improve cognitive\ntraining design. Despite promising results, further research is needed to\nvalidate training effectiveness through large-scale studies that include\ncognitive ability evaluations. Overall, ReMe offers a promising approach to\npersonalized cognitive training, utilizing AI capabilities to meet the growing\ndemand for non-pharmacological interventions in cognitive health, with future\nresearch aiming to expand its applications and efficacy.","[Zilong Wang, Nan Chen, Luna K. Qiu, Ling Yue, Geli Guo, Yang Ou, Shiqi Jiang, Yuqing Yang, Lili Qiu]","In recent years, the rapid aging of the global population has led to an\nincrease in cognitive disorders, such as Alzheimer's disease, presenting\nsignificant public health challenges. Although no effective treatments\ncurrently exist to reverse Alzheimer's, prevention and early intervention,\nincluding cognitive training, are critical. This report explores the potential\nof AI chatbots in enhancing personalized cognitive training. 
We introduce ReMe,\na web-based framework designed to create AI chatbots that facilitate cognitive\ntraining research, specifically targeting episodic memory tasks derived from\npersonal life logs. By leveraging large language models, ReMe provides enhanced\nuser-friendly, interactive, and personalized training experiences. Case studies\ndemonstrate ReMe's effectiveness in engaging users through life recall and\nopen-ended language puzzles, highlighting its potential to improve cognitive\ntraining design. Despite promising results, further research is needed to\nvalidate training effectiveness through large-scale studies that include\ncognitive ability evaluations. Overall, ReMe offers a promising approach to\npersonalized cognitive training, utilizing AI capabilities to meet the growing\ndemand for non-pharmacological interventions in cognitive health, with future\nresearch aiming to expand its applications and efficacy.",http://arxiv.org/pdf/2410.19733v1,[cs.AI]
1,Counting Ability of Large Language Models and Impact of Tokenization,"Transformers, the backbone of modern large language models (LLMs), face\ninherent architectural limitations that impede their reasoning capabilities.\nUnlike recurrent networks, Transformers lack recurrent connections, confining\nthem to constant-depth computation. This restriction places them in the\ncomplexity class TC$^0$, making them theoretically incapable of solving tasks\nthat demand increasingly deep reasoning as input length grows. Counting, a\nfundamental component of many reasoning tasks, also requires reasoning depth to\ngrow linearly to be performed inductively. While previous studies have\nestablished the upper limits of counting ability in Transformer-based expert\nmodels (i.e., models specifically trained for counting tasks), these findings\ndo not directly extend to general-purpose LLMs due to differences in reasoning\nmechanisms. Recent work has highlighted how Chain of Thought (CoT) reasoning\ncan help alleviate some of the architectural limitations of Transformers in\ncounting tasks. However, little attention has been paid to the role of\ntokenization in these models. Unlike expert models that often use\ncharacter-level tokenization, LLMs typically rely on byte-level (BPE)\ntokenizers, which fundamentally alters the way reasoning is processed. Our work\ninvestigates the impact of tokenization on the counting abilities of LLMs,\nuncovering substantial performance variations based on input tokenization\ndifferences. 
We provide both theoretical and experimental analyses, offering\ninsights into how tokenization choices can undermine models' theoretical\ncomputability, thereby inspiring the design of new tokenization methods to\nenhance reasoning in LLMs.","[Xiang Zhang, Juntai Cao, Chenyu You]","Transformers, the backbone of modern large language models (LLMs), face\ninherent architectural limitations that impede their reasoning capabilities.\nUnlike recurrent networks, Transformers lack recurrent connections, confining\nthem to constant-depth computation. This restriction places them in the\ncomplexity class TC$^0$, making them theoretically incapable of solving tasks\nthat demand increasingly deep reasoning as input length grows. Counting, a\nfundamental component of many reasoning tasks, also requires reasoning depth to\ngrow linearly to be performed inductively. While previous studies have\nestablished the upper limits of counting ability in Transformer-based expert\nmodels (i.e., models specifically trained for counting tasks), these findings\ndo not directly extend to general-purpose LLMs due to differences in reasoning\nmechanisms. Recent work has highlighted how Chain of Thought (CoT) reasoning\ncan help alleviate some of the architectural limitations of Transformers in\ncounting tasks. However, little attention has been paid to the role of\ntokenization in these models. Unlike expert models that often use\ncharacter-level tokenization, LLMs typically rely on byte-level (BPE)\ntokenizers, which fundamentally alters the way reasoning is processed. Our work\ninvestigates the impact of tokenization on the counting abilities of LLMs,\nuncovering substantial performance variations based on input tokenization\ndifferences. 
We provide both theoretical and experimental analyses, offering\ninsights into how tokenization choices can undermine models' theoretical\ncomputability, thereby inspiring the design of new tokenization methods to\nenhance reasoning in LLMs.",http://arxiv.org/pdf/2410.19730v1,"[cs.CL, cs.AI]"
2,"cymyc -- Calabi-Yau Metrics, Yukawas, and Curvature","We introduce \texttt{cymyc}, a high-performance Python library for numerical\ninvestigation of the geometry of a large class of string compactification\nmanifolds and their associated moduli spaces. We develop a well-defined\ngeometric ansatz to numerically model tensor fields of arbitrary degree on a\nlarge class of Calabi-Yau manifolds. \texttt{cymyc} includes a machine learning\ncomponent which incorporates this ansatz to model tensor fields of interest on\nthese spaces by finding an approximate solution to the system of partial\ndifferential equations they should satisfy.","[Per Berglund, Giorgi Butbaia, Tristan H√ºbsch, Vishnu Jejjala, Challenger Mishra, Dami√°n Mayorga Pe√±a, Justin Tan]","We introduce \texttt{cymyc}, a high-performance Python library for numerical\ninvestigation of the geometry of a large class of string compactification\nmanifolds and their associated moduli spaces. We develop a well-defined\ngeometric ansatz to numerically model tensor fields of arbitrary degree on a\nlarge class of Calabi-Yau manifolds. \texttt{cymyc} includes a machine learning\ncomponent which incorporates this ansatz to model tensor fields of interest on\nthese spaces by finding an approximate solution to the system of partial\ndifferential equations they should satisfy.",http://arxiv.org/pdf/2410.19728v1,"[hep-th, cs.LG, hep-ph]"
3,"FISHNET: Financial Intelligence from Sub-querying, Harmonizing, Neural-Conditioning, Expert Swarms, and Task Planning","Financial intelligence generation from vast data sources has typically relied\non traditional methods of knowledge-graph construction or database engineering.\nRecently, fine-tuned financial domain-specific Large Language Models (LLMs),\nhave emerged. While these advancements are promising, limitations such as high\ninference costs, hallucinations, and the complexity of concurrently analyzing\nhigh-dimensional financial data, emerge. This motivates our invention FISHNET\n(Financial Intelligence from Sub-querying, Harmonizing, Neural-Conditioning,\nExpert swarming, and Task planning), an agentic architecture that accomplishes\nhighly complex analytical tasks for more than 98,000 regulatory filings that\nvary immensely in terms of semantics, data hierarchy, or format. FISHNET shows\nremarkable performance for financial insight generation (61.8% success rate\nover 5.0% Routing, 45.6% RAG R-Precision). We conduct rigorous ablations to\nempirically prove the success of FISHNET, each agent's importance, and the\noptimized performance of assembling all agents. Our modular architecture can be\nleveraged for a myriad of use-cases, enabling scalability, flexibility, and\ndata integrity that are critical for financial tasks.","[Nicole Cho, Nishan Srishankar, Lucas Cecchi, William Watson]","Financial intelligence generation from vast data sources has typically relied\non traditional methods of knowledge-graph construction or database engineering.\nRecently, fine-tuned financial domain-specific Large Language Models (LLMs),\nhave emerged. While these advancements are promising, limitations such as high\ninference costs, hallucinations, and the complexity of concurrently analyzing\nhigh-dimensional financial data, emerge. 
This motivates our invention FISHNET\n(Financial Intelligence from Sub-querying, Harmonizing, Neural-Conditioning,\nExpert swarming, and Task planning), an agentic architecture that accomplishes\nhighly complex analytical tasks for more than 98,000 regulatory filings that\nvary immensely in terms of semantics, data hierarchy, or format. FISHNET shows\nremarkable performance for financial insight generation (61.8% success rate\nover 5.0% Routing, 45.6% RAG R-Precision). We conduct rigorous ablations to\nempirically prove the success of FISHNET, each agent's importance, and the\noptimized performance of assembling all agents. Our modular architecture can be\nleveraged for a myriad of use-cases, enabling scalability, flexibility, and\ndata integrity that are critical for financial tasks.",http://arxiv.org/pdf/2410.19727v1,"[cs.AI, cs.CL, cs.IR, cs.LG]"
4,On the Benefits of Active Data Collection in Operator Learning,"We investigate active data collection strategies for operator learning when\nthe target operator is linear and the input functions are drawn from a\nmean-zero stochastic process with continuous covariance kernels. With an active\ndata collection strategy, we establish an error convergence rate in terms of\nthe decay rate of the eigenvalues of the covariance kernel. Thus, with\nsufficiently rapid eigenvalue decay of the covariance kernels, arbitrarily fast\nerror convergence rates can be achieved. This contrasts with the passive\n(i.i.d.) data collection strategies, where the convergence rate is never faster\nthan $\sim n^{-1}$. In fact, for our setting, we establish a\n\emph{non-vanishing} lower bound for any passive data collection strategy,\nregardless of the eigenvalues decay rate of the covariance kernel. Overall, our\nresults show the benefit of active over passive data collection strategies in\noperator learning.","[Unique Subedi, Ambuj Tewari]","We investigate active data collection strategies for operator learning when\nthe target operator is linear and the input functions are drawn from a\nmean-zero stochastic process with continuous covariance kernels. With an active\ndata collection strategy, we establish an error convergence rate in terms of\nthe decay rate of the eigenvalues of the covariance kernel. Thus, with\nsufficiently rapid eigenvalue decay of the covariance kernels, arbitrarily fast\nerror convergence rates can be achieved. This contrasts with the passive\n(i.i.d.) data collection strategies, where the convergence rate is never faster\nthan $\sim n^{-1}$. In fact, for our setting, we establish a\n\emph{non-vanishing} lower bound for any passive data collection strategy,\nregardless of the eigenvalues decay rate of the covariance kernel. 
Overall, our\nresults show the benefit of active over passive data collection strategies in\noperator learning.",http://arxiv.org/pdf/2410.19725v1,"[stat.ML, cs.LG]"
5,Sparse Decomposition of Graph Neural Networks,"Graph Neural Networks (GNN) exhibit superior performance in graph\nrepresentation learning, but their inference cost can be high, due to an\naggregation operation that can require a memory fetch for a very large number\nof nodes. This inference cost is the major obstacle to deploying GNN models\nwith \emph{online prediction} to reflect the potentially dynamic node features.\nTo address this, we propose an approach to reduce the number of nodes that are\nincluded during aggregation. We achieve this through a sparse decomposition,\nlearning to approximate node representations using a weighted sum of linearly\ntransformed features of a carefully selected subset of nodes within the\nextended neighbourhood. The approach achieves linear complexity with respect to\nthe average node degree and the number of layers in the graph neural network.\nWe introduce an algorithm to compute the optimal parameters for the sparse\ndecomposition, ensuring an accurate approximation of the original GNN model,\nand present effective strategies to reduce the training time and improve the\nlearning process. We demonstrate via extensive experiments that our method\noutperforms other baselines designed for inference speedup, achieving\nsignificant accuracy gains with comparable inference times for both node\nclassification and spatio-temporal forecasting tasks.","[Yaochen Hu, Mai Zeng, Ge Zhang, Pavel Rumiantsev, Liheng Ma, Yingxue Zhang, Mark Coates]","Graph Neural Networks (GNN) exhibit superior performance in graph\nrepresentation learning, but their inference cost can be high, due to an\naggregation operation that can require a memory fetch for a very large number\nof nodes. This inference cost is the major obstacle to deploying GNN models\nwith \emph{online prediction} to reflect the potentially dynamic node features.\nTo address this, we propose an approach to reduce the number of nodes that are\nincluded during aggregation. 
We achieve this through a sparse decomposition,\nlearning to approximate node representations using a weighted sum of linearly\ntransformed features of a carefully selected subset of nodes within the\nextended neighbourhood. The approach achieves linear complexity with respect to\nthe average node degree and the number of layers in the graph neural network.\nWe introduce an algorithm to compute the optimal parameters for the sparse\ndecomposition, ensuring an accurate approximation of the original GNN model,\nand present effective strategies to reduce the training time and improve the\nlearning process. We demonstrate via extensive experiments that our method\noutperforms other baselines designed for inference speedup, achieving\nsignificant accuracy gains with comparable inference times for both node\nclassification and spatio-temporal forecasting tasks.",http://arxiv.org/pdf/2410.19723v1,"[cs.LG, cs.AI]"
6,Temporal Convolution-based Hybrid Model Approach with Representation Learning for Real-Time Acoustic Anomaly Detection,"The early detection of potential failures in industrial machinery components\nis paramount for ensuring the reliability and safety of operations, thereby\npreserving Machine Condition Monitoring (MCM). This research addresses this\nimperative by introducing an innovative approach to Real-Time Acoustic Anomaly\nDetection. Our method combines semi-supervised temporal convolution with\nrepresentation learning and a hybrid model strategy with Temporal Convolutional\nNetworks (TCN) to handle various intricate anomaly patterns found in acoustic\ndata effectively. The proposed model demonstrates superior performance compared\nto established research in the field, underscoring the effectiveness of this\napproach. Not only do we present quantitative evidence of its superiority, but\nwe also employ visual representations, such as t-SNE plots, to further\nsubstantiate the model's efficacy.","[Sahan Dissanayaka, Manjusri Wickramasinghe, Pasindu Marasinghe]","The early detection of potential failures in industrial machinery components\nis paramount for ensuring the reliability and safety of operations, thereby\npreserving Machine Condition Monitoring (MCM). This research addresses this\nimperative by introducing an innovative approach to Real-Time Acoustic Anomaly\nDetection. Our method combines semi-supervised temporal convolution with\nrepresentation learning and a hybrid model strategy with Temporal Convolutional\nNetworks (TCN) to handle various intricate anomaly patterns found in acoustic\ndata effectively. The proposed model demonstrates superior performance compared\nto established research in the field, underscoring the effectiveness of this\napproach. 
Not only do we present quantitative evidence of its superiority, but\nwe also employ visual representations, such as t-SNE plots, to further\nsubstantiate the model's efficacy.",http://arxiv.org/pdf/2410.19722v1,"[cs.SD, cs.LG, eess.AS]"
7,2D-DPO: Scaling Direct Preference Optimization with 2-Dimensional Supervision,"Recent advancements in Direct Preference Optimization (DPO) have\nsignificantly enhanced the alignment of Large Language Models (LLMs) with human\npreferences, owing to its simplicity and effectiveness. However, existing\nmethods typically optimize a scalar score or ranking reward, thereby\noverlooking the multi-dimensional nature of human preferences. In this work, we\npropose to extend the preference of DPO to two dimensions: segments and\naspects. We first introduce a 2D supervision dataset called HelpSteer-2D. For\nthe segment dimension, we divide the response into sentences and assign scores\nto each segment. For the aspect dimension, we meticulously design several\ncriteria covering the response quality rubrics. With the 2-dimensional signals\nas feedback, we develop a 2D-DPO framework, decomposing the overall objective\ninto multi-segment and multi-aspect objectives. Extensive experiments on\npopular benchmarks demonstrate that 2D-DPO performs better than methods that\noptimize for scalar or 1-dimensional preferences.","[Shilong Li, Yancheng He, Hui Huang, Xingyuan Bu, Jiaheng Liu, Hangyu Guo, Weixun Wang, Jihao Gu, Wenbo Su, Bo Zheng]","Recent advancements in Direct Preference Optimization (DPO) have\nsignificantly enhanced the alignment of Large Language Models (LLMs) with human\npreferences, owing to its simplicity and effectiveness. However, existing\nmethods typically optimize a scalar score or ranking reward, thereby\noverlooking the multi-dimensional nature of human preferences. In this work, we\npropose to extend the preference of DPO to two dimensions: segments and\naspects. We first introduce a 2D supervision dataset called HelpSteer-2D. For\nthe segment dimension, we divide the response into sentences and assign scores\nto each segment. For the aspect dimension, we meticulously design several\ncriteria covering the response quality rubrics. 
With the 2-dimensional signals\nas feedback, we develop a 2D-DPO framework, decomposing the overall objective\ninto multi-segment and multi-aspect objectives. Extensive experiments on\npopular benchmarks demonstrate that 2D-DPO performs better than methods that\noptimize for scalar or 1-dimensional preferences.",http://arxiv.org/pdf/2410.19720v1,"[cs.CL, cs.AI]"
8,Arabic Music Classification and Generation using Deep Learning,"This paper proposes a machine learning approach for classifying classical and\nnew Egyptian music by composer and generating new similar music. The proposed\nsystem utilizes a convolutional neural network (CNN) for classification and a\nCNN autoencoder for generation. The dataset used in this project consists of\nnew and classical Egyptian music pieces composed by different composers.\n To classify the music by composer, each sample is normalized and transformed\ninto a mel spectrogram. The CNN model is trained on the dataset using the mel\nspectrograms as input features and the composer labels as output classes. The\nmodel achieves 81.4\% accuracy in classifying the music by composer,\ndemonstrating the effectiveness of the proposed approach.\n To generate new music similar to the original pieces, a CNN autoencoder is\ntrained on a similar dataset. The model is trained to encode the mel\nspectrograms of the original pieces into a lower-dimensional latent space and\nthen decode them back into the original mel spectrogram. The generated music is\nproduced by sampling from the latent space and decoding the samples back into\nmel spectrograms, which are then transformed into audio.\n In conclusion, the proposed system provides a promising approach to\nclassifying and generating classical Egyptian music, which can be applied in\nvarious musical applications, such as music recommendation systems, music\nproduction, and music education.","[Mohamed Elshaarawy, Ashrakat Saeed, Mariam Sheta, Abdelrahman Said, Asem Bakr, Omar Bahaa, Walid Gomaa]","This paper proposes a machine learning approach for classifying classical and\nnew Egyptian music by composer and generating new similar music. The proposed\nsystem utilizes a convolutional neural network (CNN) for classification and a\nCNN autoencoder for generation. 
The dataset used in this project consists of\nnew and classical Egyptian music pieces composed by different composers.\n To classify the music by composer, each sample is normalized and transformed\ninto a mel spectrogram. The CNN model is trained on the dataset using the mel\nspectrograms as input features and the composer labels as output classes. The\nmodel achieves 81.4\% accuracy in classifying the music by composer,\ndemonstrating the effectiveness of the proposed approach.\n To generate new music similar to the original pieces, a CNN autoencoder is\ntrained on a similar dataset. The model is trained to encode the mel\nspectrograms of the original pieces into a lower-dimensional latent space and\nthen decode them back into the original mel spectrogram. The generated music is\nproduced by sampling from the latent space and decoding the samples back into\nmel spectrograms, which are then transformed into audio.\n In conclusion, the proposed system provides a promising approach to\nclassifying and generating classical Egyptian music, which can be applied in\nvarious musical applications, such as music recommendation systems, music\nproduction, and music education.",http://arxiv.org/pdf/2410.19719v1,"[cs.SD, cs.AI, eess.AS]"
9,Evolving Neural Networks Reveal Emergent Collective Behavior from Minimal Agent Interactions,"Understanding the mechanisms behind emergent behaviors in multi-agent systems\nis critical for advancing fields such as swarm robotics and artificial\nintelligence. In this study, we investigate how neural networks evolve to\ncontrol agents' behavior in a dynamic environment, focusing on the relationship\nbetween the network's complexity and collective behavior patterns. By\nperforming quantitative and qualitative analyses, we demonstrate that the\ndegree of network non-linearity correlates with the complexity of emergent\nbehaviors. Simpler behaviors, such as lane formation and laminar flow, are\ncharacterized by more linear network operations, while complex behaviors like\nswarming and flocking show highly non-linear neural processing. Moreover,\nspecific environmental parameters, such as moderate noise, broader field of\nview, and lower agent density, promote the evolution of non-linear networks\nthat drive richer, more intricate collective behaviors. These results highlight\nthe importance of tuning evolutionary conditions to induce desired behaviors in\nmulti-agent systems, offering new pathways for optimizing coordination in\nautonomous swarms. Our findings contribute to a deeper understanding of how\nneural mechanisms influence collective dynamics, with implications for the\ndesign of intelligent, self-organizing systems.","[Guilherme S. Y. Giardini, John F. Hardy II, Carlo R. da Cunha]","Understanding the mechanisms behind emergent behaviors in multi-agent systems\nis critical for advancing fields such as swarm robotics and artificial\nintelligence. In this study, we investigate how neural networks evolve to\ncontrol agents' behavior in a dynamic environment, focusing on the relationship\nbetween the network's complexity and collective behavior patterns. 
By\nperforming quantitative and qualitative analyses, we demonstrate that the\ndegree of network non-linearity correlates with the complexity of emergent\nbehaviors. Simpler behaviors, such as lane formation and laminar flow, are\ncharacterized by more linear network operations, while complex behaviors like\nswarming and flocking show highly non-linear neural processing. Moreover,\nspecific environmental parameters, such as moderate noise, broader field of\nview, and lower agent density, promote the evolution of non-linear networks\nthat drive richer, more intricate collective behaviors. These results highlight\nthe importance of tuning evolutionary conditions to induce desired behaviors in\nmulti-agent systems, offering new pathways for optimizing coordination in\nautonomous swarms. Our findings contribute to a deeper understanding of how\nneural mechanisms influence collective dynamics, with implications for the\ndesign of intelligent, self-organizing systems.",http://arxiv.org/pdf/2410.19718v1,"[nlin.AO, cs.AI, cs.MA]"


In [None]:
import torch
from transformers import pipeline

# Example abstract from API (first row of the `abstract` column).
abstract = df['abstract'][0]

# Without an explicit `device`, the pipeline keeps the model on CPU even when a
# GPU is present (the recorded run of this cell emitted exactly that warning).
# Use the first CUDA device when one is available, otherwise fall back to CPU (-1).
summarizer_device = 0 if torch.cuda.is_available() else -1
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=summarizer_device,
)

# The abstracts are hard-wrapped; the recorded summary contained the fused word
# "tovalidate" at a spot where the abstract has a line break between "to" and
# "validate". Collapsing every whitespace run to a single space before
# summarizing avoids feeding those breaks to the model.
# NOTE(review): assumes the breaks are real whitespace characters - confirm.
# `abstract` itself is left untouched for any later cell that reads it.
abstract_text = " ".join(abstract.split())

# Summarization. `truncation=True` cuts inputs longer than the model's maximum
# input length instead of failing on an unusually long abstract.
summarization_result = summarizer(abstract_text, truncation=True)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
# Display the generated summary: take the first item of the pipeline result and
# show its 'summary_text' entry as the cell output.
summarization_result[0]['summary_text']

"ReMe is a web-based framework designed to create AI chatbots that facilitate cognitive training. ReMe specifically targeting episodic memory tasks derived from personal life logs. Case studies demonstrate ReMe's effectiveness in engaging users through life recall and open-ended language puzzles. Despite promising results, further research is needed tovalidate training effectiveness."