In [None]:
from datasets import load_dataset, Dataset
import pandas as pd

# Load the Twitter sentiment dataset from the Hugging Face Hub
sentiment_dataset = load_dataset('carblacac/twitter-sentiment-analysis')

# Access the training and testing splits directly
train_dataset = sentiment_dataset['train']
test_dataset = sentiment_dataset['test']

# Convert each split to pandas and keep only a prefix of it:
# the first 30,000 training rows and the first 50 test rows.
# `.copy()` detaches each subset from the full frame, so columns added to it
# later are written to an independent DataFrame rather than to a slice
# (assigning into a slice is what raises pandas' SettingWithCopyWarning).
train_df = train_dataset.to_pandas().iloc[:30000].copy()
test_df = test_dataset.to_pandas().iloc[:50].copy()


In [2]:
# Two alternative prompt templates for the same binary sentiment task.
# Each has a `{tweet}` and a `{sentiment}` placeholder; the rendered string
# therefore contains both the prompt and the label.
instructions1 = """
Please read the tweet provided below. Your task is to analyze the content and context of the tweet to determine whether it 
expresses a positive or negative sentiment. Consider the use of emotive language, punctuation, and any emoticons used. 
Classify the tweet accordingly as either '1' if it conveys a favorable opinion or emotion, or '0' if it 
expresses an unfavorable opinion or emotion.
### Tweet: {tweet}
### Sentiment: {sentiment}
"""

instructions2 = """
Examine the following tweet carefully. Assess whether the tweet has a positive or negative tone based on the wording, 
sentence structure, and any emoticons present. If the tweet suggests a positive outlook or a pleasant emotional state, 
classify it as '1'. Conversely, if the tweet indicates dissatisfaction or unhappiness, classify it as '0'.
### Tweet: {tweet}
### Sentiment: {sentiment}
"""


def add_instructions(df):
    """Return ``df`` extended with both prompt templates rendered for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'text'`` column (substituted for ``{tweet}``) and a
        ``'feeling'`` column (substituted for ``{sentiment}``).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by
        ``'Instruction1'`` and ``'Instruction2'``. The input frame is left
        untouched, so the function is safe to call on a slice of a larger
        frame and safe to re-run.

    Raises
    ------
    KeyError
        If ``'text'`` or ``'feeling'`` is missing from ``df``.
    """
    # Materialise the (tweet, label) pairs once; both templates reuse them.
    pairs = list(zip(df['text'], df['feeling']))

    # Lists are assigned positionally, so the frame's index is preserved and
    # an empty frame simply yields two empty columns.
    return df.assign(
        Instruction1=[instructions1.format(tweet=tweet, sentiment=feeling) for tweet, feeling in pairs],
        Instruction2=[instructions2.format(tweet=tweet, sentiment=feeling) for tweet, feeling in pairs],
    )

# Render both prompt variants for the truncated train and test frames
# (train first, then test).
train_df, test_df = (add_instructions(frame) for frame in (train_df, test_df))


In [3]:
# Sanity check: show the first rendered prompt built from template 1
print(train_df['Instruction1'].iloc[0])


Please read the tweet provided below. Your task is to analyze the content and context of the tweet to determine whether it 
expresses a positive or negative sentiment. Consider the use of emotive language, punctuation, and any emoticons used. 
Classify the tweet accordingly as either '1' if it conveys a favorable opinion or emotion, or '0' if it 
expresses an unfavorable opinion or emotion.
### Tweet: @fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser
### Sentiment: 0



In [4]:
# Sanity check: show the first rendered prompt built from template 2
print(train_df['Instruction2'].iloc[0])


Examine the following tweet carefully. Assess whether the tweet has a positive or negative tone based on the wording, 
sentence structure, and any emoticons present. If the tweet suggests a positive outlook or a pleasant emotional state, 
classify it as '1'. Conversely, if the tweet indicates dissatisfaction or unhappiness, classify it as '0'.
### Tweet: @fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser
### Sentiment: 0



In [5]:
# Wrap the prompt-augmented training frame as a Hugging Face Dataset for the trainer
train_dset = Dataset.from_pandas(df=train_df)

Task 2: Fine-Tuning

In [6]:
import os
import json
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [7]:
# Hugging Face Hub id of the pretrained checkpoint that gets fine-tuned
base_model = "mistralai/Mistral-7B-v0.1"

# Directory the sentiment-only fine-tuned weights and tokenizer are saved to.
# The root defaults to the original cluster location but can be redirected
# with the NLP_HW3_OUTPUT_DIR environment variable, so the notebook also runs
# on machines where that absolute path does not exist.
new_model = os.path.join(
    os.environ.get("NLP_HW3_OUTPUT_DIR", "/work/gns938/nlp_hw3"),
    "Mistral-sentiment-fine-tuned",
)

In [8]:
# dtype used for computation on top of the 4-bit quantized weights
compute_dtype = torch.float16

# bitsandbytes settings: load weights in 4-bit NF4, no double quantization
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [9]:
# Load the base checkpoint with the quantization settings defined above.
# The {"": 0} device map assigns the whole model to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=quant_config,
)

# Config tweaks applied before training.
# NOTE(review): pretraining_tp looks like a Llama-specific setting carried
# over from a template — confirm it has any effect for Mistral.
model.config.pretraining_tp = 1
model.config.use_cache = False

Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.35s/it]


In [10]:
# Tokenizer matching the base checkpoint.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# repo; presumably unnecessary for this checkpoint — confirm before keeping.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [11]:
# LoRA adapter hyperparameters for causal-LM fine-tuning:
# rank 64, alpha 16, 10% dropout, bias terms left untouched.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [12]:
# Shared training configuration, reused by every SFTTrainer in this notebook.
training_params = TrainingArguments(
    # --- output, checkpointing and logging ---
    output_dir="./results",
    save_steps=25,
    logging_steps=25,  # previously tried: 100
    report_to="tensorboard",  # previously tried: "all"

    # --- schedule length ---
    num_train_epochs=1,
    max_steps=-1,

    # --- batching: 2 samples x 8 accumulation steps = 16 sequences per update ---
    per_device_train_batch_size=2,  # previously tried: 4
    gradient_accumulation_steps=8,  # previously tried: 16
    group_by_length=True,

    # --- optimizer and learning-rate schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # NOTE(review): with the plain "constant" scheduler, warmup_ratio appears
    # to be unused; "constant_with_warmup" would honor it — confirm intent.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,

    # --- mixed precision: fp16 on, bf16 off ---
    fp16=True,
    bf16=False,
)

In [None]:
# Supervised fine-tuning on the sentiment prompts only: each training example
# is the full rendered string in the "Instruction1" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_params,
    peft_config=peft_params,
    train_dataset=train_dset,
    dataset_text_field="Instruction1",
    packing=False,
    max_seq_length=None,  # no explicit limit given; the trainer's default applies
)

In [14]:
# Run the sentiment-only fine-tuning loop
trainer.train()

# Persist the trained model weights and the tokenizer into the same directory
trainer.model.save_pretrained(save_directory=new_model)
trainer.tokenizer.save_pretrained(save_directory=new_model)

Step,Training Loss
25,1.1921
50,0.4539
75,0.8279
100,0.4462
125,0.8258
150,0.4517
175,0.8078
200,0.4087
225,0.7801
250,0.4271


('/work/gns938/nlp_hw3/Mistral-sentiment-fine-tuned/tokenizer_config.json',
 '/work/gns938/nlp_hw3/Mistral-sentiment-fine-tuned/special_tokens_map.json',
 '/work/gns938/nlp_hw3/Mistral-sentiment-fine-tuned/tokenizer.json')

In [15]:
# Fetch the Alpaca instruction dataset and carve 2% of its 'train' split off
# as a held-out test set (fixed seed so the split is repeatable)
alpaca_dataset = load_dataset("tatsu-lab/alpaca")
split_datasets_alpaca = alpaca_dataset['train'].train_test_split(test_size=0.02, seed=42)

# Name the two halves of the split
train_dataset_alpaca = split_datasets_alpaca['train']
test_dataset_alpaca = split_datasets_alpaca['test']

# Convert to pandas, keeping the first 30,000 training rows and the first
# 50 test rows
train_df_alpaca = train_dataset_alpaca.to_pandas()[:30000]
test_df_alpaca = test_dataset_alpaca.to_pandas()[:50]

In [16]:
# Reshape the sentiment frame into the Alpaca column layout:
#   rendered prompt (Instruction1) -> 'text'
#   raw tweet                      -> 'instruction'
#   label                          -> 'output' (cast to str)
# and add an empty-string 'input' column, which the sentiment data lacks.
train_df_renamed = (
    train_df
    .rename(columns={
        'Instruction1': 'text',
        'text': 'instruction',
        'feeling': 'output'
    })
    .assign(
        output=lambda frame: frame['output'].astype(str),
        input='',
    )
)

# Keep only the Alpaca columns, in Alpaca order
alpaca_column_order = ['instruction', 'input', 'output', 'text']
train_df_restructured = train_df_renamed[alpaca_column_order]

# Stack Alpaca rows first and sentiment rows second, with a fresh 0..n-1 index
combined_train_dataset = pd.concat(
    [train_df_alpaca, train_df_restructured],
    ignore_index=True,
)

# Hugging Face Dataset view of the combined frame
combined_train_hf_dataset = Dataset.from_pandas(combined_train_dataset)

# Print the combined dataset to verify
# print(combined_train_dataset.head())


In [17]:
# Show the five rows on either side of the point where the Alpaca rows end and
# the sentiment rows begin. The boundary is derived from the Alpaca frame's
# length instead of being hard-coded, so it stays correct if the subset size
# changes (with 30,000 Alpaca rows this is rows 29995 up to 30005).
boundary = len(train_df_alpaca)
combined_train_dataset.iloc[max(boundary - 5, 0):boundary + 5]


Unnamed: 0,instruction,input,output,text
29995,Name three computer programs which can be used...,,Three computer programs which can be used to c...,Below is an instruction that describes a task....
29996,Write pseudocode for a program to calculate th...,,// Function to calculate average of numbers in...,Below is an instruction that describes a task....
29997,Name an alternative to using money as a currency.,,Bartering is an alternative to using money as ...,Below is an instruction that describes a task....
29998,Provide three examples of chemical reactions.,,1. Combustion: C8H18 + O2 --> CO2 + H2O\n2. Ac...,Below is an instruction that describes a task....
29999,Generate a funny icebreaker for a group of fri...,,What do you call an alligator in a vest? \nAns...,Below is an instruction that describes a task....
30000,@fa6ami86 so happy that salman won. btw the 1...,,0,\nPlease read the tweet provided below. Your t...
30001,@phantompoptart .......oops.... I guess I'm ki...,,0,\nPlease read the tweet provided below. Your t...
30002,@bradleyjp decidedly undecided. Depends on the...,,1,\nPlease read the tweet provided below. Your t...
30003,@Mountgrace lol i know! its so frustrating isn...,,1,\nPlease read the tweet provided below. Your t...
30004,@kathystover Didn't go much of any where - Lif...,,1,\nPlease read the tweet provided below. Your t...


In [18]:
# Hugging Face Dataset holding the combined Alpaca + sentiment training rows
train_dset_mixed = Dataset.from_pandas(df=combined_train_dataset)

In [None]:
# The sentiment-only trainer above was given `model` together with a
# peft_config, and PEFT attaches adapters to the model it is given in place.
# `model` is therefore no longer the pristine base checkpoint, and whether the
# previously trained adapter weights get reset on a second wrap depends on
# library internals. Reload the base model so the mixed run explicitly starts
# from the same state the sentiment-only run started from.
import gc

# Drop the references to the previous run and release its GPU memory before
# loading a second copy of the weights.
trainer = None
model = None
gc.collect()
torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Supervised fine-tuning on the combined Alpaca + sentiment data; each example
# is the full prompt string in the "text" column.
# NOTE(review): training_params still points at output_dir="./results", so the
# checkpoints of this run land in the same folder as the previous run's.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dset_mixed,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

In [20]:
# Directory the mixed (Alpaca + sentiment) fine-tuned weights and tokenizer
# are saved to. The root defaults to the original cluster location but can be
# redirected with the NLP_HW3_OUTPUT_DIR environment variable.
new_model_mixed = os.path.join(
    os.environ.get("NLP_HW3_OUTPUT_DIR", "/work/gns938/nlp_hw3"),
    "Mistral-mixed-fine-tuned",
)

In [21]:
# Run the mixed-data fine-tuning loop
trainer.train()

# Persist the trained model weights and the tokenizer into the same directory
trainer.model.save_pretrained(save_directory=new_model_mixed)
trainer.tokenizer.save_pretrained(save_directory=new_model_mixed)

Step,Training Loss
25,1.2241
50,0.7205
75,0.9083
100,0.6752
125,0.8579
150,0.6599
175,0.8523
200,0.6934
225,0.8795
250,0.6506


('/work/gns938/nlp_hw3/Mistral-mixed-fine-tuned/tokenizer_config.json',
 '/work/gns938/nlp_hw3/Mistral-mixed-fine-tuned/special_tokens_map.json',
 '/work/gns938/nlp_hw3/Mistral-mixed-fine-tuned/tokenizer.json')