In [None]:
import torch

# Pick the compute device for the rest of the notebook.
# `torch.has_mps` is deprecated and only reports whether this PyTorch build was
# compiled with MPS support; `torch.backends.mps.is_available()` additionally
# checks that the MPS backend can actually be used on this machine, so we never
# select an 'mps' device that would fail on first use.
mps_available = torch.backends.mps.is_available()

if mps_available:
    print("Congratulations, you have GPU support for PyTorch! \U0001F389")
else:
    print("Sorry, it looks like something isn't working right with PyTorch GPU support")

# Fall back to the CPU when MPS is unavailable; the bare `device` expression
# below makes the notebook display the chosen device as the cell output.
device = torch.device('mps' if mps_available else 'cpu')
device

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Columns used as model inputs, in the order they are written to the text files.
input_column_names = [
    'NE01_AHU7_RESET_POLL_TL',
    'NE01_AHU7_HCV_POLL_TL',
    'NE01_AHU7_HC_SWT_POLL_TL',
    'NE01_AHU7_HC_RWT_POLL_TL',
    'NE01_AHU7_MAD_FB_POLL_TL',
    'NE01_AHU7_HC_SAT_POLL_TL',
    'NE01_AHU7_MAT_POLL_TL',
    'NE01_AHU7_RAT_POLL_TL',
    'NE01_AHU7_SF_SPD_POLL_TL',
    'NE01_AHU7_EF_SPD_POLL_TL',
    'NE01_AHU5_OAT_GV_POLL_TL',
]

# Columns treated as outputs: VAV4_1_RT_TL through VAV4_7_RT_TL.
output_column_names = [f'VAV4_{index}_RT_TL' for index in range(1, 8)]

# Inputs first, then outputs -- this is the column order of every exported row.
all_column_names = [*input_column_names, *output_column_names]

In [None]:
# Read 'pulled_data.csv' (resolved relative to the notebook's working directory)
# into a DataFrame; later cells split it into train/test parts and select
# columns from it by name.
dataframe = pd.read_csv('pulled_data.csv')

In [None]:
# Hold out 20% of the rows for evaluation. Shuffling is disabled, so the
# original row order is preserved: the leading rows become the training set
# and the trailing rows become the test set.
train_dataframe, test_dataframe = train_test_split(
    dataframe,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# Dump the selected columns of each split as plain comma-separated text.
# The index and header row are both omitted, so every line of the output file
# holds only the values of one row, in `all_column_names` order.
for split_frame, split_path in (
    (train_dataframe, 'train_dataset.txt'),
    (test_dataframe, 'test_dataset.txt'),
):
    split_frame[all_column_names].to_csv(split_path, index=False, header=False)

In [None]:
from transformers import AutoTokenizer

# Locations of the text files produced by the export cell.
train_path, test_path = 'train_dataset.txt', 'test_dataset.txt'

# Tokenizer belonging to the pretrained 'gpt2' checkpoint; the same checkpoint
# name is used when the model itself is loaded further down.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test datasets and the collator for causal-LM fine-tuning.

    Args:
        train_path: Path of the text file used for training.
        test_path: Path of the text file used for evaluation.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: Value forwarded to ``TextDataset(block_size=...)`` for both
            datasets. Defaults to 128, the value that was previously
            hard-coded.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    # NOTE(review): TextDataset is deprecated in transformers in favour of the
    # `datasets` library; it is kept here to avoid adding a new dependency.
    def _build_dataset(file_path):
        # Single construction path so both splits are guaranteed to use the
        # same tokenizer and block size.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
        )

    train_dataset = _build_dataset(train_path)
    test_dataset = _build_dataset(test_path)

    # mlm=False turns off masked-language-model masking in the collator, which
    # is the setting used for causal models such as GPT-2.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator

# Use the `train_path` / `test_path` variables defined next to the tokenizer
# rather than repeating the file names, so the paths live in a single place.
train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)

In [None]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

import inspect

from transformers import AutoModelForCausalLM

# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is its replacement
# for causal language models such as GPT-2 and loads the same checkpoint.
model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)
print(model.device)

# `eval_steps` is only honoured when the evaluation strategy is 'steps'; the
# default strategy is 'no', which would leave `eval_dataset` unused during
# training. The keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, so use whichever name the
# installed version accepts.
eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./gpt2-hvac',        # checkpoints and the final model are written here
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_steps=400,                  # evaluate every 400 optimizer steps
    save_steps=800,                  # checkpoint every 800 optimizer steps
    warmup_steps=500,
    prediction_loss_only=True,       # evaluation reports the loss only
    log_level='debug',
    **{eval_strategy_arg: 'steps'},
)

# Show which device the Trainer resolved, for comparison with `model.device`.
print(training_args.device)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Run the fine-tuning loop using the settings held by `trainer` (epochs, batch
# sizes, warmup and checkpoint cadence come from its TrainingArguments). As the
# last expression of the cell, the value returned by train() is displayed.
trainer.train()

In [None]:
# Persist the fine-tuned model. No path is passed, so the model is expected to
# be written to the TrainingArguments `output_dir` ('./gpt2-hvac') -- confirm
# against the Trainer.save_model documentation for the installed version.
trainer.save_model()