In [None]:
import os
import urllib.request

In [None]:
# Fetch the dataset only when the vocabulary CSV is not already in the working directory.
if (not os.path.exists('tinystories_vocab.csv')):
  # Clone the TinyStories4Pascal dataset repository from Hugging Face.
  !git clone https://huggingface.co/datasets/schuler/TinyStories4Pascal
  # Unpack the tokenized stories and the vocabulary archives from the clone.
  !unzip TinyStories4Pascal/tinystories_tokenized_81.csv.zip
  !unzip TinyStories4Pascal/tinystories_vocab.csv.zip

In [None]:
# Install the Free Pascal compiler and sources, Lazarus, git and subversion through apt.
!apt-get update && apt-get -y install fpc fpc-source lazarus git subversion

In [None]:
# Check out the CAI "lazarus" trunk from SourceForge into the neural-api directory.
!svn checkout https://svn.code.sf.net/p/cai/svncode/trunk/lazarus neural-api

In [None]:
# Check out the multithreadprocs component from lazarus-ccr into the mtprocs directory.
!svn checkout https://svn.code.sf.net/p/lazarus-ccr/svn/components/multithreadprocs mtprocs

In [None]:
# Build the multithreadprocs Lazarus package with lazbuild.
!lazbuild mtprocs/multithreadprocslaz.lpk

In [None]:
code = """
program smallTransformer;
(*
Copyright (C) 2024 Joao Paulo Schwarz Schuler

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*)

{$mode objfpc}{$H+}

uses {$IFDEF UNIX} {$IFDEF UseCThreads}
  cthreads, {$ENDIF} {$ENDIF}
  Classes,
  neuralnetwork,
  neuralvolume,
  neuralfit,
  neuralthread,
  neuraldatasets,
  neuralab,
  CustApp,
  Math,
  sysutils;

const
  // Upper bound of the context window: DoRun assigns it to FMaxPredictCharPos
  // and never lets LocalContextSize grow beyond it.
  csContextLen = 80;
  // CSV file with the tokenized stories; LoadDataset filters it into temp.csv.
  csTrainingFileName = 'tinystories_tokenized_81.csv';
  // Vocabulary file that LoadDataset loads into FDictionary.
  csVocabFileName = 'tinystories_vocab.csv';
  // Not referenced anywhere in this program.
  // NOTE(review): the value is 3 although the remark says 1 token - confirm intent.
  csMinSampleSize = 3; // Minimum of 1 token.
  // Embedding dimension passed to CreateCaiTransformerDecoder.
  csEmbedDim = 128;
  // Vocabulary size of the network. LoadDataset passes csModelVocabSize-1 as
  // the maximum to FilterCSVWithNumbersUpToMax.
  csModelVocabSize = 3000;

type
  // Console application that loads the tokenized dataset, builds a transformer
  // decoder and trains it with TNeuralDataLoadingFit (see DoRun).
  TTestFitLoading = class(TCustomApplication)
  protected
    // Rows of integer tokens loaded from temp.csv by LoadDataset.
    FDataset: array of array of integer;
    // Vocabulary loaded from csVocabFileName; used to print generated text.
    FDictionary: TStringListInt;
    // Number of rows in FDataset (set in LoadDataset).
    FDatasetSize: integer;
    // Network under training; replaced by a rebuilt copy after every fit in DoRun.
    FNN: THistoricalNets;
    // Fitting object; DoRun creates and frees one per loop iteration.
    NFit: TNeuralDataLoadingFit;
    // Sampler created in DoRun and used by OnAfterEpoch for the 'billy' prompt.
    FSampler: TNNetSamplerBase;
    // Upper limit applied to the sample length in GetTrainingPair.
    FMaxPredictCharPos: integer;
    // Number of entries in FDictionary (set in LoadDataset); not read elsewhere
    // in this program.
    FVocabSize: integer;
    // Loads the vocabulary and the dataset from disk.
    procedure LoadDataset;
    // Builds the network and runs the training loop.
    procedure DoRun; override;
  public
    // Prints text generated from a few fixed prompts.
    procedure OnAfterEpoch(Sender: TObject);
    // Step callback; the body is currently empty.
    procedure OnAfterStep(Sender: TObject);
    // Callbacks handed to FitLoading; each one fills pInput/pOutput with one sample.
    procedure GetTrainingPair(Idx: integer; ThreadId: integer; pInput, pOutput: TNNetVolume);
    procedure GetValidationPair(Idx: integer; ThreadId: integer; pInput, pOutput: TNNetVolume);
    procedure GetTestPair(Idx: integer; ThreadId: integer; pInput, pOutput: TNNetVolume);
  end;

  // Reads the vocabulary into FDictionary, prints a few lookups, then filters
  // the tokenized CSV into a temporary file and loads that file into FDataset.
  procedure TTestFitLoading.LoadDataset;
  const
    cFilteredFileName = 'temp.csv';
  var
    ProbeTokens: array of integer;
  begin
    // Vocabulary: load, report the size and sort.
    WriteLn('Loading vocabulary: ', csVocabFileName);
    FDictionary.LoadFromFile(csVocabFileName);
    WriteLn('Indexing vocabulary with ', FDictionary.Count, ' words.');
    FDictionary.SaveCurrentPositionAndSort();

    // Print both lookup directions for three words.
    FDictionary.StringToIntegerArray('one day a', ProbeTokens);
    WriteLn('one day a: ', ProbeTokens[0], ' ', ProbeTokens[1], ' ', ProbeTokens[2]);
    WriteLn('Dic 93: ', FDictionary.IntegerToWord(93));
    WriteLn('Dic 88: ', FDictionary.IntegerToWord(88));
    WriteLn('Dic 40: ', FDictionary.IntegerToWord(40));

    // Dataset: filter with csModelVocabSize-1 as the maximum, then load the result.
    FilterCSVWithNumbersUpToMax(csTrainingFileName, cFilteredFileName, csModelVocabSize - 1);
    LoadIntegersInCSV(cFilteredFileName, FDataset);

    FDatasetSize := Length(FDataset);
    FVocabSize := FDictionary.Count;

    WriteLn('Loaded dataset with ', FDatasetSize, ' rows');
  end;

  // Builds a new network: input of pContextSize tokens, token and positional
  // embedding, a pointwise projection to 768 channels, pLayers CAI transformer
  // blocks and a pointwise softmax over pVocabSize outputs.
  // The caller owns the returned object.
  function CreateCaiTransformerDecoder(pVocabSize, pEmbedDim, pLayers, pContextSize: integer): THistoricalNets;
  const
    cWidth = 768; // channel count of the projection that follows the embedding
  var
    BlocksLeft: integer;
  begin
    Result := THistoricalNets.Create();
    // Input stage.
    Result.AddLayer([
      TNNetInput.Create(pContextSize, 1, 1),
      TNNetTokenAndPositionalEmbedding.Create(pVocabSize, pEmbedDim),
      TNNetPointwiseConvLinear.Create(cWidth),
      TNNetSignedSquareRoot1.Create()
    ]);
    // Stack of transformer blocks; nothing is added when pLayers <= 0.
    BlocksLeft := pLayers;
    while BlocksLeft > 0 do
    begin
      Result.AddTransformerBlockCAI(12, 4 * cWidth, true, true, false);
      Dec(BlocksLeft);
    end;
    // Output stage.
    Result.AddLayer([
      TNNetPointwiseConvLinear.Create(pVocabSize, 1),
      TNNetPointwiseSoftMax.Create(1)
    ]);
  end;

  // Loads the data, builds the decoder and trains it in an endless loop.
  // Each iteration runs a one-epoch fit, prints generated text, doubles the
  // context size (capped at csContextLen) and rebuilds the network with the
  // trained weights copied over.
  //
  // FNN, Opt and NFit are each created exactly once before use. Creating them
  // up front and then overwriting the references would leak the first
  // instances, because these objects are not reference counted.
  procedure TTestFitLoading.DoRun;
  var
    Opt: TNeuralOptimizerAdam;
    NNAux: THistoricalNets;
    LocalContextSize: integer;
  begin
    FDictionary := TStringListInt.Create();
    LoadDataset();
    FMaxPredictCharPos := csContextLen;
    FSampler := TNNetSamplerTopP.Create(0.4);

    // Training starts with a context of 40 tokens.
    LocalContextSize := 40;
    FNN := CreateCaiTransformerDecoder(csModelVocabSize, csEmbedDim, 12, LocalContextSize);

    DebugThreadCount();
    FNN.DebugStructure;
    FNN.DebugWeights();

    // The loop has no exit condition: training continues until the process is
    // stopped from outside.
    while ( true ) do // LocalContextSize < csContextLen
    begin
      WriteLn('Computing...');
      // A fresh optimizer and fitting object are used for every iteration and
      // released at the end of it.
      Opt := TNeuralOptimizerAdam.Create(0.9, 0.98);
      NFit := TNeuralDataLoadingFit.Create();
      NFit.InitialLearningRate := 0.0001;
      NFit.LogEveryBatches := 100;
      NFit.Optimizer := Opt;
      NFit.SaveBest := SaveBestLoss;
      NFit.LearningRateDecay := 0.00;
      NFit.L2Decay := 0;
      NFit.EnableMultiClassLoss();
      NFit.EnableClassComparisonInLastPixel();
      NFit.AvgWeightEpochCount := 1;
      NFit.OnAfterEpoch := @OnAfterEpoch;
      NFit.OnAfterStep := @OnAfterStep;
      NFit.TargetAccuracy := 0.2;

      NFit.FitLoading(
        FNN,
        {TrainingVolumesCount=}48000*1,
        {ValidationVolumesCount=}48000*1 div 20,
        {TestVolumesCount=}48000*1 div 20,
        {batchsize=}32,
        {epochs=}1,
        @GetTrainingPair, @GetValidationPair, @GetTestPair
      );
      // Print samples once more while NFit is still alive (OnAfterEpoch reads NFit.NN).
      OnAfterEpoch(Self);
      NFit.Free;
      Opt.Free;
      LocalContextSize := Min(LocalContextSize * 2, csContextLen);
      WriteLn('Context size is: ',LocalContextSize,'.');

      // Rebuild the network for the new context size and carry the weights over.
      NNAux := CreateCaiTransformerDecoder(csModelVocabSize, csEmbedDim, 12, LocalContextSize);
      NNAux.CopyWeights(FNN);
      FNN.Free;
      FNN := NNAux;
    end;
    // Not reached while the loop above has no exit; kept so that adding an
    // exit condition releases everything.
    FSampler.Free;
    FNN.Free;
    FDictionary.Free;
    Terminate;
  end;

  // Prints one generated line per prompt using the network held by NFit.
  // The first four prompts are generated with a nil sampler, the last one
  // with FSampler.
  procedure TTestFitLoading.OnAfterEpoch(Sender: TObject);
  const
    cPromptsWithoutSampler: array[0..3] of string =
      ('one day', 'once upon a', 'once upon a time', 'once upon');
  var
    PromptIdx: integer;
  begin
    for PromptIdx := Low(cPromptsWithoutSampler) to High(cPromptsWithoutSampler) do
      WriteLn(GenerateStringFromCasualNN(NFit.NN, FDictionary, cPromptsWithoutSampler[PromptIdx], nil), '.');
    WriteLn(GenerateStringFromCasualNN(NFit.NN, FDictionary, 'billy', FSampler), '.');
  end;

  // Step callback assigned to NFit.OnAfterStep in DoRun. It currently does
  // nothing; the commented-out lines are optional debugging aids.
  procedure TTestFitLoading.OnAfterStep(Sender: TObject);
  begin
    // Uncomment to print generated samples on roughly one step in a hundred.
    //if Random(100)=0 then OnAfterEpoch(Sender);
    // Uncomment to dump the weights of the first thread's network.
    //NFit.ThreadNN[0].DebugWeights();
  end;

  // Fills pInput/pOutput with one training sample taken from a randomly
  // chosen dataset row. Idx and ThreadId are not used.
  // pInput receives the first CutPos tokens of the row; pOutput is the
  // one-hot encoding of the tokens copied from index 1 (the same row shifted
  // by one position).
  procedure TTestFitLoading.GetTrainingPair(Idx: integer; ThreadId: integer;
    pInput, pOutput: TNNetVolume);
  var
    RowIdx: integer;
    CutPos: integer;
    TagToken: integer;
    Row: array of integer;
    Slice: array of integer;
  begin
    // Resize the caller's volumes whenever they do not match the network.
    if pInput.Size <> FNN.GetFirstLayer().Output.Size then
      pInput.ReSize(FNN.GetFirstLayer().Output);
    if pOutput.Size <> FNN.GetLastLayer().Output.Size then
      pOutput.ReSize(FNN.GetLastLayer().Output);

    // Pick a random row; dynamic array assignment shares the data, no copy is made.
    RowIdx := Random(FDatasetSize);
    Row := FDataset[RowIdx];

    // Limit the sample to the row length, the input width and FMaxPredictCharPos.
    CutPos := Min(FMaxPredictCharPos, Min(Length(Row), pInput.SizeX));

    // Token stored in the output Tag: the last token that goes into the input.
    TagToken := Row[CutPos - 1];

    // Input volume: zero it, then write the leading tokens.
    Slice := Copy(Row, 0, CutPos);
    pInput.Fill(0);
    pInput.CopyNoChecksIntArr(Slice);

    // Output volume: one-hot encoding of the tokens starting at index 1.
    Slice := Copy(Row, 1, CutPos);
    pOutput.OneHotEncoding(Slice);
    pOutput.Tag := TagToken;
  end;

  // Fills pInput/pOutput with the validation sample selected by Idx.
  // The row is chosen deterministically as (Idx * 20) mod FDatasetSize.
  // Unlike GetTrainingPair, the sample length is not limited by
  // FMaxPredictCharPos. ThreadId is not used.
  procedure TTestFitLoading.GetValidationPair(Idx: integer; ThreadId: integer;
    pInput, pOutput: TNNetVolume);
  var
    SampleId: integer;
    SampleLen: integer;
    SampleCutPosition: integer;
    ExpectedTokenInt: integer;
    AIntegerArray: array of integer;
  begin
    // Make sure that expected input and output have the proper sizes.
    if FNN.GetFirstLayer().Output.Size <> pInput.Size then pInput.ReSize(FNN.GetFirstLayer().Output);
    if FNN.GetLastLayer().Output.Size <> pOutput.Size then pOutput.ReSize(FNN.GetLastLayer().Output);
    // Get the input sample
    SampleId := (Idx * 20) mod FDatasetSize; // Min(FDatasetSize,48000);
    SampleLen := Min(Length(FDataset[SampleId]), pInput.SizeX);
    SampleCutPosition := SampleLen;
    // Token stored in the output Tag: the one at index SampleCutPosition-1,
    // which is the last token copied into the input below.
    ExpectedTokenInt := FDataset[SampleId][SampleCutPosition-1];
    // Encode the input volume with the first SampleCutPosition tokens.
    AIntegerArray := Copy(FDataset[SampleId], 0, SampleCutPosition);
    pInput.Fill(0);
    pInput.CopyNoChecksIntArr( AIntegerArray );
    // Encode the output volume: the same row copied from index 1 (shifted by one).
    AIntegerArray := Copy(FDataset[SampleId], 1, SampleCutPosition);
    pOutput.OneHotEncoding(AIntegerArray) ;
    pOutput.Tag := ExpectedTokenInt;
  end;

  // Test samples are produced exactly like validation samples: the call is
  // forwarded unchanged to GetValidationPair.
  procedure TTestFitLoading.GetTestPair(Idx: integer; ThreadId: integer;
    pInput, pOutput: TNNetVolume);
  begin
    GetValidationPair(Idx, ThreadId, pInput, pOutput);
  end;

var
  Application: TTestFitLoading;
begin
  // Program entry point: create the application object, run it and release it.
  Application := TTestFitLoading.Create(nil);
  // try..finally releases the application object even when Run raises.
  try
    Application.Title:='Nano Covolutional Based NLP Trained from File';
    Application.Run;
  finally
    Application.Free;
  end;
end.
"""
# Overwrite the CaiOptimizedDenseNet example's .lpr with the Pascal program held in `code`;
# the build below uses the .lpi project file located in the same example directory.
with open("neural-api/examples/CaiOptimizedDenseNet/CaiOptimizedDenseNet.lpr", "w") as text_file:
    text_file.write(code)
# Build the project with lazbuild, then list the binary to confirm that it exists.
!lazbuild neural-api/examples/CaiOptimizedDenseNet/CaiOptimizedDenseNet.lpi
!ls -l neural-api/bin/x86_64-linux/bin/CaiOptimizedDenseNet

In [None]:
!neural-api/bin/x86_64-linux/bin/CaiOptimizedDenseNet

Loading vocabulary: tinystories_vocab.csv
Indexing vocabulary with 21438 words.
one day a: 93 88 40
Dic 93: one
Dic 88: day
Dic 40: a
Loaded dataset with 945153 rows
CPU threads reported by the operating system: 8.
 Layers: 1374
 Neurons:86713
 Weights:87720960 Weight Sum: -218.199295 Bias sum:    0.000000 Inertia sum:    0.000000 Delta sum:    0.000000
Has AVX: TRUE Has AVX2: FALSE Has AVX512: FALSE
Layer  0 Neurons:   0 Weights:     0 TNNetInput(40,1,1,0,0) Output:40,1,1 Learning Rate:0.0100 Inertia:0.90 Weight Sum:  0.0000 Bias Sum:  0.0000 Branches:1
Layer  1 Neurons:   1 Weights:384000 TNNetTokenAndPositionalEmbedding(3000,128,0,10000,0) Output:40,1,128 Learning Rate:0.0100 Inertia:0.90 Weight Sum: -1.9365 Bias Sum:  0.0000 Parent:0 Branches:1
Layer  2 Neurons: 768 Weights: 98304 TNNetPointwiseConvLinear(768,1,0,1,0) Output:40,1,768 Learning Rate:0.0100 Inertia:0.90 Weight Sum:-17.8693 Bias Sum:  0.0000 Parent:1 Branches:1
Layer  3 Neurons:   0 Weights:     0 TNNetSignedSquareRoot

In [None]:
# Print autosave.csv - presumably the training log written during the fit; TODO confirm.
!cat autosave.csv