In [None]:
# Install the Free Pascal compiler (fpc) with its sources, the Lazarus tooling
# (provides the `lazbuild` command used further down) and the git/subversion clients.
!apt-get -y install fpc fpc-source lazarus git subversion

In [None]:
# Check out the Lazarus tree of the CAI project from SourceForge into ./neural-api.
!svn checkout https://svn.code.sf.net/p/cai/svncode/trunk/lazarus neural-api

In [None]:
# Check out the multithreadprocs component from the lazarus-ccr repository into ./mtprocs.
!svn checkout https://svn.code.sf.net/p/lazarus-ccr/svn/components/multithreadprocs mtprocs

In [None]:
# Build the multithreadprocslaz package with lazbuild.
# NOTE(review): presumably a prerequisite of the neural-api project built later - confirm.
!lazbuild mtprocs/multithreadprocslaz.lpk

In [None]:
!pip install datasets

In [None]:
import os
import urllib.request
from datasets import load_dataset

In [None]:
# Load the "train" split of the roneneldan/TinyStories dataset.
# NOTE: the variable is named `wikids` although it holds TinyStories rows;
# later cells refer to it by this name.
wikids = load_dataset("roneneldan/TinyStories", split="train")

In [None]:
# Peek at one raw story (row 1) to see what the 'text' field looks like before cleaning.
wikids[1]['text']

'Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.\n\nOne day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were falling. Beep liked how the leaves fall and wanted to play with them. Beep drove under the tree and watched the leaves fall on him. He laughed and beeped his horn.\n\nBeep played with the falling leaves all day. When it was time to go home, Beep knew he needed more fuel. He went to the fuel place and got more healthy fuel. Now, Beep was ready to go fast and play again the next day. And Beep lived happily ever after.'

In [None]:
# Cleaned stories must be longer than this many characters to be kept.
MIN_TRAINING_SEQ_LEN = 20


def clean_story(text):
    """Return `text` as a single line of ASCII characters with tidy spacing.

    The story is stripped of surrounding spaces/quotes, line breaks and tabs
    are turned into spaces, '<unk>' markers and every character outside the
    code range 32..127 are dropped, runs of spaces are collapsed and the space
    in front of punctuation marks is removed.
    """
    text = text.strip(" '\"")
    # Line breaks separate sentences and paragraphs. Deleting them (as the
    # character filter below would do) glues the neighbouring words together,
    # e.g. "strong.\n\nOne day" -> "strong.One day". Keep a space instead.
    for separator in ('\r\n', '\n', '\r', '\t'):
        text = text.replace(separator, ' ')
    text = text.replace('<unk>', '')
    # Keep character codes 32..127 only; the Pascal trainer uses a 128 entry
    # character vocabulary.
    text = ''.join(ch for ch in text if 31 < ord(ch) < 128)
    # Collapse spaces only now: the removals above can leave several spaces in
    # a row, and one replace() pass would not reduce runs of three or more.
    while '  ' in text:
        text = text.replace('  ', ' ')
    # Remove the blank that precedes punctuation marks and apostrophes.
    for spaced, tight in ((' .', '.'), (' ,', ','), (' !', '!'), (' ?', '?'),
                          (' ;', ';'), (' :', ':'), (" '", "'")):
        text = text.replace(spaced, tight)
    return text.strip(' ')


prepared_ds = []
row_cnt = 0  # stays 0 when the dataset is empty
for row_cnt, ds_row in enumerate(wikids, start=1):
    new_row = clean_story(ds_row['text'])
    if len(new_row) > MIN_TRAINING_SEQ_LEN:
        prepared_ds.append(new_row)
    # Progress report every 100000 input rows.
    if row_cnt % 100000 == 0:
        print(len(prepared_ds), "loaded rows.")
print("We have", len(prepared_ds), "strings in the dataset out of a total of", row_cnt, '.')

99983 loaded rows.
199955 loaded rows.
299954 loaded rows.
399946 loaded rows.
499945 loaded rows.
599945 loaded rows.
699935 loaded rows.
799935 loaded rows.
899934 loaded rows.
999929 loaded rows.
1099908 loaded rows.
1199892 loaded rows.
1299892 loaded rows.
1399870 loaded rows.
1499833 loaded rows.
1599832 loaded rows.
1699819 loaded rows.
1799819 loaded rows.
1899818 loaded rows.
1999789 loaded rows.
2099789 loaded rows.
We have 2119489 strings in the dataset out of a total of 2119719 .


In [None]:
# Reduced copies of the cleaned dataset: the first rows of `prepared_ds`,
# useful for quick experiments that do not need the full corpus.
SMALL_DS_ROWS = 600
MEDIUM_DS_ROWS = 60000
small_ds = prepared_ds[:SMALL_DS_ROWS]
medium_ds = prepared_ds[:MEDIUM_DS_ROWS]

In [None]:
!rm *.txt

rm: cannot remove '*.txt': No such file or directory


In [None]:
def save_dataset(dataset, filename):
    """Write every item of `dataset` to `filename`, one item per line.

    The file is created or truncated. Each item is rendered with the `%s`
    format and followed by a newline character.
    """
    rendered_lines = ("%s\n" % item for item in dataset)
    with open(filename, 'w') as out_file:
        out_file.writelines(rendered_lines)

# Write the three dataset variants to disk, smallest first.
for rows, out_name in (
    (small_ds, 'tinystories_small.txt'),
    (medium_ds, 'tinystories_medium_ds.txt'),
    (prepared_ds, 'tinystories.txt'),
):
    save_dataset(rows, out_name)

In [None]:
!zip -r tinystories_small.zip tinystories_small.txt
!zip -r tinystories_medium_ds.zip tinystories_medium_ds.txt
!zip -r tinystories.zip tinystories.txt

  adding: tinystories_small.txt (deflated 68%)
  adding: tinystories_medium_ds.txt (deflated 70%)
  adding: tinystories.txt (deflated 69%)


In [None]:
code = """
program SimpleNLP;
(*
Copyright (C) 2023 Joao Paulo Schwarz Schuler

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*)

{$mode objfpc}{$H+}

uses {$IFDEF UNIX} {$IFDEF UseCThreads}
  cthreads, {$ENDIF} {$ENDIF}
  Classes,
  neuralnetwork,
  neuralvolume,
  neuralfit,
  neuralthread,
  CustApp,
  Math;

const
  // Width of the network input (see TNNetInput.Create in DoRun); also the
  // upper bound that OnAfterEpoch applies to FMaxPredictCharPos.
  csContextLen = 81;
  // Text file read by LoadDataset; every line becomes one dataset row.
  csTrainingFileName = 'tinystories.txt';
  csVocabSize  = 128; // Character based vocabulary/dictionary.
  // Rows shorter than this are dropped by LoadDataset; it is also the lower
  // bound of FMaxPredictCharPos.
  csMinSampleSize = 3; // Minimum of 3 characters.

type

  { TTestFitLoading }

  // Console application that loads a text dataset, trains a character based
  // next-character predictor with TNeuralDataLoadingFit and prints generated
  // text after every epoch.
  TTestFitLoading = class(TCustomApplication)
  protected
    // Dataset rows; filled and normalized by LoadDataset.
    FDataset: TStringList;
    // Number of rows in FDataset after LoadDataset has filtered it.
    FDatasetSize: integer;
    // The neural network built in DoRun.
    FNN: TNNet;
    // Fitting object that drives the training in DoRun.
    NFit: TNeuralDataLoadingFit;
    // Sampler passed to GenerateStringFromChars in OnAfterEpoch.
    FSampler: TNNetSamplerBase;
    // Upper limit for the sample length used by GetTrainingPair; adjusted by
    // OnAfterEpoch between csMinSampleSize and csContextLen.
    FMaxPredictCharPos: integer;
    procedure LoadDataset;
    procedure DoRun; override;
  public
    // Prints generated text and updates FMaxPredictCharPos.
    procedure OnAfterEpoch(Sender: TObject);
    // Callbacks handed to NFit.FitLoading: each one fills pInput/pOutput with
    // one (input, expected output) pair.
    procedure GetTrainingPair(Idx: integer; ThreadId: integer; pInput, pOutput: TNNetVolume);
    procedure GetValidationPair(Idx: integer; ThreadId: integer; pInput, pOutput: TNNetVolume);
    procedure GetTestPair(Idx: integer; ThreadId: integer; pInput, pOutput: TNNetVolume);
  end;

  // Reads csTrainingFileName into FDataset, drops the rows that are shorter
  // than csMinSampleSize, converts the remaining rows to lower case with a
  // trailing chr(1) and stores the final row count in FDatasetSize.
  procedure TTestFitLoading.LoadDataset;
  var
    RowIdx: integer;
  begin
    FDataset.LoadFromFile(csTrainingFileName);
    // Walk backwards so that Delete never shifts a row that is still to be
    // visited.
    for RowIdx := FDataset.Count-1 downto 0 do
    begin
      if Length(FDataset[RowIdx]) < csMinSampleSize then
        // Too short to be used as a sample.
        FDataset.Delete(RowIdx)
      else
        // Lower case text followed by chr(1).
        FDataset[RowIdx] := LowerCase(FDataset[RowIdx]) + chr(1);
    end;
    FDatasetSize := FDataset.Count;
    WriteLn('Loaded dataset with ', FDatasetSize, ' rows');
  end;

  // Application body: loads the dataset, builds the network, trains it with
  // NFit.FitLoading, prints the weights and a final generation test, then
  // frees every object created here and terminates the application.
  procedure TTestFitLoading.DoRun;
  begin
    FDataset := TStringList.Create();
    LoadDataset();
    FNN := TNNet.Create();
    NFit := TNeuralDataLoadingFit.Create();
    // Training starts with the shortest allowed sample length; OnAfterEpoch
    // moves this limit up or down each time it is called.
    FMaxPredictCharPos := csMinSampleSize;
    // Sampler handed to GenerateStringFromChars in OnAfterEpoch.
    FSampler := TNNetSamplerTopP.Create(0.4);
    // The input volume is csContextLen x 1 x csVocabSize.
    // NOTE(review): the 27 passed to TNNetMaxPoolWithPosition presumably
    // equals csContextLen div 3 (the width left after TNNetMaxPool(3)) -
    // confirm before changing csContextLen.
    FNN.AddLayer([
      TNNetInput.Create(csContextLen, 1, csVocabSize),
      TNNetPointwiseConv.Create(32,1),
      TNNetPadXY.Create(1,0),
      TNNetConvolutionReLU.Create(64,3,0,1,1),
      TNNetMaxPool.Create(3),
      TNNetPadXY.Create(1,0),
      TNNetConvolutionReLU.Create(128*3,3,0,1,1),
      TNNetPointwiseConvReLU.Create(1024,0),
      TNNetMaxPoolWithPosition.Create(27,27,0,1,0),
      TNNetPointwiseConvReLU.Create(1024),
      TNNetPointwiseConvReLU.Create(128),
      TNNetFullConnectLinear.Create(csVocabSize),
      TNNetSoftMax.Create()
    ]);
    DebugThreadCount();
    FNN.DebugStructure;

    WriteLn('Computing...');
    NFit.MaxThreadNum := 32;
    NFit.LogEveryBatches := 100;
    NFit.InitialLearningRate := 0.0001;
    NFit.LearningRateDecay := 0;
    NFit.L2Decay := 0;
    NFit.EnableClassComparison();
    NFit.EnableDefaultLoss();
    NFit.AvgWeightEpochCount := 1;
    NFit.OnAfterEpoch := @OnAfterEpoch;
    // Validation and test use one twentieth of the number of training volumes.
    NFit.FitLoading(
      FNN,
      {TrainingVolumesCount=}32000*3,
      {ValidationVolumesCount=}32000*3 div 20,
      {TestVolumesCount=}32000*3 div 20,
      {batchsize=}320,
      {epochs=}500,
      @GetTrainingPair, @GetValidationPair, @GetTestPair
    );
    FNN.DebugWeights();
    // One more generation test after training; note that this call also
    // updates FMaxPredictCharPos.
    OnAfterEpoch(Self);
    FSampler.Free;
    NFit.Free;
    FNN.Free;
    FDataset.Free;
    Terminate;
  end;

  // Registered as NFit.OnAfterEpoch and also called once by DoRun after
  // training. Prints text generated from three fixed prompts, then shortens
  // the training sample limit when the training accuracy is below 0.5 and
  // lengthens it otherwise (kept within csMinSampleSize..csContextLen).
  procedure TTestFitLoading.OnAfterEpoch(Sender: TObject);
  const
    csTestPrompts: array[0..2] of string = ('once', 'one ', 'once upon ');
  var
    PromptIdx: integer;
  begin
    WriteLn('Testing.');
    for PromptIdx := Low(csTestPrompts) to High(csTestPrompts) do
      WriteLn(GenerateStringFromChars(NFit.NN, csTestPrompts[PromptIdx], FSampler),'.');
    if NFit.TrainingAccuracy < 0.5 then
      FMaxPredictCharPos := Max(FMaxPredictCharPos-1, csMinSampleSize)
    else
      FMaxPredictCharPos := Min(FMaxPredictCharPos+1, csContextLen);
    WriteLn('Max prediction pos is: ', FMaxPredictCharPos);
  end;

  // Fills pInput/pOutput with one random training pair: a randomly chosen
  // dataset row is cut at a random position, the text before the cut becomes
  // the input and the character right after the cut is the expected class.
  // Idx and ThreadId are not used.
  procedure TTestFitLoading.GetTrainingPair(Idx: integer; ThreadId: integer;
    pInput, pOutput: TNNetVolume);
  var
    RowIdx: integer;
    RowText: string;
    UsableLen: integer;
    CutPos: integer;
    TargetClass: integer;
  begin
    // Give both volumes the sizes of the first and last network layers.
    if pInput.Size <> FNN.GetFirstLayer().Output.Size then pInput.ReSize(FNN.GetFirstLayer().Output);
    if pOutput.Size <> FNN.GetLastLayer().Output.Size then pOutput.ReSize(FNN.GetLastLayer().Output);
    // Pick a random row.
    RowIdx := Random(FDatasetSize);
    RowText := FDataset[RowIdx];
    // The usable length is limited by the row itself, by the input width and
    // by the current FMaxPredictCharPos.
    UsableLen := Min(FMaxPredictCharPos, Min(Length(RowText), pInput.SizeX));
    // Random cut position, never below csMinSampleSize.
    CutPos := csMinSampleSize + Random(UsableLen-csMinSampleSize);
    // The expected class is the code of the character after the cut, clamped
    // to the last depth index of the input volume.
    TargetClass := Min(Ord(RowText[CutPos+1]), pInput.Depth-1);
    // Encode the input and output volumes.
    pInput.OneHotEncodingReversed(copy(RowText, 1, CutPos));
    pOutput.SetClassForSoftMax(TargetClass);
    pOutput.Tag := TargetClass;
  end;

  // Fills pInput/pOutput with the validation pair number Idx. Unlike
  // GetTrainingPair this is deterministic: the row and the cut position are
  // both derived from Idx, so the same Idx always yields the same pair.
  // ThreadId is not used.
  procedure TTestFitLoading.GetValidationPair(Idx: integer; ThreadId: integer;
    pInput, pOutput: TNNetVolume);
  var
    SampleId: integer;
    SampleLen: integer;
    SampleCutPosition: integer;
    ExpectedTokenChar: char;
    ExpectedTokenInt: integer;
  begin
    // Make sure that expected input and output have the proper sizes.
    if FNN.GetFirstLayer().Output.Size <> pInput.Size then pInput.ReSize(FNN.GetFirstLayer().Output);
    if FNN.GetLastLayer().Output.Size <> pOutput.Size then pOutput.ReSize(FNN.GetLastLayer().Output);
    // Get the input sample. Idx is wrapped around the dataset size so that a
    // dataset with fewer rows than the number of validation/test volumes does
    // not index past the end of FDataset. For Idx < FDatasetSize the selected
    // row is the same as before.
    SampleId := Idx mod FDatasetSize;
    SampleLen := Min(Length(FDataset[SampleId]), pInput.SizeX);
    // Cut position cycles with Idx from csMinSampleSize-1 up to SampleLen-1.
    SampleCutPosition := (Idx mod (1+SampleLen-csMinSampleSize))+csMinSampleSize-1;
    // The expected token is the next character in the string
    ExpectedTokenChar := FDataset[SampleId][SampleCutPosition+1];
    // Clamp the character code to the last depth index of the input volume.
    ExpectedTokenInt := Min(Ord(ExpectedTokenChar),pInput.Depth-1);
    // Encode the input and output volumes
    pInput.OneHotEncodingReversed(copy(FDataset[SampleId], 1, SampleCutPosition));
    pOutput.SetClassForSoftMax(ExpectedTokenInt);
    pOutput.Tag := ExpectedTokenInt;
  end;

  // Test pairs are produced by GetValidationPair, so for a given Idx the test
  // pair is identical to the validation pair.
  procedure TTestFitLoading.GetTestPair(Idx: integer; ThreadId: integer;
    pInput, pOutput: TNNetVolume);
  begin
    GetValidationPair(Idx, ThreadId, pInput, pOutput);
  end;

// Program entry point: creates the application object, sets its title, runs
// it (the work happens in TTestFitLoading.DoRun) and frees it afterwards.
var
  Application: TTestFitLoading;
begin
  Application := TTestFitLoading.Create(nil);
  Application.Title:='Nano Covolutional Based NLP Trained from File';
  Application.Run;
  Application.Free;
end.
"""
with open("neural-api/examples/CaiOptimizedDenseNet/CaiOptimizedDenseNet.lpr", "w") as text_file:
    text_file.write(code)
!lazbuild neural-api/examples/CaiOptimizedDenseNet/CaiOptimizedDenseNet.lpi
!ls -l neural-api/bin/x86_64-linux/bin/CaiOptimizedDenseNet

In [None]:
# Run the compiled trainer. It reads 'tinystories.txt' from the current directory.
# NOTE(review): autosave.csv and autosave.nn used by the following cells are
# presumably written by this run - confirm.
!neural-api/bin/x86_64-linux/bin/CaiOptimizedDenseNet

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
13792000 Examples seen. Accuracy: 0.8147 Error: 0.48197 Loss: 0.55661 Threads: 8 Forward time: 3.58s Backward time: 4.75s Step time: 37.59s
13824000 Examples seen. Accuracy: 0.8122 Error: 0.45405 Loss: 0.52630 Threads: 8 Forward time: 3.53s Backward time: 4.41s Step time: 37.42s
Starting Validation.
Epochs: 144 Examples seen:13824000 Validation Accuracy: 0.7821 Validation Error: 0.5543 Validation Loss: 0.7302 Total time: 302.47min
Layer  0                                                     Max Output:  1.000 Min Output:  0.000 TNNetInput 81,1,128 Times: 0.00s 0.00s
Layer  1 Neurons: 32 Max Weight:   0.832 Min Weight:  -0.863 Max Output:  0.681 Min Output: -0.698 TNNetPointwiseConv 81,1,32 Times: 0.21s 0.08s Parent:0
Layer  2                                                     Max Output:  0.681 Min Output: -0.698 TNNetPadXY 83,1,32 Times: 0.01s 0.00s Parent:1
Layer  3 Neurons: 64 Max Weight:   0.555 Min Weight:  -0.535 M

In [None]:
# Show the per-epoch metrics (accuracy, loss, error, learning rate, time) logged in autosave.csv.
!cat autosave.csv

epoch,training accuracy,training loss,training error,validation accuracy,validation loss,validation error,learning rate,time,test accuracy,test loss,test error
1,0.7344,0.6005,0.4323,0.2026,10.7440,1.6365,0.0001000,95,,,
2,0.8830,0.4258,0.3229,0.1271,9.7572,1.7555,0.0001000,187,,,
3,0.8523,0.3943,0.3334,0.1125,8.8159,1.7831,0.0001000,280,,,
4,0.8470,0.6063,0.4446,0.1098,11.2453,1.7933,0.0001000,375,,,
5,0.8424,0.5759,0.5095,0.1048,11.0278,1.8023,0.0001000,469,,,
6,0.8712,0.4172,0.3530,0.1144,11.0418,1.7850,0.0001000,565,,,
7,0.8884,0.3228,0.2597,0.1279,10.1913,1.7566,0.0001000,663,,,
8,0.8909,0.4237,0.3561,0.1379,9.6974,1.7370,0.0001000,767,,,
9,0.8898,0.3702,0.2837,0.1686,9.8982,1.6769,0.0001000,877,,,
10,0.8921,0.3697,0.3066,0.1987,8.9174,1.6257,0.0001000,993,0.1987,8.9174,1.6257
11,0.8946,0.3656,0.2824,0.2154,9.7150,1.5984,0.0001000,1103,,,
12,0.8854,0.4107,0.3113,0.2204,8.9187,1.5939,0.0001000,1216,,,
13,0.8849,0.5218,0.3871,0.2325,7.9209,1.5727,0.0001000,1331,,,
14,0.8870,0.3961,0

In [None]:
# Pack the saved network file autosave.nn into a zip archive named JP45F01.
!zip JP45F01 autosave.nn

  adding: autosave.nn (deflated 59%)
