# Learning Regular Grammar (Long Sequence)

### Import Packages

In [None]:
from gpt_api import *
import sys
sys.path.insert(0, '../../')
from PyGol import *
import os
import numpy as np
import matplotlib.pyplot as plt
import pylab as plb

### Read all the files from the data folder
#### Get the list of all files and directories

In [None]:
# Folder that is listed to discover the dataset files.
path = "Dataset/"
# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which the datasets are processed reproducible across runs and machines.
data_files = sorted(os.listdir(path))
# Prefix prepended to the output file name handed to the DFA learner.
result_folder = "result/"

### List to save the metrics from all the runs

In [None]:
# Accumulators filled by later cells; each starts out as its own empty list.
final_accuracy_pygol, final_accuracy_GPT, default_accuracy = [], [], []

### Read the files from the dataset and separate them into positive and negative examples

In [None]:
# Map each dataset file name to [P, N], the pair returned by
# read_and_print_lines (positive and negative examples, per the cell heading).
examples = {}
for file_name in data_files:
    # Skip hidden entries such as macOS's '.DS_Store' up front. Deleting the
    # key afterwards raised KeyError on machines where the file does not
    # exist, and also meant the hidden file was parsed as if it were a dataset.
    if file_name.startswith('.'):
        continue
    # os.path.join avoids the doubled separator produced by path + "/" + name.
    P, N = read_and_print_lines(os.path.join(path, file_name))
    examples[file_name] = [P, N]

### List to guide the number of runs

In [None]:
# Training-set sizes to sweep over; the learning cell runs once per value.
list_1 = [
    3, 4, 5, 6, 8,
    10, 15, 20,
    30, 40, 50,
]

### Learning Phase (PyGol and GPT)

In [None]:
# `re` is needed to parse the GPT reply below. Import it explicitly instead of
# relying on it leaking into the namespace through one of the star imports.
import re

# For every training-set size, learn a DFA with PyGol on each dataset, score it
# on the held-out examples, score GPT on the same split, and record both
# per-dataset accuracy lists.
for eachnum in list_1:
    acc_pygol = []
    acc_gpt = []
    print("Number of pos. examples:",eachnum)
    for dataset_name, (all_pos, all_neg) in examples.items():
        # Train and test data split: `eachnum` is passed for both training
        # sizes. NOTE(review): 250, 250 are presumably the positive/negative
        # test-set sizes -- confirm against generate_data_split_language.
        pos_train, neg_train, pos_test, neg_test = generate_data_split_language(
            all_pos, all_neg, eachnum, eachnum, 250, 250)
        test_size = len(pos_test) + len(neg_test)

        # Background knowledge generation - PyGol
        B, P, N = generate_bk_for_language(pos_list=pos_train, neg_list=neg_train)
        # Alphabet is taken from the positive training examples only.
        A = sorted(extract_alphabets(pos_train))
        # Bottom clause generation - PyGol
        pos_bc, neg_bc = bottom_clause_generation(
            file=B,
            constant_set=A,
            positive_example=P,
            negative_example=N,
            container="memory",
            tqdm_disable=True,
        )

        # Learning phase of PyGol
        #   SM - State Matrix
        #   S  - States
        #   H  - PyGol Hypothesis
        SM, S, H = pygol_learn_languages(
            positive_file_dictionary=pos_bc,
            negative_file_dictionary=neg_bc,
            alphabets=A,
            file_name=result_folder + "dfa" + dataset_name,
            view_dfa=False,
        )
        # Production rules. The result is not used below; the call is kept in
        # case generate_production_rules has side effects (not visible here).
        Pt = generate_production_rules(A, SM, S)

        # --- Testing phase of PyGol ---
        # A positive test string counts as correct when final_state_check
        # returns 1, a negative one when it returns anything else.
        # NOTE(review): S[0] / S[-1] are presumably the start and accepting
        # states -- confirm against final_state_check.
        pos = sum(
            1 for eachst in pos_test
            if final_state_check(SM, S, A, eachst, S[0], S[-1]) == 1
        )
        neg = sum(
            1 for eachst in neg_test
            if final_state_check(SM, S, A, eachst, S[0], S[-1]) != 1
        )
        accuracy = (pos + neg) / test_size
        acc_pygol.append(accuracy)

        # --- Testing phase of GPT ---
        result = evaluate_model_with_chatgpt(pos_train, neg_train, pos_test, neg_test)
        numbers = [int(token) for token in re.findall(r'\d+', result)]
        # The reply is expected to contain exactly two integers (TP and TN).
        # Fail with a message that shows the offending reply instead of an
        # opaque tuple-unpacking error.
        if len(numbers) != 2:
            raise ValueError(
                "Expected exactly two integers (TP, TN) in the GPT reply for "
                f"{dataset_name!r}, found {len(numbers)}: {result!r}"
            )
        TP, TN = numbers
        # GPT accuracy is floored at 0.5.
        accuracy_chat = max((TP + TN) / test_size, 0.5)
        acc_gpt.append(accuracy_chat)

    print("\t PyGol",np.mean(acc_pygol))
    print("\t GPT",np.mean(acc_gpt))
    final_accuracy_pygol.append(acc_pygol)
    final_accuracy_GPT.append(acc_gpt)

### Set Default Accuracy list according to the length of items in 'list_1'

In [None]:
# Constant baseline: one [0.5, 0.5] entry per value in list_1, so the baseline
# series has the same length as the other accuracy series.
sample_list = [0.5, 0.5]
# Slice-assign rather than append so that re-running this cell does not keep
# growing the list (which would make its length disagree with list_1), and give
# every entry its own copy instead of many references to one shared list.
default_accuracy[:] = [list(sample_list) for _ in list_1]

### Plot accuracy graph

In [None]:
plb.rcParams['font.size'] = 12


def _mean_and_sem(runs):
    """Summarise a list of accuracy lists.

    For each inner list, compute its mean and its standard error, taken as
    ``np.std`` (default arguments) divided by the square root of its length.

    Returns a ``(means, errors)`` pair of lists, one entry per inner list.
    """
    means, errors = [], []
    for run in runs:
        values = np.array(run)
        means.append(np.mean(values))
        errors.append(np.std(values) / np.sqrt(len(run)))
    return means, errors


# One (means, errors) pair per plotted series.
means_default, errors_default = _mean_and_sem(default_accuracy)
means_pygol, errors_pygol = _mean_and_sem(final_accuracy_pygol)
means_chat, errors_chat = _mean_and_sem(final_accuracy_GPT)


# Creating the plot: mean accuracy with error bars against training-set size,
# one dashed line per series.
plt.figure(figsize=(10, 6))
plot_series = [
    (means_pygol, errors_pygol, 'red', 'PyGol'),
    (means_chat, errors_chat, 'blue', 'GPT'),
    # Label typo fixed: was 'Deafult'.
    (means_default, errors_default, 'black', 'Default'),
]
for series_means, series_errors, colour, series_label in plot_series:
    plt.errorbar(list_1, series_means, yerr=series_errors, fmt='--', capsize=5,
                 ecolor=colour, color=colour, markersize=1, label=series_label)

# Ticks every 4 units from 0 to 52, each labelled with its own value.
custom_xtick_positions = list(range(0, 53, 4))
custom_xtick_labels = [str(position) for position in custom_xtick_positions]

plt.xticks(ticks=custom_xtick_positions, labels=custom_xtick_labels)
plt.xlabel('Number of examples')
plt.ylabel('Accuracy')
plt.xlim(1.7, 52)
plt.ylim(0.4, 1.1)
# Single legend call; the earlier bare plt.legend() was immediately replaced by
# this one. The legend is anchored to the right of the axes.
plt.legend(bbox_to_anchor=[1.1, 0.5],
           loc='center', ncol=1)
# bbox_inches='tight' expands the saved area to include the legend, which sits
# partly outside the axes and would otherwise be cut off in the PNG.
plt.savefig('Plot.png', bbox_inches='tight')
# Show the plot
plt.show()