# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'colab'

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sage.all import primes_first_n


def is_prime(n):
    """Return True if ``n`` is a prime number, using trial division.

    Parameters
    ----------
    n : int
        Candidate value. Anything below 2 is reported as not prime.

    Returns
    -------
    bool
        True when ``n`` >= 2 and no integer d with 2 <= d and d*d <= n
        divides ``n``; False otherwise.
    """
    if n < 2:
        return False
    if n < 4:
        # 2 and 3 are prime; handle them before the even-number shortcut.
        return True
    if n % 2 == 0:
        return False
    # Only odd divisors are left to try. Comparing divisor * divisor with n
    # keeps the stopping bound in exact integer arithmetic, so no
    # floating-point square root is involved and arbitrarily large ints work.
    divisor = 3
    while divisor * divisor <= n:
        if n % divisor == 0:
            return False
        divisor += 2
    return True

# String labels ("2", "3", "5", ...) of every prime from 1 to 1000; these
# double as the names of the per-prime data-frame columns.
PRIME_COLS_BIG = [str(p) for p in range(1, 1001) if is_prime(p)]
# Type tags that are searched for inside the raw 'instance_types' field.
LfunctionTypes = ['Artin', 'BMF', 'CMF', 'DIR', 'ECNF', 'ECQ', 'G2Q', 'HMF']
# NOTE: 'NF' is not in the list above. It does occur in the data (as the
# Riemann zeta function), which we possibly want to remove.

   
def build_lfunctions_df_big(filename='lfun_rat_withap.txt'):
    """Load the rational L-functions table and tidy its 'instance_types'.

    Parameters
    ----------
    filename : str, optional
        Path of the colon-delimited text file to read (the file downloaded
        from Zenodo). Defaults to 'lfun_rat_withap.txt'.

    Returns
    -------
    pandas.DataFrame
        The table as read, except that every 'instance_types' entry is
        replaced by a tuple of the names from LfunctionTypes that occur
        inside the raw entry, in LfunctionTypes order.
    """
    DF = pd.read_table(filename, delimiter=":", header='infer', low_memory=False)

    # Compute the replacement once per distinct raw value, not once per row.
    raw_to_types = {
        raw: tuple(Ltype for Ltype in LfunctionTypes if Ltype in raw)
        for raw in set(DF['instance_types'])
    }

    # Element-wise lookup on the single column; DataFrame.apply(axis=1) would
    # materialise every full row just to read one field from it.
    DF['instance_types'] = DF['instance_types'].map(lambda raw: raw_to_types[raw])
    return DF

def write_to_int(an_list):
    """Parse a bracketed, comma-separated string of integers into a list.

    Every '[' and ']' is removed before splitting on commas, so nested
    brackets are flattened: '[[1,2],[3]]' gives [1, 2, 3].

    Parameters
    ----------
    an_list : str
        Text such as '[1, -2, 3]'.

    Returns
    -------
    list of int
        The parsed values, or an empty list when the string holds no values
        (for example '[]').

    Raises
    ------
    ValueError
        If a comma-separated piece is not an integer literal.
    """
    body = an_list.replace('[', '').replace(']', '')
    if not body.strip():
        # ''.split(',') yields [''], which int() rejects; an empty list of
        # coefficients is a legitimate input, so report it as such.
        return []
    return [int(piece) for piece in body.split(',')]

def write_to_hasse_normalized_primes_big(ap_list, w, d = 1):
    """Parse a string of a_p values and rescale each one by its prime.

    The i-th parsed coefficient is paired with the i-th label of
    PRIME_COLS_BIG (read as the prime p) and multiplied by
    (d * p**(w/2))**(-1). Each product is rounded to 5 decimal places and
    stored as np.float32.

    Parameters
    ----------
    ap_list : str
        Bracketed, comma-separated integers, in the form accepted by
        write_to_int.
    w : number
        Exponent parameter of the rescaling; build_hasse_ap_df_big passes
        the row's 'motivic_weight'.
    d : number, optional
        Extra divisor of the rescaling (default 1); build_hasse_ap_df_big
        passes the row's 'degree'.

    Returns
    -------
    list of numpy.float32
        One rescaled value per (prime, coefficient) pair. Because the pairs
        come from zip, the list is as long as the shorter of PRIME_COLS_BIG
        and the parsed coefficients.
    """
    coefficients = write_to_int(ap_list)
    normalized_list = []
    # PRIME_COLS_BIG is built from is_prime, so every label is already a
    # prime; re-testing primality for each entry of each row is wasted work.
    for p_label, ap in zip(PRIME_COLS_BIG, coefficients):
        p = int(p_label)
        normalization_quotient = (d*p**(w/2))**(-1)
        normalized_list.append(np.float32(round(ap * normalization_quotient, 5)))
    return normalized_list


def build_hasse_ap_df_big(DF):
    """Return a new frame with the 'ap' column expanded into prime columns.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain the columns 'ap', 'motivic_weight' and 'degree'.

    Returns
    -------
    pandas.DataFrame
        Every column of ``DF`` except 'ap', followed by one column per label
        in PRIME_COLS_BIG. Row by row, those columns hold the list returned
        by write_to_hasse_normalized_primes_big(ap, motivic_weight, degree).
        The row index of ``DF`` is kept; ``DF`` itself is not modified.
    """
    normalized_rows = [
        write_to_hasse_normalized_primes_big(a, w, d)
        for w, a, d in zip(DF['motivic_weight'], DF['ap'], DF['degree'])
    ]
    prime_block = pd.DataFrame(normalized_rows, columns=PRIME_COLS_BIG, index=DF.index)

    # Assemble the result in one concat. Inserting columns one at a time into
    # an empty frame fragments it (the source of pandas' PerformanceWarning),
    # and assigning an empty list to the prime columns fails when DF has no
    # rows.
    return pd.concat([DF.drop(columns='ap'), prime_block], axis=1)



# Load the full table, then keep only the rows flagged as primitive.
DF = build_lfunctions_df_big()
print('Number of rows in all of the dataset = ', len(DF))
DF = DF[DF['primitive'] == True]

# Quick look at how the order of vanishing is distributed before filtering.
print(DF['order_of_vanishing'].max())
print(DF['order_of_vanishing'].value_counts())

# Restrict to order of vanishing < 4, degree 4 and motivic weight 1.
mask1=DF['order_of_vanishing']<4
mask2=DF['degree']==4
mask3=DF['motivic_weight']==1
mask4=mask1&mask2&mask3
# Renumber the surviving rows 0..n-1 under an index named 'index'.
# reset_index does this on a fresh frame, instead of adding a helper column
# to a filtered slice (which triggers pandas' SettingWithCopyWarning).
DF = DF[mask4].reset_index(drop=True).rename_axis('index')
print('Number of rows that we will actually use = ', len(DF))

# Now build some normalized dataframes.
# build_hasse_ap_df_big rescales every a_p using the row's motivic weight
# and degree, and stores the results in the PRIME_COLS_BIG columns.
DF_big = build_hasse_ap_df_big(DF)

# Do some 2D PCA on the per-prime columns.
X = DF_big[PRIME_COLS_BIG]
y = DF_big['label']

# Perform PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)

# Create a DataFrame with the principal components
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
# pca_df rows are in the same order as DF_big rows, so attach both metadata
# columns by position (.values) rather than relying on index alignment.
pca_df['label'] = y.values
pca_df['order_of_vanishing'] = DF_big['order_of_vanishing'].values


def _pca_scatter(frame, color_col, n_levels):
    """Build the PC1-vs-PC2 scatter figure of ``frame`` coloured by ``color_col``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'PC1', 'PC2', 'label' and ``color_col``.
    color_col : str
        Name of the column that drives the point colours.
    n_levels : int
        Number of distinct values of the colour column. Below 10 the colour
        values are handed to plotly as strings, otherwise unchanged.

    Returns
    -------
    tuple
        ``(figure, color)``: the plotly figure and the colour series that was
        passed to it.
    """
    if n_levels < 10:
        # NOTE(review): string values presumably make plotly draw a discrete
        # legend rather than a continuous colour scale -- confirm.
        color = frame[color_col].astype(str)
    else:
        color = frame[color_col]

    figure = px.scatter(frame,
                        x='PC1',
                        y='PC2',
                        color=color,
                        opacity=float(0.7),
                        hover_data='label')
    figure.update_layout(title='PCA colored by order of vanishing',
                         title_x=float(0.5),
                         xaxis_title="PC1",
                         yaxis_title="PC2",
                         template='plotly_white',
                         legend_title='Van. order',
                         autosize=False,
                         width=int(800),
                         height=int(500))
    return figure, color


col = 'order_of_vanishing'
vals = DF_big[col].value_counts()

# First plot: rows in their original order.
fig, color = _pca_scatter(pca_df, col, len(vals))
print('have figure, now writing')
#fig.write_image('images/ap_1000_hasse_norm_pca.pdf')
fig.show()

#Now sort and replot
# (presumably because row order affects how the colour groups are drawn)
pca_df = pca_df.sort_values(by=['order_of_vanishing'], ascending = True)
fig, color = _pca_scatter(pca_df, col, len(vals))
print('have figure, now writing')
#fig.write_image('images/ap_1000_hasse_norm_pca_ascending.pdf')
fig.show()


# Same plot once more with the rows in descending order of vanishing.
pca_df = pca_df.sort_values(by=['order_of_vanishing'], ascending = False)
fig, color = _pca_scatter(pca_df, col, len(vals))
print('have figure, now writing')
#fig.write_image('images/ap_1000_hasse_norm_pca_descending.pdf')
fig.show()



# And now for some LDA
# Features: the per-prime columns. Target: the order of vanishing.
X = DF_big[PRIME_COLS_BIG]
y = DF_big['order_of_vanishing']

# Hold out 20% of the rows for testing, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=float(0.2),
    random_state=int(42),
)

# Fit the LDA classifier on the training portion only.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred = lda.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("LDA Accuracy:", accuracy)
