# Data Cleaning

This notebook cleans the raw curve data and produces dataframes that are ready to be fed into the GAN.

### Import Packages

In [1]:
import pandas as pd

import argparse
import json
import logging
import os

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision
import torchvision.models
import torchvision.transforms as transforms

from typing import List

## Set up logger to get details of errors
# Logger named after the current module; its level is set to DEBUG so that
# no record is filtered out at the logger itself.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

### Import data

In [2]:
# Importing Data
rawdata = pd.read_csv('s3://sagemaker-studio-dfml0t4nnx4/clean_data.csv')

# NOTE: this cell used to call rawdata.drop(columns=['lmfdb_label', 'rank'])
# without keeping the result. That statement had no effect and has been removed;
# 'rank' must stay in the frame because the rank filter further down needs it.

# Drop the saved-index column and the label, then force a1 and a6 to numeric.
# errors='coerce' turns any value that cannot be parsed into NaN, and the final
# dropna() removes every row that contains a NaN.
coef = (
    rawdata
    .drop(columns = ['Unnamed: 0','lmfdb_label'])
    .assign(
        a1 = lambda frame: pd.to_numeric(frame['a1'], errors='coerce'),
        a6 = lambda frame: pd.to_numeric(frame['a6'], errors='coerce'),
    )
    .dropna()
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
coef.describe()

Unnamed: 0,a1,a2,a3,a4,a6,rank
count,480517.0,480517.0,480517.0,480517.0,480517.0,480517.0
mean,0.496051,-0.059665,0.35082,-3700028000.0,4902672000000000.0,0.724176
std,0.499985,0.800867,0.477227,803268100000.0,5.447199e+18,0.648116
min,0.0,-1.0,0.0,-423406400000000.0,-9.801624e+20,0.0
25%,0.0,-1.0,0.0,-21179.0,-45263.0,0.0
50%,0.0,0.0,0.0,-814.0,6.0,1.0
75%,1.0,1.0,1.0,-12.0,52644.0,1.0
max,1.0,1.0,1.0,996148600000.0,3.353391e+21,3.0


### Making Data Binary

Encoding each coefficient as binary digits lets us use the sigmoid function as the output layer of the neural network, and it helps ensure that the generated coefficients are whole numbers rather than decimals.

In [6]:
# Create new dataframe to work with

# Set limits for coefficients
# Only rows with -2**13 < a4 < 2**13 and -2**13 < a6 < 2**13 are kept (strict
# bounds), so each magnitude fits in the 13 bits used by the binary encoding below.
COEF_LIMIT = 2**13
within_limits = (
    (coef['a4'] > -COEF_LIMIT) & (coef['a4'] < COEF_LIMIT)
    & (coef['a6'] > -COEF_LIMIT) & (coef['a6'] < COEF_LIMIT)
)

# Take an explicit copy: later cells add columns to this frame, and assigning
# into a filtered slice of `coef` triggers pandas' SettingWithCopyWarning.
binaryraw = coef[within_limits].copy()
# Both names refer to the same frame, as before.
binarycoef = binaryraw

In [7]:
binarycoef.describe()

Unnamed: 0,a1,a2,a3,a4,a6,rank
count,177460.0,177460.0,177460.0,177460.0,177460.0,177460.0
mean,0.465316,-0.036093,0.355889,-103.498067,28.793244,0.828626
std,0.498797,0.805042,0.478783,465.556261,2573.171074,0.682717
min,0.0,-1.0,0.0,-8141.0,-8191.0,0.0
25%,0.0,-1.0,0.0,-233.0,-547.0,0.0
50%,0.0,0.0,0.0,-52.0,2.0,1.0
75%,1.0,1.0,1.0,11.0,619.0,1.0
max,1.0,1.0,1.0,8191.0,8191.0,3.0


In [8]:
# Binary for a1
binarycoef['a1b'] = binarycoef['a1'].astype('int')

# Binary for a2
# a2b1 is the sign bit: 1 when a2 == -1 (negative), 0 otherwise.
# a2b2 is the magnitude bit: 1 when a2 is non-zero, 0 when a2 == 0.
# Vectorised comparisons give the same 0/1 values as a row-wise apply.
binarycoef['a2b1'] = (binarycoef['a2'] == -1).astype(int)
binarycoef['a2b2'] = (binarycoef['a2'] != 0).astype(int)

# Binary for a3
binarycoef['a3b'] = (binarycoef['a3'] != 0).astype(int)

# Binary for a4
### First I find whether it is positive or negative (1 means negative)
binarycoef['a4b1'] = (binarycoef['a4'] < 0).astype(int)
### Then I create a function that produces a binary list representation of an integer 
def create_binary_list_from_int(number: int, width: int = 13) -> List[int]:
    """Creates a fixed-width list of the binary digits of a non-negative integer

    Args:
        number: A non-negative Python ``int`` (``bool`` and ``float`` are rejected).
        width: Minimum length of the returned list; shorter representations are
            left-padded with zeros. Defaults to 13.

    Returns:
        The binary digits of ``number`` as a list of 0/1 integers, most
        significant bit first. If ``number`` needs more than ``width`` bits,
        no padding is added and the list is longer than ``width``.

    Raises:
        ValueError: If ``number`` is not a Python ``int`` or is negative.
    """
    # Check the type first so that a non-integer (e.g. a string) is reported as
    # the documented ValueError instead of failing inside the `<` comparison.
    if type(number) is not int or number < 0:
        raise ValueError("Only Positive integers are allowed")
    bits = [int(digit) for digit in format(number, 'b')]
    # A negative pad count multiplies to an empty list, so oversized numbers
    # pass through without padding.
    padding = [0] * (width - len(bits))

    return padding + bits
### Then I run the integer to binary list function on the absolute value of a4
### and a6, and expand each 13-element list into one column per bit
def _bit_lists_to_frame(bit_lists: pd.Series, prefix: str) -> pd.DataFrame:
    """Expand a Series of 13-element bit lists into one column per bit.

    Args:
        bit_lists: Series whose values are lists of thirteen 0/1 integers.
        prefix: Coefficient name used to build the column names, e.g. 'a4'.

    Returns:
        A DataFrame with columns '<prefix>b2' ... '<prefix>b14' and a fresh
        0..n-1 index (the index of ``bit_lists`` is not carried over).
    """
    bit_columns = [prefix + 'b' + str(position) for position in range(2, 15)]
    return pd.DataFrame(bit_lists.to_list(), columns = bit_columns)


# Binary for a4 magnitude (the sign bit a4b1 is already set on binarycoef)
binarycoef1 = binarycoef.copy()
binarycoef1['a4b'] = (binarycoef['a4'].abs().astype(int)).apply(create_binary_list_from_int)
a4binary = _bit_lists_to_frame(binarycoef1['a4b'], 'a4')
# reset_index() moves the original row labels into an 'index' column and gives
# binarycoef the same 0..n-1 index as a4binary, so the rows line up in concat.
rank_df1 = pd.concat([binarycoef.reset_index(), a4binary], axis = 1)

# Binary for a6 (similar process to a4)
rank_df1['a6b1'] = (rank_df1['a6'] < 0).astype(int)
rank_values2 = binarycoef.copy()
rank_values2['a6b'] = (binarycoef['a6'].abs().astype(int)).apply(create_binary_list_from_int)
a6binary = _bit_lists_to_frame(rank_values2['a6b'], 'a6')
# rank_df1 already has a 0..n-1 index, so no second reset_index() is needed;
# a second reset would only add a redundant 'level_0' column.
coef_df_binary1 = pd.concat([rank_df1, a6binary], axis = 1)

### Create DataFrame with only variables of interest

In [9]:
# Keep only the encoded columns plus 'rank', in this order: the a1/a2/a3 bits,
# then a4b1..a4b14, then a6b1..a6b14, then 'rank'.
leading_bit_columns = ['a1b', 'a2b1', 'a2b2', 'a3b']
a4_bit_columns = ['a4b' + str(position) for position in range(1, 15)]
a6_bit_columns = ['a6b' + str(position) for position in range(1, 15)]
columns_of_interest = leading_bit_columns + a4_bit_columns + a6_bit_columns + ['rank']

coef_df_binary = coef_df_binary1[columns_of_interest]

### Filtering data 

Keep only the curves of rank 1, then drop the `rank` column since it is constant after filtering.

In [10]:
# Select the rows whose rank equals 1, then remove the 'rank' column.
is_rank_one = coef_df_binary['rank'] == 1
coef_df_binary = coef_df_binary.loc[is_rank_one].drop(columns = ['rank'])

In [11]:
coef_df_binary.describe()

Unnamed: 0,a1b,a2b1,a2b2,a3b,a4b1,a4b2,a4b3,a4b4,a4b5,a4b6,...,a6b5,a6b6,a6b7,a6b8,a6b9,a6b10,a6b11,a6b12,a6b13,a6b14
count,91430.0,91430.0,91430.0,91430.0,91430.0,91430.0,91430.0,91430.0,91430.0,91430.0,...,91430.0,91430.0,91430.0,91430.0,91430.0,91430.0,91430.0,91430.0,91430.0,91430.0
mean,0.471465,0.340829,0.648934,0.356294,0.685081,0.00187,0.006989,0.021295,0.112108,0.197977,...,0.303675,0.353943,0.394181,0.431937,0.457333,0.482861,0.464705,0.48401,0.419676,0.468479
std,0.499188,0.473991,0.477306,0.478906,0.464486,0.043207,0.083308,0.144367,0.315501,0.398477,...,0.459846,0.478194,0.488677,0.495348,0.498179,0.499709,0.498755,0.499747,0.493509,0.499008
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Download DataFrames

In [13]:
# Write the rank-1 binary frame to a CSV at this relative path.
rank_1_csv_path = 'rank_1_curves.csv'
coef_df_binary.to_csv(rank_1_csv_path)