In [1]:
import sys
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup

# Disable pandas' chained-assignment check by setting the option to None,
# so chained assignments no longer trigger SettingWithCopyWarning.
pd.options.mode.chained_assignment = None

# Module-level move counter, initialised to zero.
movesCount = 0

def get_soup(URL, timeout=30):
    """Download ``URL`` and return its body parsed as a BeautifulSoup tree.

    Parameters
    ----------
    URL : str
        Address of the page to fetch. It is echoed to stdout (followed by a
        blank line) before the request is sent.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the notebook cell
        indefinitely. Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        The response content parsed with the ``"html.parser"`` backend.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds ``timeout``.
    """
    # NOTE(review): TLS certificate verification is switched off
    # (verify=False) and the matching urllib3 warning is silenced. That is
    # kept as-is here, but it means the server's identity is not checked.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    print(URL + '\n')
    response = requests.get(URL, verify=False, timeout=timeout)
    return BeautifulSoup(response.content, "html.parser")

def num_to_RNA(Min, Max, Value, aaKey, aa, codonTable=None):
    """Pick one codon of amino acid ``aa`` according to where ``Value`` lies in ``[Min, Max]``.

    ``Value`` is normalised to ``(Value - Min) / (Max - Min)``, scaled by
    ``aaKey`` and truncated to an integer, which is then used as an index into
    the codon list of ``aa``.

    Parameters
    ----------
    Min, Max : number
        Lower and upper bound of the expected value range (``Max != Min``).
    Value : number
        The value to encode.
    aaKey : int
        Scale factor for the normalised value; callers in this file pass the
        number of codons available for ``aa``.
    aa : str
        Key into the codon table.
    codonTable : dict, optional
        Mapping ``amino acid -> list of codons``. Defaults to the module-level
        ``degenAADict``.

    Returns
    -------
    str
        The selected codon.

    Raises
    ------
    KeyError
        If ``aa`` is not present in the codon table.
    ZeroDivisionError
        If ``Max == Min``.
    """
    if codonTable is None:
        codonTable = degenAADict
    codons = codonTable[aa]
    normalized = (Value - Min) / (Max - Min)
    index = int(aaKey * normalized)
    # Clamp into the valid range: Value == Max lands exactly one past the end,
    # values above Max land further out, and values below Min would otherwise
    # produce a negative index that silently wraps to the end of the list.
    index = max(0, min(index, len(codons) - 1))
    return codons[index]

def move_to_RNA(inputDF, dfIndex, typeAADict, degenAADict):
    """Encode one move row as an RNA string and store it in ``MOVE_RNA_SEQUENCE``.

    The sequence is the concatenation of five parts: the first codon of the
    move type's amino acid, the first codon of the move category's amino acid,
    and one codon each for PP, Power and Accuracy (chosen by ``num_to_RNA``
    from the move type's codon list). Power or Accuracy values that cannot be
    converted to an integer are encoded as ``'---'``.

    Parameters
    ----------
    inputDF : pandas.DataFrame
        Table with ``Type``, ``Category``, ``PP``, ``Power`` and ``Accuracy``
        columns. It is modified in place.
    dfIndex : int
        Position (0-based) of the row to encode.
    typeAADict : dict
        Maps type / category labels to amino-acid keys.
    degenAADict : dict
        Maps amino-acid keys to lists of codons.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the row's type or category is missing from ``typeAADict``.
    ValueError
        If the PP cell is not an integer (PP is not given a placeholder).
    """
    row = inputDF.iloc[dfIndex]
    moveTypeAA = typeAADict[row['Type']]
    moveCatAA = typeAADict[row['Category']]
    aaKey = len(degenAADict[moveTypeAA])

    def as_int(cell, marker):
        # Cells may be text carrying a decoration ('40*', '100%') or may
        # already be numeric; only text has a marker to strip.
        if isinstance(cell, str):
            cell = cell.replace(marker, '')
        return int(cell)

    ppRNA = num_to_RNA(1, 40, as_int(row['PP'], '*'), aaKey, moveTypeAA)
    try:
        powerRNA = num_to_RNA(10, 210, int(row['Power']), aaKey, moveTypeAA)
    except ValueError:
        powerRNA = '---'
    try:
        accRNA = num_to_RNA(30, 101, as_int(row['Accuracy'], '%'), aaKey, moveTypeAA)
    except ValueError:
        accRNA = '---'

    tempSeq = (
        degenAADict[moveTypeAA][0]
        + degenAADict[moveCatAA][0]
        + ppRNA
        + powerRNA
        + accRNA
    )
    # The row was read by position, so translate the position to its label
    # before the label-based write; otherwise a frame whose index is not
    # 0..n-1 would get a new row instead of an update.
    inputDF.at[inputDF.index[dfIndex], 'MOVE_RNA_SEQUENCE'] = tempSeq

def get_moves_dataframe(Path):
    """Load the moves table from ``Path`` or, if the file is missing, scrape and encode it.

    * If the CSV exists it is read with its first column as the index.
    * Otherwise the Bulbapedia "List of moves" page is downloaded, the third
      table on the page is taken, its first and last columns are dropped, and
      every row is encoded with ``move_to_RNA``.

    Whenever the resulting frame has a ``MOVE_RNA_SEQUENCE`` column, it is
    reduced to a two-column ``Name`` / ``RNA_Seq`` frame (asterisks removed
    from the names), written to ``Path`` and returned. A frame without that
    column (for example a CSV saved by an earlier run) is returned unchanged.

    Parameters
    ----------
    Path : pathlib.Path
        Location of the cache CSV. (The parameter name shadows
        ``pathlib.Path`` inside this function; it is kept for compatibility
        with existing callers.)

    Returns
    -------
    pandas.DataFrame
    """
    movesCSV = Path
    if movesCSV.exists():
        movesDF = pd.read_csv(movesCSV, index_col=0)
        print('MOVES DATABSE FILE EXISTS: ' + str(len(movesDF.index)) + ' ENTRIES - Loaded')
    else:
        print('MOVES DATABSE FILE DOES NOT EXISTS; DOWNLOADING ')
        movesListURL = 'https://bulbapedia.bulbagarden.net/wiki/List_of_moves'
        movesListTables = get_soup(movesListURL).find_all('table')
        # read_html wants a file-like object; passing the markup as a bare
        # string is deprecated in recent pandas releases.
        scrapedDF = pd.read_html(StringIO(str(movesListTables)))[2]
        # Drop the first and last columns by position.
        movesDF = scrapedDF.iloc[:, 1:-1].copy()
        # Text placeholder column, overwritten row by row below.
        movesDF['MOVE_RNA_SEQUENCE'] = 'nan'
        for rowPos in range(len(movesDF)):
            move_to_RNA(movesDF, rowPos, typeAADict, degenAADict)

    if 'MOVE_RNA_SEQUENCE' not in movesDF.columns:
        return movesDF

    # Build the result in one step instead of appending one-row frames:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic.
    finalMovesSeqDF = pd.DataFrame(
        {
            'Name': movesDF['Name'].str.replace('*', '', regex=False),
            'RNA_Seq': movesDF['MOVE_RNA_SEQUENCE'],
        }
    ).reset_index(drop=True)
    print(f'SAVING UPDATED DATAFRAME: {Path}')
    finalMovesSeqDF.to_csv(movesCSV)
    return finalMovesSeqDF

def aa_to_RNA(inputSeqList, degenAADict,normIndex,outPath):
    """Translate amino-acid strings to nucleotide strings and save them as CSV.

    Each item of ``inputSeqList`` is indexed as ``item[1]`` (label) and
    ``item[2]`` (amino-acid string). Every residue that is a key of
    ``degenAADict`` contributes the codon at position ``normIndex`` of its
    codon list; residues that are not keys are skipped. Results are keyed by
    label, so a repeated label keeps only its last translation.

    The resulting two-column frame (``Input``, ``Nucleotide Sequence``) is
    written to ``outPath`` and returned.
    """
    knownResidues = list(degenAADict.keys())
    translated = {}
    for entry in inputSeqList:
        codons = [
            degenAADict[residue][normIndex]
            for residue in entry[2]
            if residue in knownResidues
        ]
        translated[entry[1]] = ''.join(codons)

    df = pd.DataFrame(
        {
            'Input': translated.keys(),
            'Nucleotide Sequence': translated.values(),
        }
    )
    print(f'SAVING UPDATED DATAFRAME: {outPath}')
    df.to_csv(outPath)
    return df

def add_gene_to_dict(geneDictionary, organismName, geneSequence):
    """Store ``geneSequence`` under ``organismName`` and return the same mapping.

    The dictionary is modified in place; an existing entry for
    ``organismName`` is overwritten.
    """
    geneDictionary.update({organismName: geneSequence})
    return geneDictionary

def dict_to_phylip(geneDictionary):
    """Render a ``name -> sequence`` mapping as PHYLIP-style text.

    The first line is `` <number of entries> <length of the longest sequence>``.
    Each following line holds the name left-aligned and padded to at least
    ten characters, a space, and the sequence. The text is printed and
    returned.

    Raises ``ValueError`` when the mapping is empty (there is no longest
    sequence).
    """
    longest = max(len(sequence) for sequence in geneDictionary.values())
    lines = [f" {len(geneDictionary)} {longest}"]
    for org_id, sequence in geneDictionary.items():
        lines.append("{:<10} {}".format(org_id, sequence))
    phylipAlignment = '\n'.join(lines)
    print(phylipAlignment)
    return phylipAlignment


# Label -> amino-acid key (written as AA letters). Holds both move types
# and move categories; the keys double as lookups into degenAADict.
# '---' and '???' both map to the '---' placeholder, and 'Unknown' and
# 'Physical' share the key 'Z'.
typeAADict = {
    'Fairy': 'C',
    'Water': 'S',
    'Ground': 'E',
    'Dragon': 'Q',
    'Bug': 'V',
    'Special': 'B',
    'Electric': 'I',
    'Rock': 'N',
    'Status': 'W',
    'Ghost': 'Y',
    'Fighting': 'P',
    'Ice': 'H',
    'Normal': 'L',
    'Dark': 'R',
    'Unknown': 'Z',
    '---': '---',
    '???': '---',
    'Fire': 'T',
    'Poison': 'D',
    'Psychic': 'K',
    'Steel': 'F',
    'Grass': 'A',
    'Flying': 'G',
    'Physical': 'Z',
}




# One entry per group: [short name, display label, amino-acid string].
# aa_to_RNA reads entry[1] as the output key and entry[2] as the sequence.
# NOTE(review): presumably these are egg groups (going by the variable
# names used with this list) -- confirm.
egAAList = [
    ['Fairy', 'Fae(Fairy)', 'MCACCVINYPHLRCDKFSGO'],
    ['Grass', 'Plant&Ha-Plants(Grass)', 'MCAEQVAAYPHARADKFSGO'],
    ['Mineral', 'Mineral', 'MCAEQVINYPHLETDKFEGO'],
    ['Dragon', 'Ryu(Dragon)', 'MQAEQQINYPQLRTDQFSGO'],
    ['Bug', 'Arthropods & Mullosks(Bug)', 'MCAEQVINVPHVRTDKFSGO'],
    ['Human-Like', 'Human-Like', 'MCALLVILYPHLRTDKFLGO'],
    ['Monster', 'Monster', 'MPAEQPINYPHLRTDKFSGO'],
    ['Amorphous', 'Ghost&Soft-Bodied(Amorphous)', 'MCAEQYINYPHLRTDKFSGO'],
    ['Water-1', 'Aquatic-Terrestrial(W1)', 'MCAEQVINYPHLRSDKFSGO'],
    ['Water-2', 'Fish-Cephelopod(W2)', 'MSSESSINSSSSRSDKSSGO'],
    ['Water-3', 'AquaticInvertebrates(W3)', 'MSASSVSNYPHSRSDKSSGO'],
    ['Flying', 'Birds&Bats(Flying)', 'MCAGQGINYPGLRTDKFSGO'],
    ['Field', 'Terrestrial(Field)', 'MCAELLINLPHLRTDKFSGO'],
    ['Undiscovered', 'Undiscovered (No Egg Discovered/Produced)', 'MCAEQWINYPHLRTDKFSGO'],
    ['Undiscovered', 'Undiscovered(NoEggDiscovered/Produced)', 'MWWWWWWWWWWWWWWWWWWO'],
    ['Undiscovered', 'Undiscovered(Baby)', 'MCAWWWINWPHLWTDKWSGO'],
    ['Ditto', 'Ditto', 'MCAEQVINYPHLRTDKFSGO'],
]

# Amino-acid key -> list of RNA codons available for it.
# 'B' and 'Z' are the ambiguity codes (Asn/Asp and Glu/Gln), 'O' is used as
# the stop marker, and '---' is a placeholder that passes through unchanged.
# The first codon of every entry must be unique, because move_to_RNA uses
# entry[0] as the marker for a move's type and category.
degenAADict = {
    'L': ['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG'],
    'R': ['AGG', 'AGA', 'CGG', 'CGA', 'CGC', 'CGU'],
    'A': ['GCU', 'GCC', 'GCA', 'GCG'],
    'G': ['GGA', 'GGG', 'GGC', 'GGU'],
    'P': ['CCU', 'CCC', 'CCA', 'CCG'],
    # Threonine is the ACN codon box in the standard genetic code. The AAN
    # codons belong to N (AAU/AAC) and K (AAA/AAG) and must not be reused.
    # Sequences cached with other 'T' codons need to be regenerated.
    'T': ['ACU', 'ACC', 'ACA', 'ACG'],
    'V': ['GUU', 'GUC', 'GUA', 'GUG'],
    'I': ['AUU', 'AUC', 'AUA'],
    'N': ['AAU', 'AAC'],
    'D': ['GAU', 'GAC'],
    'B': ['AAC', 'GAC'],
    'C': ['UGU', 'UGC'],
    'E': ['GAA', 'GAG'],
    'Q': ['CAA', 'CAG'],
    'Z': ['GAG', 'CAG'],
    'H': ['CAU', 'CAC'],
    'K': ['AAA', 'AAG'],
    'F': ['UUU', 'UUC'],
    'S': ['AGU', 'AGC'],
    'Y': ['UAU', 'UAC'],
    'W': ['UGG'],
    'M': ['AUG'],
    'O': ['UAA'],
    '---': ['---'],
}

# Body-type label -> single-element list holding its nucleotide string.
# dict_to_RNA_df reads element [0] of each value.
btRNADict = {
    'Bipedal': ['AGUAUUGCAUUU'],
    'Quadraped': ['AGCAUUGCGUUU'],
    'Bipedal&Tailed': ['AGUAUCGCAUUU'],
    'Head&Arms': ['AGUAUUGCAUUC'],
    'Head&Legs': ['AGUAUUGCGUUU'],
    'Head&Base': ['AGUAUUGCGUUC'],
    'OnlyHead': ['AGUAUAGCGUUC'],
    'Finned': ['AGUAUCGCUUUU'],
    'Serpentine': ['AGUAUCGCGUUC'],
    'SingleWings': ['AGUAUUGCCUUC'],
    'Insectiod': ['AGCAUUGCCUUU'],
    'Two+Wings': ['AGCAUUGCCUUC'],
    'Tentacled/MultiBody': ['AGCAUUGCAUUC'],
    'MultiBody': ['AGCAUUGCGUUC'],
}

def dict_to_RNA_df(inputDict):
    """Convert a ``label -> [sequence, ...]`` mapping into a two-column DataFrame.

    Parameters
    ----------
    inputDict : dict
        Maps each label to a non-empty list; only the first element of the
        list is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``Input`` (the label) and ``Nucleotide Sequence`` (the first
        list element), one row per entry in insertion order with a fresh
        0..n-1 index. Rows containing a missing value are dropped. An empty
        mapping yields an empty frame that still has both columns.
    """
    # Build the frame in a single step: DataFrame.append no longer exists in
    # pandas 2.x, and appending row by row copies the frame every iteration.
    df = pd.DataFrame(
        {
            'Input': list(inputDict.keys()),
            'Nucleotide Sequence': [sequences[0] for sequences in inputDict.values()],
        }
    )
    df = df.dropna().reset_index(drop=True)
    print('LOADING DICTIONARY')
    return df

##### WORKING SPACE




In [2]:
# Turn the body-type dictionary into an Input / Nucleotide Sequence table.
bodyTypesRNADF = dict_to_RNA_df(btRNADict)

#display(bodyTypesRNADF)

LOADING DICTIONARY


In [3]:
# Cache file for the moves table, relative to the working directory.
movesCSV = Path('./MOVES_DICT.csv')

# Loads the cache if the file exists; otherwise downloads and encodes the moves.
movesRNADF = get_moves_dataframe(movesCSV)

#display(movesRNADF)

MOVES DATABSE FILE EXISTS: 850 ENTRIES - Loaded


In [4]:
# Output file for the translated egAAList sequences, relative to the working directory.
egCSV = Path('./EG_DICT.csv')

# Translate every entry using codon index 0 of each amino acid; the CSV is rewritten on each run.
eggGroupsRNADF = aa_to_RNA(egAAList, degenAADict,0,egCSV)

#display(eggGroupsRNADF)

SAVING UPDATED DATAFRAME: EG_DICT.csv


In [None]:
# NOTE(review): disabled draft -- every line in this cell is commented out.
# It appears to build one FASTA record per entry by concatenating the RNA
# sequences of the entry's moves between an 'AUG' start and a 'UAA' stop,
# then writing the records to ./tempFasta.fna.
# It relies on `currentIndicies` and `currentEntries`, which are not defined
# in the visible part of this notebook, and `print(name)` uses `name`, which
# the draft never assigns (probably `nameLine` was meant) -- confirm before
# re-enabling.
# movesRNADict = movesRNADF.set_index('Name')['RNA_Seq'].to_dict()

# print(currentIndicies)
# fastaWrite = []
# outputPath='./tempFasta.fna'
# for index in currentIndicies:
#     nameLine = f"{currentEntries['REGION'][index]} {currentEntries['NAME'][index]}"
#     moveSet = currentEntries['MOVES'][index]
#     fullMoveSeq = 'AUG'
#     #print(moveSet)
#     for m, move in enumerate(moveSet):
#         #print(move)
#         if move in movesRNADict:
#             moveSeq = movesRNADict[move]
#             #print(moveSeq)
#             fullMoveSeq = fullMoveSeq+moveSeq
#     fullMoveSeq = fullMoveSeq+'UAA'
#     print(name)
#     print(fullMoveSeq)
#     fastaWrite.append(f'>{nameLine}\n{fullMoveSeq}\n')
# with open(outputPath,'w') as fastaOut:
#     fastaOut.writelines(fastaWrite)