# Collecting Data for Research

In [None]:
# Spelling correction
%pip install biopython
from Bio import Entrez
Entrez.email = 'learnbiopython@gmail.com'
sciNames = ['Bos gaurus']

# Lists every data
record = Entrez.read(Entrez.espell(db='pmc', term='biopythonn'))
print(type(record))
print(record.keys())
for key in record.keys():
    print(key, ':', record[key])



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81
<class 'Bio.Entrez.Parser.DictionaryElement'>
dict_keys(['Database', 'Query', 'CorrectedQuery', 'SpelledQuery'])
Database : pmc
Query : biopythonn
CorrectedQuery : biopython
SpelledQuery : ['', 'biopython']


In [None]:
# Research Collection
# Search PMC for the term 'biopython', requesting at most 100 IDs. The handle
# is closed once the response has been parsed, even if parsing fails.
search_handle = Entrez.esearch(db='pmc', term='biopython', retmax=100)
try:
    record = Entrez.read(search_handle)
finally:
    search_handle.close()

# Show the parsed result: its type, its keys, then every key/value pair.
print(type(record))
print(record.keys())
for key, value in record.items():
    print(key, ':', value)

# Keep the returned ID list under its own name and show it.
biopythonID = record['IdList']
print(biopythonID)

<class 'Bio.Entrez.Parser.DictionaryElement'>
dict_keys(['Count', 'RetMax', 'RetStart', 'IdList', 'TranslationSet', 'TranslationStack', 'QueryTranslation'])
Count : 2894
RetMax : 100
RetStart : 0
IdList : ['10256225', '10254443', '10254083', '10250245', '10250203', '10241889', '10241417', '10238937', '10235069', '10232014', '10227427', '10227399', '10224666', '10221840', '10219788', '10219531', '10215675', '10251718', '10249658', '10246253', '10243083', '10234545', '10227362', '10221105', '10216779', '10204868', '10204111', '10203616', '10201985', '10199479', '10198650', '10198438', '10187174', '10185341', '10169335', '10169329', '10168271', '10159843', '10158992', '10153705', '10153118', '10148868', '10148629', '10145048', '10143361', '10134851', '10131221', '10123101', '10120676', '10105235', '10104372', '10104344', '10100218', '10083194', '10072491', '10070815', '10070360', '10070209', '10066034', '10060750', '10055274', '10055226', '10053088', '10049856', '10049292', '10028922', '1

In [None]:
# Journals Lists
# Summarise the first ten IDs with a single ESummary request (IDs joined by
# commas) instead of one request per ID.
first_ids = biopythonID[:10]
if first_ids:
    summary_handle = Entrez.esummary(db='pmc', id=','.join(first_ids))
    try:
        summary = Entrez.read(summary_handle)
    finally:
        summary_handle.close()

    for doc in summary:
        # Use .get so a summary without one of these fields (e.g. no DOI)
        # prints an empty string instead of raising KeyError.
        print(doc.get('Title', ''), '\t',
              doc.get('FullJournalName', ''), '\t', doc.get('DOI', ''))


CRISPR-Analytics (CRISPR-A): A platform for precise analytics and simulations for gene editing 	 PLOS Computational Biology 	 10.1371/journal.pcbi.1011137
Insights into the Structural Conformations of the Tau Protein in Different Aggregation Status 	 Molecules 	 10.3390/molecules28114544
Pathogenicity and Genomic Characterization of a Novel Genospecies, Bacillus shihchuchen, of the Bacillus cereus Group Isolated from Chinese Softshell Turtle (Pelodiscus sinensis) 	 International Journal of Molecular Sciences 	 10.3390/ijms24119636
GeoBind: segmentation of nucleic acid binding interface on protein surface with geometric deep learning 	 Nucleic Acids Research 	 10.1093/nar/gkad288
Core defense hotspots within Pseudomonas aeruginosa are a consistent and rich source of anti-phage defense systems 	 Nucleic Acids Research 	 10.1093/nar/gkad317
Genome-scale CRISPR screen reveals neddylation to contribute to cisplatin resistance of testicular germ cell tumours 	 British Journal of Cancer 	 10.

# Exporting Genomic Data from NCBI

In [None]:
# Exporting Genomic Data from NCBI

# GPI = glucose phosphate isomerase protein family (Protein Coding)
# The `term` string uses the same syntax as the NCBI advanced search.
search_handle = Entrez.esearch(db='nucleotide',
                               term='GPI[Gene Name] AND Homo sapiens [Organism] AND refSeq[Keyword]',
                               retmax=100, idtype='acc')
try:
    record = Entrez.read(search_handle)
finally:
    search_handle.close()
print(record)

# Fetching mRNA only
# NM_ means reference mRNA
# NC_ means Genomic Data
#
# Each NM_ accession is fetched exactly once. Only the first line of the FASTA
# response (the '>' header) is read; it is printed and also collected in
# fetchList for the summary below. Every handle is closed after use.
fetchList = []
for ID in record['IdList']:
    if ID.startswith('NM_'):
        fetch = Entrez.efetch(db='nucleotide',
                              id=ID,
                              rettype='fasta',
                              retmode='text')
        try:
            readFetch = fetch.readline()
        finally:
            fetch.close()
        print(readFetch)
        fetchList.append(readFetch)
print(record)

# Summarize!
counter = len(fetchList)  # number of NM_ header lines collected
print(fetchList)
print(len(fetchList))

# Write all header lines in one go. Mode 'w' replaces any previous content, so
# re-running the cell does not append duplicate lines to GPI.fasta.
with open('GPI.fasta', 'w') as savedFile:
    savedFile.writelines(fetchList)

{'Count': '23', 'RetMax': '23', 'RetStart': '0', 'IdList': ['XM_054351370.1', 'XM_054351369.1', 'XM_054320582.1', 'XM_054333257.1', 'XM_047416582.1', 'XM_005268348.2', 'XM_011526754.4', 'NC_060929.1', 'NC_060943.1', 'NW_025791809.1', 'XM_006714747.2', 'NC_000005.10', 'NC_000019.10', 'NT_187619.1', 'NG_012838.3', 'NM_001289790.3', 'NM_001329911.2', 'NM_005471.5', 'NM_000175.5', 'NM_001289789.1', 'NM_001329910.1', 'NM_001329909.1', 'NM_001184722.1'], 'TranslationSet': [{'From': 'Homo sapiens[Organism]', 'To': '"Homo sapiens"[Organism]'}], 'TranslationStack': [{'Term': 'GPI[Gene Name]', 'Field': 'Gene Name', 'Count': '48823', 'Explode': 'N'}, {'Term': '"Homo sapiens"[Organism]', 'Field': 'Organism', 'Count': '28450859', 'Explode': 'Y'}, 'AND', {'Term': 'refSeq[Keyword]', 'Field': 'Keyword', 'Count': '97020864', 'Explode': 'N'}, 'AND'], 'QueryTranslation': 'GPI[Gene Name] AND "Homo sapiens"[Organism] AND refSeq[Keyword]'}
>NM_001289790.3 Homo sapiens glucose-6-phosphate isomerase (GPI), tr

# Record all the data

In [None]:
# This cell builds a record for a single accession: NM_001289790.3

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

sequence = "GCTTGCTGCGCGCTGCCGGCGCTCCTTCCTCCTCGGCTCGCGTCTCACTCAGTGTACCTTCTAGTCCCGC"\
           "CATGGCCGCTCTCACCCGGGACCCCCAGTTCCAGAAGCTGCAGCAATGGTACCGCGAGCACCGCTCCGAG"\
           "CTGAACCTGCGCCGCCTCTTCGATGCCAACAAGGACCGCTTCAACCACTTCAGCTTGACCCTCAACACCA"\
           "ACCATGGGCATATCCTGGTGGATTACTCCAAGAACCTGGTGACGGAGGACGTGATGCGGATGCTGGTGGA"\
           "CTTGGCCAAGTCCAGGGGCGTGGAGGCCGCCCGGGAGCGGATGTTCAATGGTGAGAAGATCAACTACACC"\
           "GAGGGTCGAGCCGTGCTGCACGTGGCTCTGCGGAACCGGTCAAACACACCCATCCTGGTAGACGGCAAGG"\
           "ATGTGATGCCAGAGGTCAACAAGGTTCTGGACAAGATGAAGTCTTTCTGCCAGGGACCCCTCATGGTGAC"\
           "TGAAGCCCTTAAGCCATACTCTTCAGGAGGTCCCCGCGTCTGGTATGTCTCCAACATTGATGGAACTCAC"\
           "ATTGCCAAAACCCTGGCCCAGCTGAACCCCGAGTCCTCCCTGTTCATCATTGCCTCCAAGACCTTTACTA"\
           "CCCAGGAGACCATCACGAATGCAGAGACGGCGAAGGAGTGGTTTCTCCAGGCGGCCAAGGATCCTTCTGC"\
           "AGTGGCGAAGCACTTTGTTGCCCTGTCTACTAACACAACCAAAGTGAAGGAGTTTGGAATTGACCCTCAA"\
           "AACATGTTCGAGTTCTGGGATTGGGTGGGAGGACGCTACTCGCTGTGGTCGGCCATCGGACTCTCCATTG"\
           "CCCTGCACGTGGGTTTTGACAACTTCGAGCAGCTGCTCTCGGGGGCTCACTGGATGGACCAGCACTTCCG"\
           "CACGACGCCCCTGGAGAAGAACGCCCCCGTCTTGCTGGCCCTGCTGGGTATCTGGTACATCAACTGCTTT"\
           "GGGTGTGAGACACACGCCATGCTGCCCTATGACCAGTACCTGCACCGCTTTGCTGCGTACTTCCAGCAGG"\
           "GCGACATGGAGTCCAATGGGAAATACATCACCAAATCTGGAACCCGTGTGGACCACCAGACAGGCCCCAT"\
           "TGTGTGGGGGGAGCCAGGGACCAATGGCCAGCATGCTTTTTACCAGCTCATCCACCAAGGCACCAAGATG"\
           "ATACCCTGTGACTTCCTCATCCCGGTCCAGACCCAGCACCCCATACGGAAGGGTCTGCATCACAAGATCC"\
           "TCCTGGCCAACTTCTTGGCCCAGACAGAGGCCCTGATGAGGGGAAAATCGACGGAGGAGGCCCGAAAGGA"\
           "GCTCCAGGCTGCGGGCAAGAGTCCAGAGGACCTTGAGAGGCTGCTGCCACATAAGGTCTTTGAAGGAAAT"\
           "CGCCCAACCAACTCTATTGTGTTCACCAAGCTCACACCATTCATGCTTGGAGCCTTGGTCGCCATGTATG"\
           "AGCACAAGATCTTCGTTCAGGGCATCATCTGGGACATCAACAGCTTTGACCAGTGGGGAGTGGAGCTGGG"\
           "AAAGCAGCTGGCTAAGAAAATAGAGCCTGAGCTTGATGGCAGTGCTCAAGTGACCTCTCACGACGCTTCT"\
           "ACCAATGGGCTCATCAACTTCATCAAGCAGCAGCGCGAGGCCAGAGTCCAATAAACTCGTGCTCATCTGC"\
           "AGCCTCCTCTGTGACTCCCCTTTCTCTTCTCGTCCCTCCTCCCCGGAGCCGGCACTGCATGTTCCTGGAC"\
           "ACCACCCAGAGCACCCTCTGGTTGTGGGCTTGGACCACGAGCCCTTAGCAGGGAAGGCTGGTCTCCCCCA"\
           "GCCTAACCCCCAGCCCCTCCATGTCTATGCTCCCTCTGTGTTAGAATTGGCTGAAGTGTTTTTGTGCAGC"\
           "TGACTTTTCTGACCCATGTTCACGTTGTTCACATCCCATGTAGAAAAATAAAGATGCCACGGAGGAGGTT"\
           "GTAGGCTCAGCCTCTGATTTTTTTTTTCCTGTGATGGTGCTTTATGTAGCAGAGGGCAGGAGCGCTCAGC"\
           "AGGACGCAGGCTGTGCCTCTGCGGACACTTAACACTAAGTGGTGAGCGGGTCTAGAGTGGAGCAAGGTGC"\
           "CCTGAGAAGACAATAGTGGGGTGGGGGCACAATCAGTCAGGACGGCAACTTGGCCTGTGTCACCAAATCC"\
           "CAAGACTGTTTTCCACTCCTCACCTCTGTGACTGCAGAAATTGGATACTCTGTTCACTCGATGGTTCTAA"\
           "AAACTGCATTGAGATTATGTTTGTTTCGGGTGAATTCCTGGACAAGACCGAGGATGACTGCCATCTCCTG"\
           "GCAAGACGCTCAGGTAGTTCTTTTGCTTTAAAAGGCAGATATTGAAAACTGGAATTTTTTTTTTTTGAGT"\
           "CTCGCTCTGTCACCCAGACTGGAGTGCAGTGGTGCAATCTCGGCTCACTGCAACCTCCGCCTCCCGGGTT"\
           "CAAGCTATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGATTACACGGCGCACACCACCATACCCAGCTAA"\
           "TTTTTGTATTTTTAGTAGTGAAGGGGTTTTACCATGTTGGGCAGGCTGGTCTTGAACTCCTGACCTCAGG"\
           "TGATCTGCCCGCCTCAGCCTCCCACAGTGCTGGGATTACAGGTATGAGCCACCACGCCCGGCCCATTTTT"\
           "TTTTTTTTTTTGACAACTTTTTTTTTTTTTTGAGACAGGGTCTTGTTCCATTGCCCAGACTGGAGTGCAG"\
           "TGGCATGATCACAGCTCACTGCAGCCAGTAATCCTCTTGCCTCAGCCTCCCAAGTAGTTGAGACTACAGG"\
           "TTGTACCACTATGCCCTGCTAGTTTTTTCATTTTTTGTAGAGAGACGGGTCTTTTTTTTTTTTGAGACGG"\
           "AGTCTCGCTCTGTCGCCCAAGCTGGAGTGCAGTAGCACGGTCTCAGCTCATTGCAAGCTCCGCCTCCCAG"\
           "GTTCACGCCATTCTCCTGCCTCAGACTCCTGTGTAGCTGGGAGTACAGGCACCTGCCACCATGCCCGGCT"\
           "AATTTTTTATATATTTTTTTAGCAGAGACAGTGTCTCACTGTGTTAGTCAGGATGGTCTCGATCTCCTGA"\
           "CCTCGTGATCCGCCCGCCTCAGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCGCCCAGCAAG"\
           "GCGGGTCTTGCTGTGTTTCCCAGACTAGACTGGTCTTGAATTCCAGGGCTCAAGAGATCTCCCACCTCAG"\
           "CCTCCCACAGTGCTGGGATTACAGGCGTGAGCCGCCACACCCAGCCTATTCAAAATTTTTTTTTCTTAGA"\
           "GACAGGGTCTTTGTTGCCCAGGCTGGACTGCAGTGATACAATCATAGCTGACTGAAGCCTCAAATTCCCA"\
           "GGCTAAGGTGATCTTCTCACCTCAGCCTTCCAAGTAGCTGGGTCCGCAGATGCATGCCAGTACACCCAGC"\
           "TCATTTAAAAAAAAATTTTTTTCTTTTTTGAGAGTCTTGCTTTGTTGCCCAGGCTGGAGTGCAGTGGTGT"\
           "GATCTCGGCTCACTGCAAGCTCCACCTCCCGGCTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCT"\
           "GGGACTACAGGTGCCCGCCACCACACCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGT"\
           "GTTAGCCAGGATGGTCTTGATCTCCTGACCTCATGATCCGCCTGCTTTGGCCTCCCAAAGTGCTGGGATT"\
           "ACAGGCGTGAGTCACCGCGCCCGGCTCATTTTAAAATTTTTGTAGATCAGTGTACTGTTGTAAAAAAAAA"\
           "AAAATAAGAAAAATAAAAAATAAATTTTTGTAGTGATAGGATCCCACTGAGGCCAGAGAATAGGGTCTGG"\
           "AGACAAAGGAGCATTCACTTCAGCCTCTGACTGGTGGCAGGCCAAGTCTTTATTTACATAGGGTGTAACC"\
           "AAATAGGAAACCTCTAAAGGGTACTTAAACCCCAGATTTTCTACACAGGGCACTTGCTTGAGCCTCATCC"\
           "CGCTTTCTGGAATGTACTTTTGCTTCAATAAATCTGTGCTTTTGTTCCTTC"\

# Wrap the raw sequence string in a SeqRecord carrying its accession and title.
record = SeqRecord(
    Seq(sequence),
    id="NM_001289790.3",
    description="Homo sapiens glucose-6-phosphate isomerase (GPI), transcript variant 4, mRNA",
)

# The GenBank writer needs a molecule type annotation; "DNA" matches the value
# this notebook uses when it exports GenBank elsewhere.
record.annotations["molecule_type"] = "DNA"

# A later cell reads "sequence.gb", so write the GenBank file here as well.
SeqIO.write(record, "sequence.gb", "genbank")

# The FASTA write stays last so the cell still displays its return value
# (the number of records written).
SeqIO.write(record, "sequence.fasta", "fasta")

1

# Protein EDA

In [None]:
from Bio import SeqIO

# Load and print a FASTA file
fasta_path = "sequence.fasta"
fasta_format = "fasta"

# First pass: the identifier and the description of each record, one per line.
for record in SeqIO.parse(fasta_path, fasta_format):
    print(record.id, record.description, sep="\n")

# Second pass: the full printed form of each record.
for record in SeqIO.parse(fasta_path, fasta_format):
    print(record)

# Load the file as one record and keep its sequence; the bare name on the last
# line makes the notebook display the sequence.
dna_record = SeqIO.read(fasta_path, fasta_format)
dna_seq = dna_record.seq
dna_seq


NM_001289790.3
NM_001289790.3 Homo sapiens glucose-6-phosphate isomerase (GPI), transcript variant 4, mRNA
ID: NM_001289790.3
Name: NM_001289790.3
Description: NM_001289790.3 Homo sapiens glucose-6-phosphate isomerase (GPI), transcript variant 4, mRNA
Number of features: 0
Seq('GCTTGCTGCGCGCTGCCGGCGCTCCTTCCTCCTCGGCTCGCGTCTCACTCAGTG...TTC')


Seq('GCTTGCTGCGCGCTGCCGGCGCTCCTTCCTCCTCGGCTCGCGTCTCACTCAGTG...TTC')

In [None]:
# Read and Load a GenBank File format
import os

# "sequence.gb" only exists if it was written beforehand. Check for it first so
# the cell reports the problem instead of stopping with FileNotFoundError.
if os.path.exists("sequence.gb"):
    # Print every record found in the file.
    for record in SeqIO.parse("sequence.gb", "genbank"):
        print(record)

    # Load the file again as a single record ("gb" is the short format name).
    gb_dna_record = SeqIO.read("sequence.gb", "gb")
else:
    gb_dna_record = None
    print("sequence.gb not found - write the record in GenBank format before running this cell")

FileNotFoundError: ignored

In [None]:
# Writing Exported Data into FASTA
from Bio import SeqIO

# Read sequence.fasta and collect its records in a dictionary.
fasta_records = SeqIO.parse('sequence.fasta', 'fasta')
record_dict = SeqIO.to_dict(fasta_records)

# Write every collected record to a new FASTA file.
with open('output_sequence_fasta.fasta', 'w') as handle:
    SeqIO.write(list(record_dict.values()), handle, 'fasta')

In [None]:
# Write exported Data into GenBank
from Bio import SeqIO

# Read sequence.fasta and collect its records in a dictionary.
record_dict = SeqIO.to_dict(SeqIO.parse('sequence.fasta', 'fasta'))
genbank_records = list(record_dict.values())

# Tag every record with a molecule type before it is written as GenBank.
for record in genbank_records:
    record.annotations.update(molecule_type="DNA")

# Write the tagged records to a GenBank file.
with open('output_sequence_genbank.gb', 'w') as handle:
    SeqIO.write(genbank_records, handle, 'gb')

In [None]:
from Bio.Seq import Seq
from Bio import SeqIO

# Load the single record from sequence.fasta and take its sequence.
seq_test = SeqIO.read("sequence.fasta", "fasta")
dna_seq = seq_test.seq

# Transcription, then translation
# DNA -> mRNA -> amino acids
# NOTE(review): translation starts at the first base of the whole sequence;
# confirm this is the intended reading frame.
protein_test = dna_seq.transcribe().translate()

# Split the translated sequence at every "*" and turn each piece into a plain
# string; the bare name on the last line displays the resulting list.
protein_test_clean = [str(fragment) for fragment in protein_test.split("*")]
protein_test_clean

['ACCALPALLPPRLASHSVYLLVPPWPLSPGTPSSRSCSNGTASTAPS',
 'TCAASSMPTRTASTTSA',
 'PSTPTMGISWWITPRTW',
 'RRT',
 'CGCWWTWPSPGAWRPPGSGCSMVRRSTTPRVEPCCTWLCGTGQTHPSW',
 'TARM',
 'CQRSTRFWTR',
 'SLSARDPSW',
 'LKPLSHTLQEVPASGMSPTLMELTLPKPWPS',
 'TPSPPCSSLPPRPLLPRRPSRMQRRRRSGFSRRPRILLQWRSTLLPCLLTQPK',
 'RSLELTLKTCSSSGIGWEDATRCGRPSDSPLPCTWVLTTSSSCSRGLTGWTSTSARRPWRRTPPSCWPCWVSGTSTALGVRHTPCCPMTSTCTALLRTSSRATWSPMGNTSPNLEPVWTTRQAPLCGGSQGPMASMLFTSSSTKAPR',
 'YPVTSSSRSRPSTPYGRVCITRSSWPTSWPRQRP',
 '',
 'GENRRRRPERSSRLRARVQRTLRGCCHIRSLKEIAQPTLLCSPSSHHSCLEPWSPCMSTRSSFRASSGTSTALTSGEWSWESSWLRK',
 'SLSLMAVLK',
 'PLTTLLPMGSSTSSSSSARPESNKLVLICSLLCDSPFSSRPSSPEPALHVPGHHPEHPLVVGLDHEPLAGKAGLPQPNPQPLHVYAPSVLELAEVFLCS',
 'LF',
 'PMFTLFTSHVEK',
 'RCHGGGCRLSL',
 'FFFSCDGALCSRGQERSAGRRLCLCGHLTLSGERV',
 'SGARCPEKTIVGWGHNQSGRQLGLCHQIPRLFSTPHLCDCRNWILCSLDGSKNCIEIMFVSGEFLDKTEDDCHLLARRSGSSFALKGRY',
 'KLEFFFFESRSVTQTGVQWCNLGSLQPPPPGFKLFSCLSLPSSWDYTAHTTIPS',
 'FLYF',
 '',
 '',
 'RGFTMLGRLVLNS',
 'PQVICPPQPPTVLGLQV',
 'ATTPGPFFFFF

In [None]:
# Install pandas with the %pip magic before the cell that imports it.
%pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Data Frame (OPTIONAL BUT IMPORTANT FOR DATA SCIENCE)
import pandas as pd

# One row per fragment in protein_test_clean, plus the length of each fragment.
df = pd.DataFrame({"amino_acids": protein_test_clean})
df = df.assign(count=df["amino_acids"].str.len())

# Plain-text preview of the first rows.
print(df.head())

# The ten longest fragments: printed as text, then shown as the cell's output.
longest_fragments = df.nlargest(10, 'count')
print(longest_fragments)

longest_fragments

                                        amino_acids  count
0   ACCALPALLPPRLASHSVYLLVPPWPLSPGTPSSRSCSNGTASTAPS     47
1                                 TCAASSMPTRTASTTSA     17
2                                 PSTPTMGISWWITPRTW     17
3                                               RRT      3
4  CGCWWTWPSPGAWRPPGSGCSMVRRSTTPRVEPCCTWLCGTGQTHPSW     48
                                          amino_acids  count
10  RSLELTLKTCSSSGIGWEDATRCGRPSDSPLPCTWVLTTSSSCSRG...    147
15  PLTTLLPMGSSTSSSSSARPESNKLVLICSLLCDSPFSSRPSSPEP...     99
20  SGARCPEKTIVGWGHNQSGRQLGLCHQIPRLFSTPHLCDCRNWILC...     89
13  GENRRRRPERSSRLRARVQRTLRGCCHIRSLKEIAQPTLLCSPSSH...     87
38  KKIFFFFESLALLPRLECSGVISAHCKLHLPASRHSPASASRVAGT...     68
21  KLEFFFFESRSVTQTGVQWCNLGSLQPPPPGFKLFSCLSLPSSWDY...     54
9   TPSPPCSSLPPRPLLPRRPSRMQRRRRSGFSRRPRILLQWRSTLLP...     53
35  IPGLKRSPTSASHSAGITGVSRHTQPIQNFFFLETGSLLPRLDCSD...     51
4    CGCWWTWPSPGAWRPPGSGCSMVRRSTTPRVEPCCTWLCGTGQTHPSW     48
0     ACCALPALLPPRLASHSVYLLVPPWPLSPG

Unnamed: 0,amino_acids,count
10,RSLELTLKTCSSSGIGWEDATRCGRPSDSPLPCTWVLTTSSSCSRG...,147
15,PLTTLLPMGSSTSSSSSARPESNKLVLICSLLCDSPFSSRPSSPEP...,99
20,SGARCPEKTIVGWGHNQSGRQLGLCHQIPRLFSTPHLCDCRNWILC...,89
13,GENRRRRPERSSRLRARVQRTLRGCCHIRSLKEIAQPTLLCSPSSH...,87
38,KKIFFFFESLALLPRLECSGVISAHCKLHLPASRHSPASASRVAGT...,68
21,KLEFFFFESRSVTQTGVQWCNLGSLQPPPPGFKLFSCLSLPSSWDY...,54
9,TPSPPCSSLPPRPLLPRRPSRMQRRRRSGFSRRPRILLQWRSTLLP...,53
35,IPGLKRSPTSASHSAGITGVSRHTQPIQNFFFLETGSLLPRLDCSD...,51
4,CGCWWTWPSPGAWRPPGSGCSMVRRSTTPRVEPCCTWLCGTGQTHPSW,48
0,ACCALPALLPPRLASHSVYLLVPPWPLSPGTPSSRSCSNGTASTAPS,47
