In [1]:
def calculate_aminoacid_frequencies(fasta_filename, subsequences_filename, number_of_repetitions, output_filename):
    """Report, for each subsequence, the fraction of proteins that contain it.

    A protein "contains" a subsequence when ``str.count`` finds it at least
    ``number_of_repetitions`` times (``str.count`` counts non-overlapping
    occurrences).

    Args:
        fasta_filename: Path to a FASTA file; each record starts with ``>``
            and the lines after the header line are joined into one sequence.
        subsequences_filename: Path to a text file with one subsequence per
            line. Blank lines and repeated entries are ignored.
        number_of_repetitions: Minimum number of occurrences of a subsequence
            within a protein for that protein to be counted.
        output_filename: Path of the report to write. It holds three ``#``
            header lines followed by ``<subsequence> <count> <proportion>``
            rows, sorted by count in descending order.

    Returns:
        None. The result is written to ``output_filename``.
    """
    # Read subsequences. Blank lines are skipped: an empty string "occurs" in
    # every protein (``"ABC".count("")`` is 4), which would pollute the report.
    with open(subsequences_filename, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        subsequences = [subseq for subseq in stripped_lines if subseq]
    print("Subsequences read:", subsequences)  # Debugging

    # Read the FASTA file: drop the text before the first '>', then for each
    # record discard the header line and concatenate the sequence lines.
    with open(fasta_filename, 'r') as fd:
        fasta_contents = fd.read().split('>')[1:]
    proteins = [
        "".join(line.strip() for line in record.split("\n")[1:])
        for record in fasta_contents
    ]
    print("Number of proteins read:", len(proteins))  # Debugging

    # One counter per distinct subsequence; the dict keeps first-seen order.
    subseqdic = {subseq: 0 for subseq in subsequences}

    # Iterate over the dict keys rather than the raw list so that a
    # subsequence listed twice is not counted twice for the same protein.
    for protein in proteins:
        for subseq in subseqdic:
            if protein.count(subseq) >= number_of_repetitions:
                subseqdic[subseq] += 1

    # sorted() is stable, so ties keep the order of the subsequences file.
    sorted_subseq = sorted(subseqdic.items(), key=lambda x: x[1], reverse=True)
    total_proteins = len(proteins)
    with open(output_filename, 'w') as out_file:
        out_file.write(f"#Number of proteins: {total_proteins}\n")
        out_file.write(f"#Number of subsequences: {len(subseqdic)}\n")
        out_file.write("#Subsequence proportions:\n")
        for subseq, count in sorted_subseq:
            # An empty FASTA file would otherwise raise ZeroDivisionError.
            proportion = count / total_proteins if total_proteins else 0.0
            out_file.write(f"{subseq} {count} {proportion:.4f}\n")

# Demo run: a protein counts for a fragment when the fragment appears at
# least once in it; the report is written to output.txt.
calculate_aminoacid_frequencies(
    fasta_filename='example_fasta_file.fa',
    subsequences_filename='sequence_fragments.txt',
    number_of_repetitions=1,
    output_filename='output.txt',
)


Subsequences read: ['AAVP', 'AC', 'ACV', 'AEAV', 'AF', 'AFV', 'AINP', 'ALSV', 'CGTG', 'CITP', 'CT', 'CTD', 'CVDW', 'DA', 'DAC', 'DF', 'DFY', 'DG', 'DGI', 'DKED', 'DKPD', 'DKTI', 'DP', 'DPS', 'DRYF', 'DS', 'DSN', 'DSSD', 'DTDC', 'EDPT', 'EEYY', 'EG', 'EGV', 'EIEL', 'EL', 'ELI', 'EMMI', 'EPSE', 'ETAQ', 'EY', 'EYD', 'FALL', 'FEPK', 'FG', 'FGK', 'FI', 'FIS', 'FQ', 'FQD', 'FQRE', 'FQS', 'FR', 'FRI', 'FVAG', 'FVNE', 'GA', 'GAL', 'GAT', 'GEAC', 'GGSD', 'GI', 'GIK', 'GP', 'GPV', 'GS', 'GSN', 'GT', 'GTI', 'GTQ', 'HKTA', 'HL', 'HLR', 'HSGV', 'IF', 'IFN', 'IGAT', 'IKPL', 'IL', 'ILL', 'IP', 'IPE', 'IPIV', 'IQEM', 'IQLR', 'IS', 'ISGP', 'ISK', 'IV', 'IVL', 'IVMD', 'KAVE', 'KAYN', 'KD', 'KDE', 'KF', 'KFR', 'KFS', 'KIYT', 'KM', 'KML', 'KN', 'KNIT', 'KNT', 'KR', 'KRE', 'KT', 'KTE', 'KTLY', 'KVYD', 'KYDT', 'LA', 'LAA', 'LFCV', 'LHKR', 'LIVG', 'LNEV', 'LP', 'LPK', 'LSED', 'LT', 'LTD', 'LTHL', 'LV', 'LVE', 'LVKD', 'LY', 'LYD', 'LYER', 'LYP', 'MD', 'MDK', 'MQRL', 'MT', 'MTS', 'MV', 'MVS', 'MYFL', 'NA', 'NA