# First data preprocessing and cleaning

In [2]:
# Preprocessing notebook: load the raw data file and handle it properly.
# Data that is not needed for now gets pruned, which shrinks the
# data file from 600mb to 170mb.
# Start by listing what is in the working directory.
println(readdir(pwd()))

[".ipynb_checkpoints", "get_angles_from_coords_py.ipynb", "julia_preprocessing_full_coords_under_200.ipynb"]


In [3]:
# Open the raw training file for reading; the handle is consumed by the
# `readlines(f)` call in the following cell.
f = open("../data/training_30.txt", "r")

IOStream(<file ../data/training_30.txt>)

In [4]:
# Read every line of the raw file into memory, then release the handle.
# Nothing later in this notebook reads from `f`, and leaving it open would
# leak the file descriptor for the rest of the session. The `try` block's
# value is still the vector of lines, so the cell output is unchanged.
lines = try
    readlines(f)
finally
    close(f)
end

340989-element Array{String,1}:
 "[ID]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [5]:
"""
    coords_split(lister, splice)

Parse delimited numeric text into rows of `Float64`.

Each element of `lister` is split on `splice`, and every resulting fragment
is parsed as a `Float64`.

# Arguments
- `lister`: iterable of strings, one per row.
- `splice`: delimiter handed to `split` (a tab in this notebook).

# Returns
A `Vector{Vector{Float64}}` holding one inner vector per element of
`lister`; empty (but still typed) when `lister` is empty.

# Throws
- `ArgumentError` (raised by `parse`) when a fragment is not a valid number.
"""
function coords_split(lister, splice)
    # A typed comprehension replaces pushing into an untyped `[]`, so the
    # result has a concrete element type even for empty input.
    return Vector{Float64}[parse.(Float64, split(c, splice)) for c in lister]
end

coords_split (generic function with 1 method)

In [6]:
# Collect the leading protein records of the raw file into four lists.
# They are filled in file order (presumably one entry per protein in each
# list - this assumes every record carries all four tags).
names = []   # line following each "[ID]" tag
seqs = []    # line following each "[PRIMARY]" tag
coords = []  # three parsed rows following each "[TERTIARY]" tag
pssms = []   # 21 parsed rows following each "[EVOLUTIONARY]" tag

# Walk the file line by line; a tag line announces the data right after it.
for (idx, line) in enumerate(lines)
    # Stop as soon as 995 coordinate records have been gathered
    length(coords) == 995 && break

    if line == "[ID]"
        push!(names, lines[idx + 1])
    elseif line == "[PRIMARY]"
        push!(seqs, lines[idx + 1])
    elseif line == "[TERTIARY]"
        push!(coords, coords_split(lines[(idx + 1):(idx + 3)], "\t"))
    elseif line == "[EVOLUTIONARY]"
        push!(pssms, coords_split(lines[(idx + 1):(idx + 21)], "\t"))
        # Report progress every 50 recorded names
        if rem(length(names), 50) == 0
            println("Currently @ ", length(names), " out of n")
        end
    end
end

Currently @ 50 out of n
Currently @ 100 out of n
Currently @ 150 out of n
Currently @ 200 out of n
Currently @ 250 out of n
Currently @ 300 out of n
Currently @ 350 out of n
Currently @ 400 out of n
Currently @ 450 out of n
Currently @ 500 out of n
Currently @ 550 out of n
Currently @ 600 out of n
Currently @ 650 out of n
Currently @ 700 out of n
Currently @ 750 out of n
Currently @ 800 out of n
Currently @ 850 out of n
Currently @ 900 out of n
Currently @ 950 out of n


In [7]:
# Square root of the sum of squared components of `vector`.
# `using LinearAlgebra` and its built-in norm() could be used instead;
# this version is written out by hand as a Julia learning exercise.
function norm(vector)
    squares = [x * x for x in vector]
    total = sum(squares)
    return sqrt(total)
end

norm (generic function with 1 method)

In [8]:
println("Total number of proteins: ", length(seqs))

# Length cutoff: keep proteins whose sequence is strictly shorter than `n`.
n = 200
# Indices into `seqs` of the selected proteins. The filter compares against
# `n` instead of a second hard-coded 200, so the cutoff and the message
# printed below cannot drift apart.
under = findall(s -> length(s) < n, seqs)

println("Number of proteins under ", n, " : ", length(under))

Total number of proteins: 995
Number of proteins under 200 : 636


In [9]:
# Distances between pairs of AAs - only for the proteins listed in `under`.
# NOTE: dists[p] belongs to protein under[p], not to protein p.
dists = []
for k in under
    xs, ys, zs = coords[k][1], coords[k][2], coords[k][3]

    # Only the C-alpha carbons are used (1/3 of the coordinate columns).
    # Julia arrays start at 1, so those are columns 2, 5, 8, ...
    # (the same columns the test `i % 3 == 2` selects).
    ca = 2:3:length(xs)

    # Distance between the C-alpha atoms stored in columns i and j
    separation(i, j) = norm([xs[i] - xs[j], ys[i] - ys[j], zs[i] - zs[j]])

    # dist[a][b]: distance from the a-th to the b-th selected atom
    dist = [[separation(i, j) for j in ca] for i in ca]
    push!(dists, dist)

    # Progress control - report against the real total instead of a stale
    # hard-coded "500"
    if length(dists) % 50 == 0
        println("Dists Currently @ ", length(dists), " out of ", length(under))
    end
end

Dists Currently @ 50 out of n (500)
Dists Currently @ 100 out of n (500)
Dists Currently @ 150 out of n (500)
Dists Currently @ 200 out of n (500)
Dists Currently @ 250 out of n (500)
Dists Currently @ 300 out of n (500)
Dists Currently @ 350 out of n (500)
Dists Currently @ 400 out of n (500)
Dists Currently @ 450 out of n (500)
Dists Currently @ 500 out of n (500)
Dists Currently @ 550 out of n (500)
Dists Currently @ 600 out of n (500)


In [16]:
# Check everything's alright by printing a sample of one protein.
n = 2  # index of the protein in names / seqs / coords
println("id: ", names[n])
println("seq: ", seqs[n])
println("sample coord: ", coords[n][1][1])
# `dists` holds one entry per element of `under`, so it has to be indexed by
# the protein's position inside `under`, not by the protein index itself.
# Indexing it with `n` directly could print a distance from another protein.
let pos = findfirst(isequal(n), under)
    if pos === nothing
        println("sample dist: protein ", n, " is not in `under`, no distances computed")
    else
        println("sample dist: ", dists[pos][1][5])
    end
end

id: 2EUL_d2euld1
seq: MAREVKLTKAGYERLMQQLERERERLQEATKILQELMESSDDYDDSGLEAAKQEKARIEARIDSLEDILSRAVILEE
sample coord: 981.8
sample dist: 2674.450579090965


In [17]:
# Data is OK. Save it to a file.
using DelimitedFiles
# Mode "a+" appends: new records go after whatever the file already holds.
open("../data/full_under_200.txt", "a+") do f
    # `dists` is parallel to `under`: `pos` indexes `dists`, while `k`
    # indexes names / seqs / pssms / coords.
    for (pos, k) in enumerate(under)
        # Check that saved proteins are less than 200 AAs. This runs before
        # anything is written so a failing protein cannot leave a
        # half-written record behind in the output file.
        if length(dists[pos][1]) >= 200
            print("error when checking protein in dists n: ", pos, " length: ", length(dists[pos][1]))
            break
        end
        # ID
        write(f, "\n[ID]\n")
        write(f, names[k])
        # Seq
        write(f, "\n[PRIMARY]\n")
        write(f, seqs[k])
        # PSSMS - one tab-delimited line per row
        write(f, "\n[EVOLUTIONARY]\n")
        writedlm(f, pssms[k])
        # Coords - one tab-delimited line per row
        write(f, "\n[TERTIARY]\n")
        writedlm(f, coords[k])
        # Dists - one tab-delimited line per row of the distance table
        write(f, "\n[DIST]\n")
        writedlm(f, dists[pos])
    end
end

# Done!