Working with Tabular Data
# A common format for tabular data is the CSV (comma-separated values) file

In [2]:
import numpy
# Show the built-in documentation for numpy.genfromtxt, the function used
# below to read the CSV file into an array.
help(numpy.genfromtxt)

Help on function genfromtxt in module numpy:

genfromtxt(fname, dtype=<class 'float'>, comments='#', delimiter=None, skip_header=0, skip_footer=0, converters=None, missing_values=None, filling_values=None, usecols=None, names=None, excludelist=None, deletechars=" !#$%&'()*+,-./:;<=>?@[\\]^{|}~", replace_space='_', autostrip=False, case_sensitive=True, defaultfmt='f%i', unpack=None, usemask=False, loose=True, invalid_raise=True, max_rows=None, encoding='bytes', *, ndmin=0, like=None)
    Load data from a text file, with missing values handled as specified.
    
    Each line past the first `skip_header` lines is split at the `delimiter`
    character, and characters following the `comments` character are discarded.
    
    Parameters
    ----------
    fname : file, str, pathlib.Path, list of str, generator
        File, filename, list, or generator to read.  If the filename
        extension is ``.gz`` or ``.bz2``, the file is first decompressed. Note
        that generators must retu

In [3]:
# Load the distance measurements from the CSV file, header row included

import os 

distance_file = os.path.join('data', 'distance_data_headers.csv')

# Every field is read as text (dtype 'unicode') because the first row of the
# file holds column names rather than numbers.
distances = numpy.genfromtxt(
    distance_file,
    dtype='unicode',
    delimiter=',',
)
print(distances)

[['Frame' 'THR4_ATP' 'THR4_ASP' 'TYR6_ATP' 'TYR6_ASP']
 ['1' '8.9542' '5.8024' '11.5478' '9.9557']
 ['2' '8.6181' '6.0942' '13.9594' '11.6945']
 ...
 ['9998' '8.6625' '7.7306' '9.5469' '10.3063']
 ['9999' '9.2456' '7.8886' '9.8151' '10.7564']
 ['10000' '8.8135' '7.917' '9.9517' '10.7848']]


In [4]:
#Now, let us manipulate the data to make it easy to work with

# Row 0 of the array holds the column names.
headers = distances[0]
print(headers)

#genfromtxt returned a 2D array of strings (rows x columns), so indexing
#with a single integer selects one whole row, much like picking one inner
#list out of a list of lists

['Frame' 'THR4_ATP' 'THR4_ASP' 'TYR6_ATP' 'TYR6_ASP']


In [8]:
#Now, let us try slicing the data

# Drop row 0 (the header row) and keep every row after it.
data = distances[1:]
print(data)

# NOTE: the line below is a bare string expression, not a comment. Because it
# is the last expression in the cell, the notebook echoes it as cell output.
"Every list value on from 1 will only include the data"

[['1' '8.9542' '5.8024' '11.5478' '9.9557']
 ['2' '8.6181' '6.0942' '13.9594' '11.6945']
 ['3' '9.0066' '6.0637' '13.0924' '11.3043']
 ...
 ['9998' '8.6625' '7.7306' '9.5469' '10.3063']
 ['9999' '9.2456' '7.8886' '9.8151' '10.7564']
 ['10000' '8.8135' '7.917' '9.9517' '10.7848']]


'Every list value on from 1 will only include the data'

In [10]:
# Convert every entry from its string form to a floating-point number so the
# values can be used in arithmetic.
# NOTE: this rebinds `data`; the name now refers to the float array instead
# of the string array.
data = data.astype(float)
print(data)

#astype(float) gives back an array in which every string value has been
#converted to a float

[[1.00000e+00 8.95420e+00 5.80240e+00 1.15478e+01 9.95570e+00]
 [2.00000e+00 8.61810e+00 6.09420e+00 1.39594e+01 1.16945e+01]
 [3.00000e+00 9.00660e+00 6.06370e+00 1.30924e+01 1.13043e+01]
 ...
 [9.99800e+03 8.66250e+00 7.73060e+00 9.54690e+00 1.03063e+01]
 [9.99900e+03 9.24560e+00 7.88860e+00 9.81510e+00 1.07564e+01]
 [1.00000e+04 8.81350e+00 7.91700e+00 9.95170e+00 1.07848e+01]]


In [11]:
# Single elements are addressed as array[row, column]; show two of them,
# one per line.
print(data[0, 1], data[1, 0], sep='\n')

8.9542
2.0


In [12]:
# Take a 2D slice of the array: the first 10 rows and the first 3 columns.
# The pattern is array_name[rows, columns]. An omitted start means "from
# index 0", and the stop index itself is not included, so this keeps rows
# 0-9 and columns 0-2.
small_data = data[:10, :3]
print(small_data)

[[ 1.      8.9542  5.8024]
 [ 2.      8.6181  6.0942]
 [ 3.      9.0066  6.0637]
 [ 4.      9.2002  6.0227]
 [ 5.      9.1294  5.9365]
 [ 6.      9.0462  6.2553]
 [ 7.      8.8657  5.9186]
 [ 8.      9.3256  6.2351]
 [ 9.      9.4184  6.1993]
 [10.      9.06    6.0478]]


In [13]:
# A lone integer index selects one row together with all of its columns.
print(small_data[5])
# A bare colon keeps every row; 1: keeps every column after the first.
print(small_data[:, 1:])

[6.     9.0462 6.2553]
[[8.9542 5.8024]
 [8.6181 6.0942]
 [9.0066 6.0637]
 [9.2002 6.0227]
 [9.1294 5.9365]
 [9.0462 6.2553]
 [8.8657 5.9186]
 [9.3256 6.2351]
 [9.4184 6.1993]
 [9.06   6.0478]]


In [15]:
# numpy arrays provide built-in reductions (such as mean) for analyzing
# tabular data

# Column 1 holds the THR4_ATP values; the colon keeps every row.
thr4_atp = data[:, 1]

avg_thr4_atp = thr4_atp.mean()
print(avg_thr4_atp)

10.876950930000001


In [18]:
# One column's average is done; to repeat it for every column we first need
# to know how many columns there are

num_columns = data.shape[1]  # length of the second axis = number of columns
print(num_columns)

5


In [19]:
# Report the mean of each measurement column. Column 0 is the Frame column,
# so the loop starts at index 1.
for col_idx in range(1, num_columns):
    col_mean = data[:, col_idx].mean()
    print(F'{headers[col_idx]} : {col_mean}')

THR4_ATP : 10.876950930000001
THR4_ASP : 7.342344959999999
TYR6_ATP : 11.209791329999998
TYR6_ASP : 10.9934435


In [21]:
#Now let us make a geometry analysis project that can be transferred

import numpy
import os

# Read the XYZ file. The first line is parsed as the atom count, the second
# line is skipped, and every remaining line is "<symbol> <x> <y> <z>".
file_location = os.path.join('data', 'water.xyz')
# The with-block closes the file as soon as the lines are read, even if
# reading raises, so no open handle is left behind in the kernel.
with open(file_location, 'r') as xyzfile:
    data = xyzfile.readlines()
num_atoms = int(data[0])
data = data[2:]

symbols = []
coordinates = []

for atom in data:
    atom_data = atom.split()
    if not atom_data:
        # Blank line (for example a trailing newline at the end of the file):
        # there is nothing to parse, and indexing it would raise IndexError.
        continue
    symbol = atom_data[0]
    symbols.append(symbol)
    x, y, z = atom_data[1], atom_data[2], atom_data[3]
    coordinates.append([float(x), float(y), float(z)])

# Print the straight-line distance for every ordered pair of atoms. This
# includes each atom paired with itself (0.000) and both orderings of a pair.
for num1 in range(0, num_atoms):
    for num2 in range(0, num_atoms):
        atom1 = coordinates[num1]
        atom2 = coordinates[num2]
        x_distance = atom1[0] - atom2[0]
        y_distance = atom1[1] - atom2[1]
        z_distance = atom1[2] - atom2[2]
        bond_length_12 = numpy.sqrt(x_distance**2 + y_distance**2 + z_distance**2)
        print(F'{symbols[num1]} to {symbols[num2]} : {bond_length_12:.3f}')

O to O : 0.000
O to H1 : 0.969
O to H2 : 0.969
H1 to O : 0.969
H1 to H1 : 0.000
H1 to H2 : 1.527
H2 to O : 0.969
H2 to H1 : 1.527
H2 to H2 : 0.000


In [22]:
#Now, if we want to keep a cutoff for actual bond lengths...

import numpy
import os

# Read the XYZ file. The first line is parsed as the atom count, the second
# line is skipped, and every remaining line is "<symbol> <x> <y> <z>".
file_location = os.path.join('data', 'water.xyz')
# The with-block closes the file as soon as the lines are read, even if
# reading raises, so no open handle is left behind in the kernel.
with open(file_location, 'r') as xyzfile:
    data = xyzfile.readlines()
num_atoms = int(data[0])
data = data[2:]

symbols = []
coordinates = []

for atom in data:
    atom_data = atom.split()
    if not atom_data:
        # Blank line (for example a trailing newline at the end of the file):
        # there is nothing to parse, and indexing it would raise IndexError.
        continue
    symbol = atom_data[0]
    symbols.append(symbol)
    x, y, z = atom_data[1], atom_data[2], atom_data[3]
    coordinates.append([float(x), float(y), float(z)])

# Compare every ordered pair of atoms and print only the pairs that count as
# bonded: the distance must be greater than 0 (which drops an atom paired
# with itself) and no more than the 1.5 cutoff.
for num1 in range(0, num_atoms):
    for num2 in range(0, num_atoms):
        atom1 = coordinates[num1]
        atom2 = coordinates[num2]
        x_distance = atom1[0] - atom2[0]
        y_distance = atom1[1] - atom2[1]
        z_distance = atom1[2] - atom2[2]
        bond_length_12 = numpy.sqrt(x_distance**2 + y_distance**2 + z_distance**2)
        if 0 < bond_length_12 <= 1.5:
            print(F'{symbols[num1]} to {symbols[num2]} : {bond_length_12:.3f}')

O to H1 : 0.969
O to H2 : 0.969
H1 to O : 0.969
H2 to O : 0.969


In [24]:
# Same bond search as before, but each pair of atoms is reported only once

import numpy
import os 

file_location = os.path.join('data', 'water.xyz')
# skip_header=2 drops the first two lines of the file; every remaining field
# is read as text so that labels and numbers fit in one array.
xyz_file = numpy.genfromtxt(file_location, skip_header=2, dtype='unicode')

# Column 0 holds the atom labels; the remaining columns are the coordinates,
# converted from text to floats.
symbols = xyz_file[:, 0]
coordinates = xyz_file[:, 1:].astype(float)

num_atoms = len(symbols)

# Starting the inner index just past the outer one visits only pairs with
# num1 < num2, so neither the reversed pair nor an atom paired with itself
# is ever considered.
for num1 in range(num_atoms):
    for num2 in range(num1 + 1, num_atoms):
        x_distance = coordinates[num1, 0] - coordinates[num2, 0]
        y_distance = coordinates[num1, 1] - coordinates[num2, 1]
        z_distance = coordinates[num1, 2] - coordinates[num2, 2]
        bond_length_12 = numpy.sqrt(x_distance**2 + y_distance**2 + z_distance**2)
        if 0 < bond_length_12 <= 1.5:
            print(F'{symbols[num1]} to {symbols[num2]} : {bond_length_12:.3f}')

O to H1 : 0.969
O to H2 : 0.969


In [25]:
import numpy
import os

# Find the bonded atom pairs in the XYZ file and save them to
# bond_lengths.txt, one pair per line.
file_location = os.path.join('data', 'water.xyz')
# skip_header=2 drops the first two lines of the file; every remaining field
# is read as text so that labels and numbers fit in one array.
xyz_file = numpy.genfromtxt(fname = file_location, skip_header = 2, dtype = 'unicode')
symbols = xyz_file[:, 0]

coordinates = (xyz_file[:, 1:])
coordinates = coordinates.astype(float)

num_atoms = len(symbols)

# The with-block closes (and flushes) the output file even if the loop
# raises. Plain 'w' is enough because the file is only written, never read.
with open('bond_lengths.txt', 'w') as BLfile:
    for num1 in range(0,num_atoms):
        for num2 in range(0,num_atoms):
            if num1<num2: # Only consider each unordered pair once
                x_distance = coordinates[num1,0] - coordinates[num2,0]
                y_distance = coordinates[num1,1] - coordinates[num2,1]
                z_distance = coordinates[num1,2] - coordinates[num2,2]
                bond_length_12 = numpy.sqrt(x_distance**2+y_distance**2+z_distance**2)
                if bond_length_12 > 0 and bond_length_12 <= 1.5:
                    # write() does not append a line break the way print()
                    # does; without the explicit newline all records would
                    # run together on a single line.
                    BLfile.write(F'{symbols[num1]} to {symbols[num2]} : {bond_length_12:.3f}\n')