# Chapter 3 Analyzing a Data Column
# 第三章 分析数据列

## 3.2 STORY: DENDRITIC LENGTHS
## 3.2 案例：树突长度

In [1]:
# Common imports
import numpy as np
import os
# Where to save the data
PROJECT_ROOT_DIR = "./data/chap_3/"

In [16]:
neuron_data = [16.38, 139.90, 441.46, 29.03, 40.93, 202.07, 142.30, \
346.00, 300.00]
open(os.path.join(PROJECT_ROOT_DIR,"neuron_data.txt"),"w").writelines([str(i)+'\n' for i in neuron_data])

In [7]:
a = [2,3,4,1]
a.sort()
a

[1, 2, 3, 4]

### 3.2.2 Example Python Session
### 3.3.2 Python会话示例

In [17]:
data = []

for line in open(os.path.join(PROJECT_ROOT_DIR,"neuron_data.txt"),"r"):
    length = float(line.strip())
    data.append(length)

n_items = len(data)
total = sum(data)
shortest = min(data)
longest = max(data)
data.sort()

output = open(os.path.join(PROJECT_ROOT_DIR,"results.txt"),"w")
output.write("number of dendritic lengths : %4i \n"%(n_items))
output.write("total dendritic length : %6.1f \n"%(total))
output.write("shortest dendritic length : %7.2f \n"%(shortest))
output.write("longest dendritic length : %7.2f \n"%(longest))
output.write("%37.2f\n%37.2f"%(data[-2], data[-3]))
output.close()

## Example 3.1 How to Calculate a Mean Value
## 例3.1： 如何计算平均值

$$\mu = \cfrac{1}{N} \sum_{i=1}^N x_i$$

In [21]:
import numpy as np

data = [3.53, 3.47, 3.51, 3.72, 3.43]
average = np.average(data)
print(average)

3.532 3.532


## Example 3.2 How to Calculate a Standard Deviation
## 例3.2： 如何计算标准差

$$\sigma = \sqrt{\cfrac{1}{N} \sum_{i=1}^N (x_i - \mu)^2}$$

In [23]:
import numpy as np

data = [3.53, 3.47, 3.51, 3.72, 3.43]
stddev = np.std(data)
print(stddev)

0.100079968026


## Example 3.3 How to Calculate a Median Value
## 例3.3： 如何计算中位数

In [25]:
import numpy as np

data = [3.53, 3.47, 3.51, 3.72, 3.43]
median = np.median(data)
print(median)

3.51


## 3.5 TESTING YOURSELF
## 3.5 自测题

### Exercise 3.1 Read and Write a File
### 3.1 读取和写入文件

Write a program that reads the file with neuron lengths and saves an identical copy of the file.

In [37]:
with open(os.path.join(PROJECT_ROOT_DIR,"neuron_data.txt"),"r") as f:
    l = f.readlines()
    print(l)
    open(os.path.join(PROJECT_ROOT_DIR,"neuron_data_bak.txt"),"w").write("Length is : %3i \n" % len(l))

['16.38\n', '139.9\n', '441.46\n', '29.03\n', '40.93\n', '202.07\n', '142.3\n', '346.0\n', '300.0\n']


### Exercise 3.2 Calculate Average and Standard Deviation
### 3.2 计算平均值和标准差
Extend the example in Section 3.2.2 so that it calculates the average neuron
length and standard deviation.

In [40]:
import numpy as np

neuron_data = [16.38, 139.90, 441.46, 29.03, 40.93, 202.07, 142.30, \
346.00, 300.00]
average = np.average(data)
stddev = np.std(data)
print("Average neuron length is:",average, "\t","Standard deviation is:",stddev)

Average neuron length is: 3.532 	 Standard deviation is: 0.100079968026


### Exercise 3.3 Frequency of Nucleotides
### 3.3 核苷酸的频率

Write a program that reads a DNA sequence from a plain text file. Count the frequency of each base. The program has to determine how often the most frequent base occurs.

**Hint**: You don’t have to identify which base it is.

In [23]:
seq = ''
d ={}

with open(os.path.join(PROJECT_ROOT_DIR,"dna_example.fasta"),"r") as f:
    f.readline()
    for line in f:
        seq +=  ''.join(line.strip())

for nucleotide in "ATCG":
    number = seq.count(nucleotide)
    d[number] = nucleotide
    print("The number of %s is %d" % (nucleotide,number))

print("The most nucletodie is %s(%d)" %(d[sorted(d.keys())[-1]],sorted(d.keys())[-1]))   

The number of A is 954
The number of T is 1131
The number of C is 1729
The number of G is 1677
The most nucletodie is C(1729)


In [33]:
from collections import Counter

count = Counter(seq)
for k,v in count.items():
    print("The number of %s is %d" % (k,v))

print("The most nucletodie is %s(%d)" %sorted(count.items(), key=lambda d:d[1], reverse=True)[0])

The number of G is 1677
The number of A is 954
The number of T is 1131
The number of C is 1729
The most nucletodie is C(1729)


### Exercise 3.4 GC-Content from a DNA Sequence
### 3.4 DNA序列的GC含量

Write a program that calculates the GC-content of a DNA sequence from a plain text file.

In [4]:
with open(os.path.join(PROJECT_ROOT_DIR,"dna_example.fasta"),"r") as f:
    for line in f:
        if line.startswith(">"):
            pass
        else:
            seq += ''.join(line.strip())

length = len(seq)
G = seq.count("G")
C = seq.count("C")

GC_content = (G+C) / length  
print("The GC-content is %5.3f%%"%(GC_content * 100))

The GC-content is 62.029%


In [15]:
from collections import Counter

counts = Counter(seq)
print("The GC-content is %5.3f%%"%((counts['C']+ counts["G"]) /  sum(counts.values()) * 100))

The GC-content is 62.029%


In [17]:
sorted(counts.values())

[2862, 3393, 5031, 5187]

In [61]:
f = open(os.path.join(PROJECT_ROOT_DIR,"neuron_data.txt"),"r")

while True:
    try:
        print(next(f))
    
    except StopIteration:
        break

16.38

139.9

441.46

29.03

40.93

202.07

142.3

346.0

300.0



In [62]:
f = open(os.path.join(PROJECT_ROOT_DIR,"neuron_data.txt"),"r")
line = True

while line:
    line = f.readline()
    print(line)

16.38

139.9

441.46

29.03

40.93

202.07

142.3

346.0

300.0


