# 数据分析中常用的Python技巧

## 1. 条件表达式

In [1]:
import math

def get_log(x):
    """Return the natural logarithm of ``x``, or NaN when ``x`` is not positive.

    Deliberately written with plain statements (no conditional expression)
    so it can be contrasted with the one-line conditional-expression form.
    """
    # Guard clause: math.log is only reached when x > 0.
    if not x > 0:
        return float('nan')
    return math.log(x)

In [3]:
x = 9

# Statement-based helper versus the equivalent one-line conditional expression.
log_val1 = get_log(x)
log_val2 = math.log(x) if x > 0 else float('nan')

# Both values are printed on their own line, helper result first.
print(log_val1, log_val2, sep='\n')

2.1972245773362196
2.1972245773362196


In [7]:
y = 10
z = 2

# Conditional expression with a numeric fallback: take the log only when the
# input exceeds the threshold 5, otherwise fall back to 0.0.
log_val3_positive = math.log(y) if y > 5 else float(0)
# Use the variable under test (z), not a hard-coded literal, so the expression
# stays correct if z is changed to a value above the threshold.
log_val3_negative = math.log(z) if z > 5 else float(0)

print(log_val3_positive)
print(log_val3_negative)

2.302585092994046
0.0


## 2. 列表推导式

In [8]:
# Collect even numbers with an explicit for loop.
# NOTE: the printed label mentions 1000, but the loop only covers range(10).
print('找出1000内的偶数(for循环)：')
l1 = []
for i in range(10):
    # Skip odd values; everything that reaches the append is even.
    if i % 2:
        continue
    l1.append(i)
print(l1)

找出1000内的偶数(for循环)：
[0, 2, 4, 6, 8]


In [9]:
# Same filter as the for-loop version, expressed as a list comprehension.
# NOTE: the printed label mentions 1000, but only range(10) is scanned.
print('找出1000内的偶数(列表推导式)：')
l2 = [i for i in range(10) if not i % 2]
print(l2)

找出1000内的偶数(列表推导式)：
[0, 2, 4, 6, 8]


## 3. Python常用容器类型

* list

In [39]:
# A list may hold values of different types.
l = [1, 'a', 2, 'b']
print(type(l))
print('修改前：', l)
# Slicing returns a new list with the first two items.
print('Slicing list : ', l[:2])

# Lists are mutable: replace an item in place by index.
l[0] = 3
print('修改后：', l)

# Append a single element to the end of the list.
l.append(4)
print('添加后：', l)

# Iterate over the items directly.
print('遍历list(for循环)：')
for item in l:
    print(item)

# Iterate by index with a while loop; i runs from 0 up to len(l) - 1.
print('遍历list(while循环)：')
i = 0
while i != len(l):
    print(l[i])
    i += 1

# Concatenation with + builds a new list.
print('列表合并(+)：', [1, 2] + [3, 4])

# Repetition with * builds a new list containing the items repeated 5 times.
print('列表重复(*)：', [1, 2] * 5)

# Membership test with the `in` operator.
print('判断元素存在(in)：', 1 in [1, 2])

<class 'list'>
修改前： [1, 'a', 2, 'b']
Slicing list :  [1, 'a']
修改后： [3, 'a', 2, 'b']
添加后： [3, 'a', 2, 'b', 4]
遍历list(for循环)：
3
a
2
b
4
遍历list(while循环)：
3
a
2
b
4
列表合并(+)： [1, 2, 3, 4]
列表重复(*)： [1, 2, 1, 2, 1, 2, 1, 2, 1, 2]
判断元素存在(in)： True


* tuple

In [35]:
t = (1, 'a', 2, 'b')
print(type(t))

# Tuples are immutable: item assignment raises an error.
# t[0] = 3 

# Iterate over the tuple directly.
# NOTE(review): the label below says "list" although a tuple is iterated here;
# looks like a copy-paste slip from the list cell - confirm before changing the printed text.
print('遍历list(for循环)：')
for item in t:
    print(item)

# Iterate over the tuple by index with a while loop.
print('遍历tuple(while循环)：')
i = 0
while i != len(t):
    print(t[i])
    i += 1

# Multiple assignment: bind x and y from a two-element tuple in one statement.
(x, y) = (10, 20)
print('x is : ', x)
print('y is : ', y)

# Unpacking: one target per element; `_` receives the value that is not needed.
a, b, c, _ = t
print('unpack: ', c)

# The number of targets must equal the tuple length, otherwise unpacking fails.
# This mistake typically shows up when assigning a function's return value.
# a, b, c = t

<class 'tuple'>
遍历list(for循环)：
1
a
2
b
遍历tuple(while循环)：
1
a
2
b
x is :  10
y is :  20
unpack:  2


10
20


* dictionary

In [38]:
# Dictionary mapping a site name (key) to its URL (value).
d = {'小象学院': 'http://www.chinahadoop.cn/',
    '百度': 'https://www.baidu.com/',
    '阿里巴巴': 'https://www.alibaba.com/',
    '腾讯': 'https://www.tencent.com/'}

# Look up a value by its key.
print('通过key获取value: ', d['小象学院'])
print('通过key获取value: ', d['百度'])

# Iterate over the keys.
print('遍历key: ')
for key in d.keys():
    print(key)

# Iterate over the values.
print('遍历value: ')
for value in d.values():
    print(value)

# Iterate over (key, value) pairs, unpacking each pair via multiple assignment.
print('遍历item: ')
for key, value in d.items():
    print(key + ': ' + value)

# Same iteration, building the output line with str.format.
print('format输出格式：')
for key, value in d.items():
    print('{}的网址是{}'.format(key, value))


通过key获取value:  http://www.chinahadoop.cn/
通过key获取value:  https://www.baidu.com/
遍历key: 
小象学院
百度
阿里巴巴
腾讯
遍历value: 
http://www.chinahadoop.cn/
https://www.baidu.com/
https://www.alibaba.com/
https://www.tencent.com/
遍历item: 
小象学院: http://www.chinahadoop.cn/
百度: https://www.baidu.com/
阿里巴巴: https://www.alibaba.com/
腾讯: https://www.tencent.com/
format输出格式：
小象学院的网址是http://www.chinahadoop.cn/
百度的网址是https://www.baidu.com/
阿里巴巴的网址是https://www.alibaba.com/
腾讯的网址是https://www.tencent.com/


* set

In [40]:
# Two ways to create a set: a literal, or set() over an iterable.
print('创建set:')
my_set = {1, 2, 3}
print(my_set)
# The duplicate 2 in the input list appears only once in the resulting set.
my_set = set([1, 2, 3, 2])
print(my_set)

# add() inserts one element; adding a value that is already present changes nothing.
print('添加单个元素:')
my_set.add(3)
print('添加3', my_set)

my_set.add(4)
print('添加4', my_set)

# update() inserts every element of an iterable; 4 is already present, so only 5 and 6 are new.
print('添加多个元素：')
my_set.update([4, 5, 6])
print(my_set)

创建set:
{1, 2, 3}
{1, 2, 3}
添加单个元素:
添加3 {1, 2, 3}
添加4 {1, 2, 3, 4}
添加多个元素：
{1, 2, 3, 4, 5, 6}


In [43]:
# Dict comprehension: upper-case letter -> the lower-case letter repeated three times.
my_dic = {letter.upper(): letter * 3 for letter in 'abcd'}
print(my_dic)

{'A': 'aaa', 'B': 'bbb', 'C': 'ccc', 'D': 'ddd'}


## 4. Counter

* 初始化

In [61]:
import collections

# Three equivalent ways to build the same Counter:
# from an iterable of items, from a mapping of counts, and from keyword arguments.
c1 = collections.Counter(['a', 'b', 'c', 'a', 'b', 'b'])
c2 = collections.Counter({'a':2, 'b':3, 'c':1})
c3 = collections.Counter(a=2, b=3, c=1)

print(type(c1))
print(c1)
print(c2)
print(c3)

<class 'collections.Counter'>
Counter({'b': 3, 'a': 2, 'c': 1})
Counter({'b': 3, 'a': 2, 'c': 1})
Counter({'b': 3, 'a': 2, 'c': 1})


* 更新内容

In [45]:
# Counter.update ADDS the given counts to the existing ones; it does not replace them
# (a negative count such as 'c': -2 is added as well).
c1.update({'a': 4, 'c': -2, 'd': 4})
print(c1)

Counter({'a': 6, 'd': 4, 'b': 3, 'c': -1})


* 访问内容

In [46]:
# Counts are read with dict-style indexing.
print('a=', c1['a'])
print('b=', c1['b'])
# Unlike a plain dict, a missing key yields 0 instead of raising KeyError.
print('e=', c1['e'])

a= 6
b= 3
e= 0


* elements()方法

In [47]:
# elements() yields each key repeated as many times as its count;
# keys whose count is less than one (here 'c') are skipped.
for element in c1.elements():
    print(element)

a
a
a
a
a
a
b
b
b
d
d
d
d


* most_common()方法

In [49]:
# The two (key, count) pairs with the highest counts, highest first.
c1.most_common(2)

[('a', 6), ('d', 4)]

## 5. defaultdict

In [52]:
# Count how many times each letter occurs in the string.
s = 'chinadoop'

# Approach 1: Counter consumes the string character by character.
print(collections.Counter(s))

Counter({'o': 2, 'c': 1, 'h': 1, 'i': 1, 'n': 1, 'a': 1, 'd': 1, 'p': 1})


In [57]:
# Approach 2: count the letters of `s` with a plain dict.
counter = {}
for c in s:
    # dict.get supplies 0 for a letter that has not been seen yet.
    counter[c] = counter.get(c, 0) + 1

print(counter.items())
print(counter['d'])
# print(counter['z']) would raise KeyError because the key 'z' does not exist

dict_items([('c', 1), ('h', 1), ('i', 1), ('n', 1), ('a', 1), ('d', 1), ('o', 2), ('p', 1)])
1


In [62]:
# Approach 3: defaultdict(int) creates a missing key with int() == 0 on first access,
# so the increment needs no membership check.
counter2 = collections.defaultdict(int)
print(type(counter2))
for c in s:
    counter2[c] += 1
print(counter2.items())
# No KeyError here: a key that does not exist gets the default value 0.
print(counter2['z'])

<class 'collections.defaultdict'>
dict_items([('c', 1), ('h', 1), ('i', 1), ('n', 1), ('a', 1), ('d', 1), ('o', 2), ('p', 1)])
0


In [60]:
# Group values by key: every color maps to the list of numbers paired with it.
colors = [('yellow', 1), ('blue', 2), ('yellow', 3), ('blue', 4), ('red', 1)]
# defaultdict(list) starts each new key with an empty list, so append works right away.
d = collections.defaultdict(list)
for k, v in colors:
    d[k].append(v)

print(d.items())

dict_items([('blue', [2, 4]), ('yellow', [1, 3]), ('red', [1])])


## 6. map()函数

In [63]:
import math

# Example 1: element-wise minimum of two lists; map passes one item from each list to min.
print('示例1，获取两个列表对应位置上的最小值：')
l1 = [1, 3, 5, 8, 9]
l2 = [2, 4, 6, 7, 10]
mins = map(min, l1, l2)
# Printing the map object shows only its repr, not the computed values.
print(mins)
print(type(mins))

# map() is lazy: the function is applied only when the items are actually consumed.
for item in mins:
    print(item)

# Example 2: apply math.sqrt to every element.
# NOTE: despite its name, `squared` holds square roots, not squares.
print('示例2，对列表中的元素进行平方根操作：')
squared = map(math.sqrt, l2)
print(squared)
# list() consumes the map object and materializes the results.
print(list(squared))

示例1，获取两个列表对应位置上的最小值：
<map object at 0x000001D632368EF0>
<class 'map'>
1
3
5
7
9
示例2，对列表中的元素进行平方根操作：
<map object at 0x000001D632368BA8>
[1.4142135623730951, 2.0, 2.449489742783178, 2.6457513110645907, 3.1622776601683795]


## 7. 匿名函数 lambda

In [62]:
# Disabled standalone example: a lambda taking three arguments that uses only a and b.
# my_func = lambda a, b, c: a * b
# print(my_func)
# print(my_func(1, 2, 3))

# Combine lambda with map: compute x * 2 + y for each pair of items from l1 and l2.
print('lambda结合map')
l1 = [1, 3, 5, 7, 9]
l2 = [2, 4, 6, 8, 10]
result = map(lambda x, y: x * 2 + y, l1, l2)
print(list(result))

lambda结合map
[4, 10, 16, 22, 28]


## 8. Python操作CSV数据文件

In [80]:
import csv

# Load every row of grades.csv as a dict keyed by the header names.
# newline='' is what the csv module documentation recommends when opening
# a file for csv.reader / csv.DictReader.
with open('grades.csv', newline='') as csvfile:
    grades_data = list(csv.DictReader(csvfile))

# Build the set from the rows already in memory. Creating a second DictReader
# on the same file handle yields nothing, because the first reader has already
# consumed the file, and dict rows are unhashable so they could not be put in
# a set directly anyway. Each row is therefore converted to a hashable tuple
# of (column, value) pairs; the set then holds the distinct rows.
grades_data_set = set(tuple(row.items()) for row in grades_data)

print(type(grades_data))
print('记录个数：', len(grades_data))
print(type(grades_data[0]))
first_row = grades_data[0]
print('第1条记录：', first_row)
print('前2条记录：', grades_data[:2])
print('列名：', list(first_row.keys()))

# Values are read by column name; every value is a string at this point.
print(first_row['student_id'])

print(type(grades_data_set))
print('记录个数：', len(grades_data_set))

<class 'list'>
记录个数： 2315
<class 'collections.OrderedDict'>
第1条记录： OrderedDict([('student_id', 'B73F2C11-70F0-E37D-8B10-1D20AFED50B1'), ('assignment1_grade', '92.73394640624123'), ('assignment1_submission', '2015-11-02 06:55:34.282000000'), ('assignment2_grade', '83.03055176561709'), ('assignment2_submission', '2015-11-09 02:22:58.938000000'), ('assignment3_grade', '67.16444141249367'), ('assignment3_submission', '2015-11-12 08:58:33.998000000'), ('assignment4_grade', '53.01155312999494'), ('assignment4_submission', '2015-11-16 01:21:24.663000000'), ('assignment5_grade', '47.710397816995446'), ('assignment5_submission', '2015-11-20 13:24:59.692000000'), ('assignment6_grade', '38.16831825359636'), ('assignment6_submission', '2015-11-22 18:31:15.934000000')])
前2条记录： [OrderedDict([('student_id', 'B73F2C11-70F0-E37D-8B10-1D20AFED50B1'), ('assignment1_grade', '92.73394640624123'), ('assignment1_submission', '2015-11-02 06:55:34.282000000'), ('assignment2_grade', '83.03055176561709'), ('assi

In [79]:
# Mean assignment-1 grade: CSV values are strings, so convert each to float before summing.
avg_assign1 = sum(float(row['assignment1_grade']) for row in grades_data) / len(grades_data)
print('assignment1平均分数：', avg_assign1)

assignment1平均分数： 74.5357320747794


In [81]:
# Distinct submission months: the first 7 characters of the timestamp are 'YYYY-MM'.
assign1_sub_month = {row['assignment1_submission'][:7] for row in grades_data}
print(assign1_sub_month)

{'2016-01', '2015-11', '2015-12', '2016-07', '2016-08', '2016-05', '2015-10', '2015-09', '2016-02', '2016-06', '2016-04', '2016-03'}


In [86]:
# Highest assignment-2 grade, comparing the values as floats rather than as strings.
max_assign2 = max(float(row['assignment2_grade']) for row in grades_data)
print(max_assign2)

99.93620568918459
