## 准备数据

In [6]:
import numpy as np
import pandas as pd

data = [['000005', 2017, 402999, 44444],
        ['000005', 2017, 92440, 9967],
        ['000005', 2017, 174959, 137458],
        ['000005', 2018, 174959, 137458],
        ['000006', 2017, 174959, 137458]]


df = pd.DataFrame(data, columns=['code', 'year', 'var1', 'var2'])
df

Unnamed: 0,code,year,var1,var2
0,5,2017,402999,44444
1,5,2017,92440,9967
2,5,2017,174959,137458
3,5,2018,174959,137458
4,6,2017,174959,137458


## 问题
如何将数据中，同一股票代码同一年的某个字段加总成一条？

我想把某公司同一年的数据var加总到一起

## 思路
可以通过pandas库实现这个需求

1. 获取公司股票代码列表
2. 获取某公司年份列表
3. 对某个公司同年的var进行加总
4. for循环对所有的公司重复2-3操作


## 注意
1. 一定要清楚每个变量的数据类型
2. 股票代码很容易被当成数字。例如000005，可能读取的时候显示5。原因是excel存储的时把股票代码当成了数字类型。


## 代码
### 1. 获取公司股票代码列表

In [25]:
codes = df.code.unique()
codes

array(['000005', '000006'], dtype=object)

### 2. 获取某公司年份列表


以000005为例

In [15]:
years = set(df[df['code']=='000005']['year'].values)
years

{2017, 2018}

### 3. 对某个公司同年的var进行加总

以000005公司2017年为例

In [21]:
ndf = df[df['code']=='000005']
ndf

Unnamed: 0,code,year,var1,var2
0,5,2017,402999,44444
1,5,2017,92440,9967
2,5,2017,174959,137458
3,5,2018,174959,137458


In [22]:
ndf[ndf['year']==2017]

Unnamed: 0,code,year,var1,var2
0,5,2017,402999,44444
1,5,2017,92440,9967
2,5,2017,174959,137458


In [23]:
ndf[ndf['year']==2017]['var1']

0    402999
1     92440
2    174959
Name: var1, dtype: int64

In [24]:
ndf[ndf['year']==2017]['var1'].sum()

670398

### for循环对所有的公司重复2-3操作
汇总代码

In [27]:
results = []
codes = df.code.unique()
for code in codes:
    years = set(df[df['code']==code]['year'].values)
    for year in years:
        var1_sum = ndf[ndf['year']==year]['var1'].sum()
        data = (code, year, var1_sum)
        results.append(data)
        
result_df = pd.DataFrame(results, columns=['code', 'year', 'var1_sum'])
result_df

Unnamed: 0,code,year,var1_sum
0,5,2017,670398
1,5,2018,174959
2,6,2017,670398


In [None]:
#保存结果
result_df.to_csv('result.csv', index=False)