Pandas 是一个开源的 Python 数据分析库。DataFrame 和 Series 是它的两个主要数据结构，被广泛用于数据分析。Series 是带索引的一维数组，而 DataFrame 是同时具有行索引和列索引的二维表格数据结构。Pandas 是预处理数据集的绝佳工具，并提供了高度优化的性能。

In [1]:
import pandas as pd 

In [2]:
# Build a Series from a plain list; no index is given, so pandas assigns one.
series_1 = pd.Series(data=[2, 9, 0, 1])

In [3]:
# .values exposes the underlying data as an array, without the index.
print(series_1.values)

[2 9 0 1]


In [4]:
# Without an explicit index, the Series gets a default integer RangeIndex (0..3 here).
series_1.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Replace the default integer index with string labels, one per element.
series_1.index = ['a','b','c','d']

In [6]:
# Look up an element by its label: 'd' is the last label, so this yields 1.
series_1['d']

1

In [7]:
# Create a DataFrame from a dict: each key is a column, each list holds that
# column's values in row order.
class_data = {
    'Names': ['John', 'Ryan', 'Emily'],
    'Standard': [7, 5, 8],
    'Subject': ['English', 'Mathematics', 'Science'],
}

# `index` supplies the row labels; `columns` fixes the column order explicitly.
class_df = pd.DataFrame(
    data=class_data,
    columns=['Names', 'Standard', 'Subject'],
    index=['Student1', 'Student2', 'Student3'],
)

In [8]:
# Print the dataframe as plain text: row labels on the left, one column per dict key.
print(class_df)

          Names  Standard      Subject
Student1   John         7      English
Student2   Ryan         5  Mathematics
Student3  Emily         8      Science


In [9]:
# Attribute-style column access; same column as class_df['Names'], returned as a Series.
class_df.Names

Student1     John
Student2     Ryan
Student3    Emily
Name: Names, dtype: object

In [10]:
# Add a new row to the dataframe by assigning to a label that does not exist yet.
import numpy as np
# Older pandas tutorials did this with `.ix`, which has since been removed from
# pandas; label-based `.loc` is the replacement.
# np.nan marks Student4's Standard as missing, which is why the Standard column
# is shown as floats (7.0, 5.0, ...) from here on.
class_df.loc['Student4'] = ['Robin',np.nan,'History']

In [11]:
# Display the dataframe: Student4 is now present, with an empty (NaN) Standard.
class_df

Unnamed: 0,Names,Standard,Subject
Student1,John,7.0,English
Student2,Ryan,5.0,Mathematics
Student3,Emily,8.0,Science
Student4,Robin,,History


In [12]:
# Transpose the dataframe: columns become rows and row labels become column headers.
# .T returns a new object; class_df itself is left unchanged.
class_df.T

Unnamed: 0,Student1,Student2,Student3,Student4
Names,John,Ryan,Emily,Robin
Standard,7,5,8,
Subject,English,Mathematics,Science,History


In [13]:
# Sort rows by the 'Standard' column in ascending order; the row with the
# missing value (Student4) is placed last.
# The result is not assigned, so class_df keeps its original row order.
class_df.sort_values(by='Standard')

Unnamed: 0,Names,Standard,Subject
Student2,Ryan,5.0,Mathematics
Student1,John,7.0,English
Student3,Emily,8.0,Science
Student4,Robin,,History


In [14]:
# Add a new column from a Series. The Series carries the same row labels as
# class_df, so each grade is matched to its student by label.
col_entry = pd.Series(['A','B','A+','C'],
                     index=['Student1','Student2','Student3','Student4'])

class_df['Grade'] = col_entry

In [15]:
# Display the dataframe with the new Grade column.
class_df

Unnamed: 0,Names,Standard,Subject,Grade
Student1,John,7.0,English,A
Student2,Ryan,5.0,Mathematics,B
Student3,Emily,8.0,Science,A+
Student4,Robin,,History,C


In [16]:
# Fill missing entries with 10. inplace=True modifies class_df directly instead
# of returning a new dataframe. Note this applies to every column, not just
# Standard; here Student4's Standard is the only missing value.
class_df.fillna(10,inplace=True)

In [17]:
# Display the dataframe: Student4's Standard is now 10.0.
class_df

Unnamed: 0,Names,Standard,Subject,Grade
Student1,John,7.0,English,A
Student2,Ryan,5.0,Mathematics,B
Student3,Emily,8.0,Science,A+
Student4,Robin,10.0,History,C


In [18]:
# Concatenation of 2 dataframes, step 1: build the second frame.
# It uses the same row labels as class_df so the two can be lined up by label.
student_age = pd.DataFrame(
    {'Age': [13, 10, 15, 18]},
    index=['Student1', 'Student2', 'Student3', 'Student4'],
)

In [19]:
# Print the age table: a single Age column indexed by student label.
print(student_age)

          Age
Student1   13
Student2   10
Student3   15
Student4   18


In [20]:
# Join the two frames side by side (axis=1), matching rows by their index labels.
# NOTE: this rebinds class_data, which earlier held the raw dict used to build
# class_df; from here on it is the combined dataframe.
class_data = pd.concat([class_df,student_age],axis=1)

In [21]:
# Display the combined dataframe: the class_df columns followed by Age.
class_data

Unnamed: 0,Names,Standard,Subject,Grade,Age
Student1,John,7.0,English,A,13
Student2,Ryan,5.0,Mathematics,B,10
Student3,Emily,8.0,Science,A+,15
Student4,Robin,10.0,History,C,18


In [22]:
# Series.map applies the given function to each element; here it appends the
# suffix 'Sub' to every subject name and writes the result back to the column.
class_data['Subject'] = class_data['Subject'].map(lambda x: x + 'Sub')
# Display the updated column.
class_data['Subject']

Student1        EnglishSub
Student2    MathematicsSub
Student3        ScienceSub
Student4        HistorySub
Name: Subject, dtype: object

In [23]:
# apply function
def age_add(x):
    """Return ``x`` increased by one."""
    incremented = x + 1
    return incremented

# Show the Age column before and after applying age_add to every element.
print('-----Old values------')
print(class_data['Age'])
print('-----New values-------')
# apply returns a new Series; the result is only printed, so class_data['Age']
# itself is not changed.
print(class_data['Age'].apply(age_add))

-----Old values------
Student1    13
Student2    10
Student3    15
Student4    18
Name: Age, dtype: int64
-----New values-------
Student1    14
Student2    11
Student3    16
Student4    19
Name: Age, dtype: int64


In [24]:
# Change the datatype of the Grade column to pandas' 'category' dtype.
class_data['Grade'] = class_data['Grade'].astype('category')

In [25]:
# Inspect the column's dtype to confirm the conversion; the categories are the
# distinct grades.
class_data.Grade.dtypes

CategoricalDtype(categories=['A', 'A+', 'B', 'C'], ordered=False)

In [26]:
# Save the dataframe to a CSV file (relative path, so it lands in the current
# working directory).
# index=False omits the row labels (Student1..Student4) from the file.
class_data.to_csv('class_dataset.csv',index=False)