In [2]:
import pandas as pd
import numpy as np

In [3]:
# Build a small frame from three Series whose indexes only partly overlap;
# a label missing from one Series becomes NaN in that column after alignment.
df = pd.DataFrame(
    {
        'one': pd.Series(np.random.randn(3), index=list('abc')),
        'two': pd.Series(np.random.randn(4), index=list('abcd')),
        'three': pd.Series(np.random.randn(3), index=list('bcd')),
    }
)

In [4]:
# Display the frame; empty cells in the output are NaN for labels that a
# source Series did not contain.
df

Unnamed: 0,one,two,three
a,1.620583,-1.08914,
b,-0.170925,0.5069,-0.390176
c,-0.954298,0.999732,-0.903138
d,,-0.645613,-0.284411


In [5]:
# Keep an independent (deep) copy of the frame to compare against.
df2 = df.copy(deep=True)

In [6]:
# Element-wise "greater than": df2 is an exact copy, so no value is larger
# than its counterpart, and comparisons involving NaN are False as well.
df.gt(df2, axis='columns')

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [8]:
# NaN never compares equal to NaN (np.nan == np.nan is False), so the
# element-wise "not equal" is True wherever both frames hold NaN.
df2.ne(df, axis='columns')

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


In [13]:
# Square column 'two' so that none of its values is negative, then check for
# each column whether every entry is positive (NaN > 0 evaluates to False).
# The squares are taken from df2 -- the unmodified copy made in an earlier
# cell -- rather than from df itself, so re-running this cell does not square
# the column a second time.
df['two'] = df2['two'] ** 2
(df > 0).all()

one      False
two       True
three    False
dtype: bool

In [14]:
# Per column: is at least one entry positive?
df.gt(0).any()

one       True
two       True
three    False
dtype: bool

In [15]:
# Reduce twice (columns first, then the resulting Series) to get a single
# answer: is any entry of the whole frame positive?
df.gt(0).any().any()

True

In [17]:
# To evaluate a single-element Series in a boolean context, extract its only
# value with item(). Series.bool() used to serve this purpose but has been
# deprecated since pandas 2.1, and item() is the documented replacement.
pd.Series([True]).item()

True

In [18]:
# A 1x1 frame holding a single boolean.
pd.DataFrame(data=[[True]])

Unnamed: 0,0
0,True


In [20]:
# Comparing a pandas object with a scalar is applied element by element and
# yields a boolean object of the same length.
pd.Series(['foo', 'bar', 'baz']).eq('foo')

0     True
1    False
2    False
dtype: bool

In [21]:
# Element-wise comparison also works between different array-like objects,
# provided they have the same length (here a Series against an Index).
pd.Series(['foo', 'bar', 'baz']).eq(pd.Index(['foo', 'bar', 'qux']))

0     True
1     True
2    False
dtype: bool

In [22]:
# Comparing Series objects of different lengths raises a ValueError. The
# exception is the point of this cell, so it is caught and printed instead of
# being left to abort a "Restart & Run All" of the notebook.
try:
    pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo', 'bar'])
except ValueError as exc:
    print(f'{type(exc).__name__}: {exc}')

ValueError: Can only compare identically-labeled Series objects

In [24]:
# The same result can often be computed in more than one way, e.g. df + df
# and df * 2. equals() tests whether two objects hold the same data; unlike
# ==, it treats NaNs in the same location as equal.
# Note: the Series or DataFrame index needs to be in the same order for the
# equality to be True.
df.add(df).equals(df.mul(2))

True

In [28]:
# Aggregate down the rows: one mean per column (axis=0 is the default).
df.mean(axis=0)

one      0.165120
two      0.714864
three   -0.525908
dtype: float64

In [29]:
# Aggregate across the columns: one mean per row (index label).
df.mean(axis=1)

a    1.403405
b   -0.101384
c   -0.285991
d    0.066203
dtype: float64

In [32]:
# Vectorized operations express statistical procedures very concisely.
# Standardization: subtract each column's mean and divide by its standard
# deviation, giving columns with zero mean and standard deviation 1.
ts_stand = df.sub(df.mean()).div(df.std())
ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

In [34]:
# 1000 random values with every other one (positions 0, 2, 4, ...) blanked
# out; describe() summarizes only the 500 non-missing values.
series = pd.Series(np.random.randn(1000))
series.iloc[::2] = np.nan
series.describe()

count    500.000000
mean      -0.114068
std        1.029081
min       -2.846442
25%       -0.788759
50%       -0.079258
75%        0.627547
max        3.274720
dtype: float64

In [35]:
# Same idea for a frame: blank out every other row, then summarize each
# column over its remaining 500 values.
frame = pd.DataFrame(np.random.randn(1000, 5), columns=list('abcde'))
frame.iloc[::2] = np.nan
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.05852,-0.00477,0.059647,-0.037445,0.057345
std,0.977227,1.056146,0.957373,1.020441,0.975875
min,-3.032999,-3.300841,-2.932643,-3.662454,-2.428544
25%,-0.596088,-0.68715,-0.555052,-0.718186,-0.634367
50%,0.071088,0.025088,0.014741,-0.055999,0.041154
75%,0.775894,0.726382,0.703317,0.603353,0.710834
max,3.281307,2.408974,3.074434,2.937563,2.992962


In [36]:
# For a non-numerical Series, describe() gives a short summary instead: the
# number of non-missing entries, the number of unique values, and the most
# frequent value together with its frequency.
s = pd.Series(list('aabbaa') + [np.nan] + list('cda'))
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

In [38]:
# idxmin() and idxmax() on a Series or DataFrame return the index labels at
# which the minimum and maximum values occur. (A bare `s1` expression in the
# middle of the cell was dropped: only a cell's last expression is displayed,
# so it had no effect.)
s1 = pd.Series(np.random.randn(5))
s1.idxmin(), s1.idxmax()

(2, 4)

In [42]:
# A 5x3 frame of random values to demonstrate idxmin()/idxmax() on frames.
df1 = pd.DataFrame(np.random.randn(5, 3), columns=list('ABC'))
df1

Unnamed: 0,A,B,C
0,-1.295114,-0.635909,-1.178036
1,-0.981134,-0.70567,-1.24597
2,-0.776477,1.011725,0.335245
3,0.984234,0.393125,-1.064797
4,-1.788268,-0.617619,1.861266


In [43]:
# For each column, the row label holding that column's minimum.
df1.idxmin(axis='index')


A    4
B    1
C    1
dtype: int64

In [44]:
# For each row, the column label holding that row's maximum.
df1.idxmax(axis='columns')

0    B
1    B
2    B
3    A
4    C
dtype: object

In [45]:
# What basic iteration yields depends on the type of the pandas object.
# A Series is treated as array-like, so iterating over it produces its
# values. A DataFrame follows the dict-like convention: iterating over it
# produces its keys, i.e. the column labels.
df = pd.DataFrame(
    {'col1': np.random.randn(3), 'col2': np.random.randn(3)},
    index=list('abc'),
)
for column_label in df:
    print(column_label)

col1
col2


To iterate over a DataFrame, you can use the following methods:

    items(): Iterate over the columns of a DataFrame as (column label, Series) pairs, following the dict-like (key, value) convention.
    iterrows(): Iterate over the rows of a DataFrame as (index, Series) pairs. This converts the rows to Series objects, which can change the dtypes and has some performance implications.
    itertuples(): Iterate over the rows of a DataFrame as namedtuples of the values. This is a lot faster than iterrows() and is in most cases the preferable way to iterate over the values of a DataFrame.

Note: Iterating through pandas objects is generally slow. In many cases, iterating manually over the rows is not needed and can be avoided.

Warning: You should never modify something you are iterating over. This is not guaranteed to work in all cases.




In [46]:
# A small mixed frame (one numeric column, one string column) used by the
# iteration examples that follow.
df = pd.DataFrame(
    {
        'a': [1, 2, 3],
        'b': ['a', 'b', 'c'],
    }
)
df

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


In [47]:
# items() iterates over (key, value) pairs. For a DataFrame the key is the
# column label and the value is that column as a Series.
for column_name, column in df.items():
    print(column_name, column, sep='\n')

a
0    1
1    2
2    3
Name: a, dtype: int64
b
0    a
1    b
2    c
Name: b, dtype: object


In [48]:
# iterrows() returns an iterator that yields, for every row, the index label
# together with a Series holding that row's data. Packing a row into a Series
# can change dtypes (below, both values of a row end up in one object Series)
# and has some performance implications.
for idx, record in df.iterrows():
    print(idx)
    print(record)


0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


In [49]:
# itertuples() returns an iterator yielding one namedtuple per row. The first
# field is the row's index label and the remaining fields are the row values.
# It is a lot faster than iterrows() and in most cases the preferable way to
# iterate over the values of a DataFrame.
for record in df.itertuples(index=True, name='Pandas'):
    print(record)


Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')
