# Here are a few examples of iteration in pandas:

# To iterate over a single column in a DataFrame, you can use the iteritems() method on the column, like this:

In [1]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

for value in df['A'].iteritems():
    print(value)
    
    #This will print out the index and value for each item in column 'A' of the DataFrame.

(0, 1)
(1, 2)
(2, 3)


In [2]:
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


# To iterate over all the columns of a DataFrame, you can use the iteritems() method on the DataFrame, like this:

In [3]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [4]:
for column, series in df.iteritems():
    print(column, series.tolist())
    
    
#This will print out the name of the column and the values of the series for each column in the DataFrame.

A [1, 2, 3]
B [4, 5, 6]


In [5]:
for column, series in df.iteritems():
    print(column, series)

A 0    1
1    2
2    3
Name: A, dtype: int64
B 0    4
1    5
2    6
Name: B, dtype: int64


# To iterate over all the rows of a DataFrame, you can use the iterrows() method, like this:

In [6]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [7]:
for index, row in df.iterrows():
    print(row['A'], row['B'])
    
    
#This will print out the values of columns 'A' and 'B' for each row in the DataFrame.

1 4
2 5
3 6


Note that iterrows() and iteritems() are quite slow as they have to create new python objects for each row or item, itertuples() is faster as it returns the data in the form of namedtuples which are faster than python objects.

In [8]:
import pandas as pd
import time

# create a large DataFrame
df = pd.DataFrame({'A': range(1000000), 'B': range(1000000)})

# time iterrows()
start = time.time()
for index, row in df.iterrows():
    pass
end = time.time()
print(f"iterrows(): {end - start} seconds")

# time iteritems()
start = time.time()
for column, series in df.iteritems():
    pass
end = time.time()
print(f"iteritems(): {end - start} seconds")

# time itertuples()
start = time.time()
for row in df.itertuples():
    pass
end = time.time()
print(f"itertuples(): {end - start} seconds")


#It is highly recommended to use itertuples() if performance is a concern.

iterrows(): 14.617839813232422 seconds
iteritems(): 0.0 seconds
itertuples(): 0.42028093338012695 seconds


In this example, we create a large DataFrame with 1,000,000 rows and 2 columns, and then time how long it takes to iterate over each row using the iterrows() method, each column using the iteritems() method, and each row using the itertuples() method. 

# few other ways to iterate over the rows or columns of a DataFrame in pandas:

1) df.apply(): This method applies a function to each row or column of a DataFrame. The function is passed a Series or DataFrame corresponding to the row or column, and the result of the function is combined into a new DataFrame or Series. For example, to iterate over the rows of a DataFrame and calculate the sum of each row:

In [9]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [10]:
row_sums = df.apply(lambda row: row.sum(), axis=1)
print(row_sums)

0    5
1    7
2    9
dtype: int64


2) df.agg(): This method applies multiple aggregation functions to the DataFrame, returning the result as a DataFrame. For example, to iterate over the columns of a DataFrame and calculate the mean and standard deviation of each column:

In [11]:
column_agg = df.agg(['mean', 'std'])
print(column_agg)

        A    B
mean  2.0  5.0
std   1.0  1.0


3) df.groupby(): This method groups the rows of a DataFrame by one or more columns, and allows you to perform aggregate calculations on the groups. For example, to iterate over the rows of a DataFrame grouped by the values of column 'A' and calculate the mean of column 'B' for each group:

In [12]:
grouped = df.groupby('A').mean()
print(grouped)

     B
A     
1  4.0
2  5.0
3  6.0


All these method are slower than itertuples() but they have the advantage of being more readable and easy to use as they don't need to iterate over the rows and columns, they perform a specific operation, they are more high level operations.

In [25]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

df['sum'] = df.apply(lambda x: x['A'] + x['B'], axis=1)
df

Unnamed: 0,A,B,sum
0,1,4,5
1,2,5,7
2,3,6,9


# Here is an example of using the df.apply() method with the axis parameter set to 0 (which is the default value) to apply a function to each column of a DataFrame:

In [27]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

df


Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [28]:
result = df.apply(lambda x: x.max() - x.min())
print(result)


#3-1 in column A.
#6-4 in column B.

A    2
B    2
dtype: int64


# Here is an example of using the df.apply() method with the axis parameter set to 1 to apply a function to each row of a DataFrame:

In [29]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [30]:
result = df.apply(lambda x: x.max() - x.min(), axis=1)
print(result)


# 4-1
#5-2
#6-3

0    3
1    3
2    3
dtype: int64


# ------------------------------------------------------------------------------------------------

# you can use the map() and filter() functions to iterate over the rows or columns of a DataFrame in pandas.

1) df.map(): This method applies a function to each element of a DataFrame or Series. It returns a new DataFrame or Series with the same shape as the original, but with the values replaced by the results of the function. For example, to iterate over the rows of a DataFrame and change the values of column 'A' based on a certain condition:

In [13]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [14]:
df['A'] = df['A'].map(lambda x: x*2 if x>1 else x)
print(df)

   A  B
0  1  4
1  4  5
2  6  6


In [22]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

df['A'] = df['A'].map(lambda x: x*2)
print(df)


   A  B
0  2  4
1  4  5
2  6  6


# ------------------------------------------------------------------------------------------------------------------------------------

# df filter only columns with a certain condition

In this example, the filter method is applied to the columns of the DataFrame, and the lambda function is applied to each column, returning only columns where the condition is met, in this case where column 'A' has values greater than 2.

Please note that this method will filter the columns not the rows

In [36]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]})

columns_to_keep = ['A','C']
filtered_df = df[columns_to_keep]
print(filtered_df)


   A  C
0  1  7
1  2  8
2  3  9


In [37]:
filtered_df = df.loc[:,columns_to_keep]
print(filtered_df)


   A  C
0  1  7
1  2  8
2  3  9


# df filter only rows

In [32]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

filtered_df = df[df['A'] > 2]
print(filtered_df)


   A  B
2  3  6


# How do we specify if we iterate over a column or rows?

When you iterate over the rows or columns of a DataFrame in pandas, you can specify whether you want to iterate over the rows or columns by setting the axis parameter of the method.

In [18]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

for index, row in df.iterrows():
    print(index, row.to_list())

0 [1, 4]
1 [2, 5]
2 [3, 6]


In [20]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

for column, series in df.iteritems():
    print(column, series.to_list())

A [1, 2, 3]
B [4, 5, 6]


To iterate over the rows of a DataFrame, you can use the itertuples() method, which returns a named tuple containing the values of each column for each row. The fastest way.

In [21]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

for row in df.itertuples():
    print(row)


Pandas(Index=0, A=1, B=4)
Pandas(Index=1, A=2, B=5)
Pandas(Index=2, A=3, B=6)


In general, when iterating over the DataFrame, it is more efficient to iterate over the rows instead of the columns because it allows you to access the values of multiple columns for each row, rather than just the values of one column for each row.

# ------------------------------------------------------------------------------------------------

# loc and iloc

In [43]:
import pandas as pd

# Create a sample DataFrame
data = {'name': ['Alice', 'Bob', 'Charlie', 'David'],
        'age': [25, 32, 45, 38],
        'city': ['New York', 'Los Angeles', 'Chicago', 'Houston']}
df = pd.DataFrame(data)
df

Unnamed: 0,name,age,city
0,Alice,25,New York
1,Bob,32,Los Angeles
2,Charlie,45,Chicago
3,David,38,Houston


In [39]:
# Using the 'loc' indexer to select a specific row by label
print(df.loc[1])

name            Bob
age              32
city    Los Angeles
Name: 1, dtype: object


In [40]:
# Using the 'iloc' indexer to select a specific row by index
print(df.iloc[2])

name    Charlie
age          45
city    Chicago
Name: 2, dtype: object


# df.loc[1, 'name'] selects the 'name' value of the second row by label

In [41]:
# Using the 'loc' indexer to select a specific value by label
print(df.loc[1, 'name'])

Bob


# df.iloc[2, 0] selects the first column value of the third row by index.

In [42]:
# Using the 'iloc' indexer to select a specific value by index
print(df.iloc[2, 0])

Charlie


# ----------------------------------------------------------------

In [3]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [4]:
column_values = [row['A'] for index, row in df.iterrows()]
print(column_values)

[1, 2, 3]


In [6]:
df['A'] = df['A'].apply(lambda x: x**2)
df['A']

0     1
1    16
2    81
Name: A, dtype: int64

In [7]:
df['C'] = [val ** 2 for val in df['A']]
df['C']

0       1
1     256
2    6561
Name: C, dtype: int64