# **Generic operations on a DataFrame**

##  Import libraries and prepare DataFrame.

In [1]:
import pandas as pd

# Toy dataset: one list per column, all lists of equal length.
data = dict(
    Name=['Alia', 'Amitabh', 'Shahrukh', 'Deepika'],
    Age=[30, 80, 57, 37],
    Salary=[50000, 65000, 75000, 60000],
)

# The dict keys become the column labels; rows get the default 0..3 index.
df_actor = pd.DataFrame(data)
df_actor

Unnamed: 0,Name,Age,Salary
0,Alia,30,50000
1,Amitabh,80,65000
2,Shahrukh,57,75000
3,Deepika,37,60000


##  Data information

In [2]:
# info() prints the index range, per-column non-null counts and dtypes, and memory usage.
df_actor.info()
# describe() returns summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_actor.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   Salary  4 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 228.0+ bytes


Unnamed: 0,Age,Salary
count,4.0,4.0
mean,51.0,62500.0
std,22.464787,10408.329997
min,30.0,50000.0
25%,35.25,57500.0
50%,47.0,62500.0
75%,62.75,67500.0
max,80.0,75000.0


## Indexers: loc, iloc, at, iat, ix

`loc` gets rows (and/or columns) with particular labels.

`iloc` gets rows (and/or columns) at integer locations.

`at` gets a single scalar value by label. It's a very fast `loc`.

`iat` gets a single scalar value by integer position. It's a very fast `iloc`.

`ix` was a hybrid indexer: primarily label-based, falling back to integer-position lookup when the label was not found. `ix` was deprecated in pandas 0.20.0 and removed in pandas 1.0; use `loc` or `iloc` instead.  

Read further:
https://stackoverflow.com/questions/28757389/pandas-loc-vs-iloc-vs-at-vs-iat

### 0. Filter with 'Age' limit.

In [3]:
# Show the unfiltered frame first for comparison.
display(df_actor)

# A boolean mask passed straight to [] keeps the rows where it is True.
is_over_30 = df_actor['Age'] > 30
df1 = df_actor[is_over_30]
display(df1)

# The same kind of mask works through .loc as well.
is_under_60 = df_actor['Age'] < 60
df2 = df_actor.loc[is_under_60]
df2

Unnamed: 0,Name,Age,Salary
0,Alia,30,50000
1,Amitabh,80,65000
2,Shahrukh,57,75000
3,Deepika,37,60000


Unnamed: 0,Name,Age,Salary
1,Amitabh,80,65000
2,Shahrukh,57,75000
3,Deepika,37,60000


Unnamed: 0,Name,Age,Salary
0,Alia,30,50000
2,Shahrukh,57,75000
3,Deepika,37,60000


### 1. Print value at a selected (row,col) coordinate.

In [4]:
# iat takes (row position, column position): row 3, column 2 is Deepika's Salary.
value = df_actor.iat[3, 2] 
display(value, type(value))
# iloc with two integers also returns a single cell: row 2, column 0 is the Name 'Shahrukh'.
value = df_actor.iloc[2,0] 
value, type(value)

60000

numpy.int64

('Shahrukh', str)

### 2. Add a row.

In [9]:
# Add a row to a copy so that df_actor itself stays untouched.
df1 = df_actor.copy()
list_row = ['Pankaj', 47, 50000]
# Label the new row with the next integer after the current 0..n-1 index.
# len(df1) is the length of the frame being grown, so this stays correct
# even if df1 no longer has the same number of rows as df_actor.
df1.loc[len(df1)] = list_row
display(df1)

# A one-row DataFrame can be built column-wise (dict of lists) or
# record-wise (list of dicts); the assert confirms both give the same frame.
df_new_by_columns = pd.DataFrame({'Name': ['Ranveer'], 'Age': [38], 'Salary': [50000]})
df_new = pd.DataFrame([{'Name': 'Ranveer', 'Age': 38, 'Salary': 50000}])
assert df_new.equals(df_new_by_columns)

# ignore_index=True renumbers the rows 0..n-1 instead of keeping df_new's own label 0.
df1 = pd.concat([df1, df_new], ignore_index=True)
display(df1)

# Idea (untested here): a column could probably be added the same way,
# by concatenating a Series with axis=1.


Unnamed: 0,Name,Age,Salary
0,Alia,30,50000
1,Amitabh,80,65000
2,Shahrukh,57,75000
3,Deepika,37,60000
4,Pankaj,47,50000


Unnamed: 0,Name,Age,Salary
0,Alia,30,50000
1,Amitabh,80,65000
2,Shahrukh,57,75000
3,Deepika,37,60000
4,Pankaj,47,50000
5,Ranveer,38,50000


Insert a gender column.

In [None]:
# Show the frame before the new column is added.
display(df1)
# One entry per existing row, in row order; the list length must equal len(df1) (6 rows here).
df1['Gender'] = ['F','M','M','F','M','M']
df1

`groupby`, etc

In [None]:
# Work on a copy so that df1 is left as it is.
df2 = df1.copy()
# groupby() returns a GroupBy object; nothing is aggregated until a
# method such as sum() is called on it.
g = df2.groupby('Gender')
print(g)
# numeric_only=True restricts the sum to Age and Salary. Without it,
# pandas >= 2.0 also "sums" the Name strings by concatenating them.
print('g.sum =', g.sum(numeric_only=True))


Understanding `loc`, `iloc`, `at`, `iat`:

In [11]:
import pandas as pd

print('Pandas version =', pd.__version__)

# Each cell holds '<row number><column number>' as a string, so any value
# read back from the frame reveals the coordinate it came from.
n_rows, n_cols = 5, 4
df = pd.DataFrame(
    {f'col_{j}': [f'{i}{j}' for i in range(n_rows)] for j in range(n_cols)},
    index=[f'row_{i}' for i in range(n_rows)],
)
print('Our DataFrame:')
df
# Expected output:
#       col_0 col_1 col_2 col_3
# row_0    00    01    02    03
# row_1    10    11    12    13
# row_2    20    21    22    23
# row_3    30    31    32    33
# row_4    40    41    42    43

Pandas version = 2.0.3
Our DataFrame:


Unnamed: 0,col_0,col_1,col_2,col_3
row_0,0,1,2,3
row_1,10,11,12,13
row_2,20,21,22,23
row_3,30,31,32,33
row_4,40,41,42,43


*All four indexers can read a single value by its (row, column) coordinate: `at` and `loc` take the row and column labels, while `iat` and `iloc` take the integer positions.*

In [12]:
# One cell of the frame, addressed two ways: by integer position (r, c)
# and by the matching row/column labels.
r, c = 2, 3
rname, cname = 'row_2', 'col_3'

# Label-based access.
print(df.at[rname, cname])
print(df.loc[rname, cname])

# Position-based access.
print(df.iat[r, c])
print(df.iloc[r, c])



23
23
23
23


*All can update values as well.*

*However, `at` and `iat` can only operate on scalar (single) values.*

In [None]:
# Further reading: https://note.nkmk.me/en/python-pandas-at-iat-loc-iloc/

Replace a value.

(a) Replace a value by exact match.

In [13]:
import numpy as np
# mask() blanks out (NaN) every cell where the condition is True.
df1 = df.mask(df == '32')
display(df1)
# replace() returns a new frame and leaves df untouched, so no copy of df
# is needed beforehand.
df2 = df.replace('32', '23')
display(df2)
# Note: this step starts again from the masked frame df1, not from the
# frame displayed just above, so only the original '23' becomes '100'.
df2 = df1.replace('23', '100')
display(df2)
# Fill the cell that mask() blanked out with a sentinel string.
df2 = df2.replace(to_replace=np.nan, value='-9999')
df2


Unnamed: 0,col_0,col_1,col_2,col_3
row_0,0,1,2.0,3
row_1,10,11,12.0,13
row_2,20,21,22.0,23
row_3,30,31,,33
row_4,40,41,42.0,43


Unnamed: 0,col_0,col_1,col_2,col_3
row_0,0,1,2,3
row_1,10,11,12,13
row_2,20,21,22,23
row_3,30,31,23,33
row_4,40,41,42,43


Unnamed: 0,col_0,col_1,col_2,col_3
row_0,0,1,2.0,3
row_1,10,11,12.0,13
row_2,20,21,22.0,100
row_3,30,31,,33
row_4,40,41,42.0,43


Unnamed: 0,col_0,col_1,col_2,col_3
row_0,0,1,2,3
row_1,10,11,12,13
row_2,20,21,22,100
row_3,30,31,-9999,33
row_4,40,41,42,43


**Select all rows except the last, then only the first and last rows**

In [18]:
# A slice that stops at -1 keeps every row except the last one.
display(df2.iloc[:-1])

# A list of positions picks exactly those rows: the first (0) and the last (-1).
df3 = df2.iloc[[0, -1]]
df3

Unnamed: 0,col_0,col_1,col_2,col_3
row_0,0,1,2,3
row_1,10,11,12,13
row_2,20,21,22,100
row_3,30,31,-9999,33


Unnamed: 0,col_0,col_1,col_2,col_3
row_0,0,1,2,3
row_4,40,41,42,43


In [7]:
# Small numeric frame in which 49.50 occurs once in each column.
data = dict(Array_1=[49.50, 70], Array_2=[65.1, 49.50])
df3 = pd.DataFrame(data)

# replace() swaps 49.50 for 60 in every column and returns a new frame;
# the result is only displayed, so df3 itself still holds 49.50.
df3.replace(49.50, 60)

Unnamed: 0,Array_1,Array_2
0,60.0,65.1
1,70.0,60.0


## Display first and last rows only

In [22]:
# Four-row frame: column 'a' counts 1..4, column 'b' holds one letter per row.
dfx = pd.DataFrame({'a': range(1, 5), 'b': list('abcd')})
display(dfx)

# Positions 0 and -1 select the first and the last row.
df2x = dfx.iloc[[0, -1]]
df2x

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c
3,4,d


Unnamed: 0,a,b
0,1,a
3,4,d


In [None]:
# Read one cell with a single indexer call (row label 1, column 'Array_1')
# rather than chaining [] twice, which first extracts the whole column.
df3.at[1, 'Array_1']

`groupby`

In [None]:
import pandas as pd

# Small all-integer frame used to demonstrate groupby().sum().
df_fruit = pd.DataFrame(
    {
        "Apple": [5, 2, 7, 0],
        "Banana": [4, 7, 5, 1],
        "Carrot": [9, 3, 5, 1],
    }
)
print("Input DataFrame 1 is:\n", df_fruit)
# Group df_fruit itself: it is the frame that has the 'Apple' column.
# Each distinct Apple value becomes one row of the result, holding the
# per-group sums of the remaining columns.
g_sum = df_fruit.groupby('Apple').sum()
print("Group by Apple is:\n", g_sum)

In [None]:
import pandas as pd

# These text columns are loaded with the pandas "category" dtype.
category_columns = ["first_name", "gender", "type", "state", "party"]
dtypes = dict.fromkeys(category_columns, "category")

# Read only the categorical columns plus birthday (parsed as a date) and last_name.
df = pd.read_csv(
    "groupby-data/legislators-historical.csv",
    usecols=[*dtypes, "birthday", "last_name"],
    dtype=dtypes,
    parse_dates=["birthday"],
)