# Pandas

- Pandas is a Python library.

- Pandas is used to analyze data.

- "Pandas" has a reference to both "Panel Data", and "Python Data Analysis" and was created by Wes McKinney in 2008.

## Dictionary

In [1]:
x = {
    'Name': 'TinTin',
    'Age': 999,
    'Key': 'value'
}

In [2]:
x['Name']

'TinTin'

In [None]:
x = {
    [1, 2, 3], [4, 5, 6]
}
# Error ?

In [None]:
x = {
    'key1':[1, 2, 3],
    'key2':[4, 5, 6]
}
display(x)

In [3]:
# column-wise -- dict of list
columnwise_dict = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age":  [25, 30, 35],
    "City": ["New York", "London", "Paris"]
}

columnwise_dict

{'Name': ['Alice', 'Bob', 'Charlie'],
 'Age': [25, 30, 35],
 'City': ['New York', 'London', 'Paris']}

## DataFrame

In [4]:
import pandas as pd

df_columnwise = pd.DataFrame(columnwise_dict)
df_columnwise.head()

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,London
2,Charlie,35,Paris


In [14]:
# dict of list
# columnwise_dict['Name']
columnwise_dict['Name'][0]

'Alice'

In [15]:
# DataFrame column-wise
# df_columnwise['Name']
df_columnwise['Name'][0]

'Alice'

In [8]:
# row-wise -- list of dict
rowwise_dict = [
    {"Name": "Alice", "Age": 25, "City": "New York"},
    {"Name": "Bob", "Age": 30, "City": "London"},
    {"Name": "Charlie", "Age": 35, "City": "Paris"}
]

rowwise_dict

[{'Name': 'Alice', 'Age': 25, 'City': 'New York'},
 {'Name': 'Bob', 'Age': 30, 'City': 'London'},
 {'Name': 'Charlie', 'Age': 35, 'City': 'Paris'}]

In [9]:
import pandas as pd
df_rowwise = pd.DataFrame(rowwise_dict)
df_rowwise.head()

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,London
2,Charlie,35,Paris


In [18]:
# row-wise -- list of dict
rowwise_dict[0]
# rowwise_dict[0]['Name']

{'Name': 'Alice', 'Age': 25, 'City': 'New York'}

In [21]:
df_rowwise['Name'][0]

'Alice'

In [None]:
# dataframe row 0 ?
df_rowwise.iloc[0] # index position
df_rowwise.loc[0] # index label

Name       Alice
Age           25
City    New York
Name: 0, dtype: object

In [24]:
df_rowwise.loc["test"] = {
    "Name": "Test",
    "Age": 99,
    "City": "Test City"
}

In [None]:
display(df_rowwise)

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,London
2,Charlie,35,Paris
test,Test,99,Test City


In [None]:
df_rowwise.loc["test"]

Name         Test
Age            99
City    Test City
Name: test, dtype: object

In [40]:
df_rowwise['Name'][0] 
# df_rowwise.iloc[0, 0] 
# df_rowwise.loc[0, 'Name']

'Alice'

In [34]:
display(df_rowwise[['Name','City']][0:2]) # last index is exclusive

display(df_rowwise.iloc[0:2, [0, 2]]) # index position, last index is exclusive

display(df_rowwise.loc[0:1, ['Name','City']]) #index label, inclusive

Unnamed: 0,Name,City
0,Alice,New York
1,Bob,London


Unnamed: 0,Name,City
0,Alice,New York
1,Bob,London


Unnamed: 0,Name,City
0,Alice,New York
1,Bob,London


## Change DataFrame

In [37]:
import random
import pandas as pd

names = ["Alice", "Bob", "Charlie", "David", "Emma", "Frank", "Grace", "Hannah", "Ian", "Jack"]
ages = [random.randint(18, 65) for _ in range(10)]
cities = ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose"]

# Generate random data
data = {
    "Name": names,
    "Age": ages,
    "City": cities
}

# Create DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Alice,46,New York
1,Bob,28,Los Angeles
2,Charlie,45,Chicago
3,David,18,Houston
4,Emma,49,Phoenix
5,Frank,35,Philadelphia
6,Grace,60,San Antonio
7,Hannah,25,San Diego
8,Ian,64,Dallas
9,Jack,25,San Jose


In [41]:
# Change value

df.loc[0, 'Name'] = 'TinTin'
df

Unnamed: 0,Name,Age,City
0,TinTin,46,New York
1,Bob,28,Los Angeles
2,Charlie,45,Chicago
3,David,18,Houston
4,Emma,49,Phoenix
5,Frank,35,Philadelphia
6,Grace,60,San Antonio
7,Hannah,25,San Diego
8,Ian,64,Dallas
9,Jack,25,San Jose


In [42]:
# df['Name'][0] = 'Alice'
# df

In [46]:
#Adding a new row to df_rowwise
new_row = pd.DataFrame([{'Name':'David', 'Age':40, 'City':'Tokyo'}])
df = pd.concat([df, new_row], ignore_index=True)
df

Unnamed: 0,Name,Age,City
0,TinTin,46,New York
1,Bob,28,Los Angeles
2,Charlie,45,Chicago
3,David,18,Houston
4,Emma,49,Phoenix
5,Frank,35,Philadelphia
6,Grace,60,San Antonio
7,Hannah,25,San Diego
8,Ian,64,Dallas
9,Jack,25,San Jose


In [47]:
df.loc[99] = ['David2', 40, 'Tokyo']
df

Unnamed: 0,Name,Age,City
0,TinTin,46,New York
1,Bob,28,Los Angeles
2,Charlie,45,Chicago
3,David,18,Houston
4,Emma,49,Phoenix
5,Frank,35,Philadelphia
6,Grace,60,San Antonio
7,Hannah,25,San Diego
8,Ian,64,Dallas
9,Jack,25,San Jose


In [48]:
# Adding a new column to df_rowwise
display("add column")
df['Country'] = "test"
display(df)

'add column'

Unnamed: 0,Name,Age,City,Country
0,TinTin,46,New York,test
1,Bob,28,Los Angeles,test
2,Charlie,45,Chicago,test
3,David,18,Houston,test
4,Emma,49,Phoenix,test
5,Frank,35,Philadelphia,test
6,Grace,60,San Antonio,test
7,Hannah,25,San Diego,test
8,Ian,64,Dallas,test
9,Jack,25,San Jose,test


In [49]:
# remove column
display("remove column")
df = df.drop(columns=['Country'])
display(df)

'remove column'

Unnamed: 0,Name,Age,City
0,TinTin,46,New York
1,Bob,28,Los Angeles
2,Charlie,45,Chicago
3,David,18,Houston
4,Emma,49,Phoenix
5,Frank,35,Philadelphia
6,Grace,60,San Antonio
7,Hannah,25,San Diego
8,Ian,64,Dallas
9,Jack,25,San Jose


In [51]:
# # remove row
display("remove row")
df = df.drop(index=[10])
display(df)

'remove row'

Unnamed: 0,Name,Age,City
0,TinTin,46,New York
1,Bob,28,Los Angeles
2,Charlie,45,Chicago
3,David,18,Houston
4,Emma,49,Phoenix
5,Frank,35,Philadelphia
6,Grace,60,San Antonio
7,Hannah,25,San Diego
8,Ian,64,Dallas
9,Jack,25,San Jose


In [52]:
# # reset index
display("reset index")
df = df.reset_index(drop=True)
display(df)

'reset index'

Unnamed: 0,Name,Age,City
0,TinTin,46,New York
1,Bob,28,Los Angeles
2,Charlie,45,Chicago
3,David,18,Houston
4,Emma,49,Phoenix
5,Frank,35,Philadelphia
6,Grace,60,San Antonio
7,Hannah,25,San Diego
8,Ian,64,Dallas
9,Jack,25,San Jose


In [53]:
# Filter
df_filtered = df[df['Age'] > 40]
df_filtered

Unnamed: 0,Name,Age,City
0,TinTin,46,New York
2,Charlie,45,Chicago
4,Emma,49,Phoenix
6,Grace,60,San Antonio
8,Ian,64,Dallas


In [54]:
df_filtered = df[(df['Age'] > 40) & (df['City'] == 'Chicago')]
df_filtered

Unnamed: 0,Name,Age,City
2,Charlie,45,Chicago


### Dataframe to Dictionary

In [55]:
import pandas as pd

data = {
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7,9,9],
    'D': ['X', 'Y', 'Z']
}
# Create DataFrame
df = pd.DataFrame(data)
display(df)

dict_records = df.to_dict(orient='records')
dict_columns = df.to_dict(orient='list')
dict_index = df.to_dict(orient='index')

print("Records:\n", dict_records)
print("Columns:\n", dict_columns)
print("Index:\n", dict_index)

Unnamed: 0,A,B,C,D
0,1,4,7,X
1,2,5,9,Y
2,3,6,9,Z


Records:
 [{'A': 1, 'B': 4, 'C': 7, 'D': 'X'}, {'A': 2, 'B': 5, 'C': 9, 'D': 'Y'}, {'A': 3, 'B': 6, 'C': 9, 'D': 'Z'}]
Columns:
 {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 9, 9], 'D': ['X', 'Y', 'Z']}
Index:
 {0: {'A': 1, 'B': 4, 'C': 7, 'D': 'X'}, 1: {'A': 2, 'B': 5, 'C': 9, 'D': 'Y'}, 2: {'A': 3, 'B': 6, 'C': 9, 'D': 'Z'}}


In [57]:
# Save DataFrame to CSV
df.to_csv("df.csv", index=False)

In [58]:
# Load DataFrame from CSV
df = pd.read_csv("df.csv")

In [59]:
display(df)

Unnamed: 0,A,B,C,D
0,1,4,7,X
1,2,5,9,Y
2,3,6,9,Z


### Numpy Array to Dataframe

In [60]:
import pandas as pd
import numpy as np

arr = np.array([[1, 2, 3], [4, 5, 6]])
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])
display(df)

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6


In [62]:
# Compare performance: list vs dict vs pandas DataFrame
import time
import random
import numpy as np
import pandas as pd

# Generate data
n = 10000000
py_list = [random.randint(0, 100) for _ in range(n)]
np_array = np.array(py_list)
df = pd.DataFrame({'col': py_list})
py_dict = {'col': py_list}

# Sum operation timing
start = time.time()
sum_list = sum(py_list)
print(f"List sum: {time.time() - start:.5f} seconds")

start = time.time()
sum_dict = sum(py_dict['col'])
print(f"Dict sum: {time.time() - start:.5f} seconds")

start = time.time()
sum_df = df['col'].sum()
print(f"Pandas DataFrame sum: {time.time() - start:.5f} seconds")

List sum: 0.03060 seconds
Dict sum: 0.02698 seconds
Pandas DataFrame sum: 0.00235 seconds


### DataFrame Methods for Working with Data

https://www.geeksforgeeks.org/python-pandas-dataframe/

|FUNCTION|DESCRIPTION|
|---|---|
|index()|Method returns index (row labels) of the DataFrame|
|insert()|Method inserts a column into a DataFrame|
|add()|Method returns addition of dataframe and other, element-wise (binary operator add)|
|sub()|Method returns subtraction of dataframe and other element-wise (binary operator sub)|
|mul()|Method returns multiplication of dataframe and other, element-wise (binary operator mul)|
|div()|Method returns floating division of dataframe and other element-wise (binary operator truediv)|
|unique()|Method extracts the unique values in the dataframe|
|nunique()|Method returns count of the unique values in the dataframe|
|value_counts()|Method counts the number of times each unique value occurs within the Series|
|columns()|Method returns the column labels of the DataFrame|
|axes()|Method returns a list representing the axes of the DataFrame|
|isnull()|Method creates a Boolean Series for extracting rows with null values|
|notnull()|Method creates a Boolean Series for extracting rows with non-null values|
|isin()|Method extracts rows from a DataFrame where a column value exists in a predefined collection|
|dtypes()|Method returns a Series with the data type of each column. The result’s index is the original DataFrame’s columns|
|astype()|Method converts the data types in a Series|
|values()|Method returns a Numpy representation of the DataFrame i.e only the values in the DataFrame will be returned, the axes labels will be removed|
|sort_values()|Method sorts a data frame in Ascending or Descending order of passed Column|
|sort_index()|Method sorts the values in a DataFrame based on their index positions or labels instead of their values but sometimes a data frame is made out of two or more data frames and hence later index can be changed using this method|
|loc[]|Method retrieves rows based on index label|
|iloc[]|Method retrieves rows based on index position|
|ix[]|Method retrieves DataFrame rows based on either index label or index position. This method combines the best features of the .loc[] and .iloc[] methods|
|rename()|Method is called on a DataFrame to change the names of the index labels or column names|
|drop()|Method is used to delete rows or columns from a DataFrame|
|pop()|Method is used to delete rows or columns from a DataFrame|
|sample()|Method pulls out a random sample of rows or columns from a DataFrame|
|nsmallest()|Method pulls out the rows with the smallest values in a column|
|nlargest()|Method pulls out the rows with the largest values in a column|
|shape()|Method returns a tuple representing the dimensionality of the DataFrame|
|ndim()|Method returns an ‘int’ representing the number of axes / array dimensions. Returns 1 if Series, otherwise returns 2 if DataFrame|
|dropna()|Method allows the user to analyze and drop Rows/Columns with Null values in different ways|
|fillna()|Method manages and let the user replace NaN values with some value of their own|
|rank()|Values in a Series can be ranked in order with this method|
|query()|Method is an alternate string-based syntax for extracting a subset from a DataFrame|
|copy()|Method creates an independent copy of a pandas object|
|duplicated()|Method creates a Boolean Series and uses it to extract rows that have duplicate values|
|drop_duplicates()|Method is an alternative option to identifying duplicate rows and removing them through filtering|
|set_index()|Method sets the DataFrame index (row labels) using one or more existing columns|
|reset_index()|Method resets index of a Data Frame. This method sets a list of integer ranging from 0 to length of data as index|
|where()|Method is used to check a Data Frame for one or more condition and return the result accordingly. By default, the rows not satisfying the condition are filled with NaN value|