This notebook collects a (non-exhaustive) set of examples seen in the DataCamp practice exercises, together with a few additional examples.


- https://practice.datacamp.com/p/5
- https://practice.datacamp.com/p/16


### Some Pandas Methods/Attributes


- df.head()
- df.tail()
- df.info()
- df.shape


In [1]:
import pandas as pd
import numpy as np

#### Numpy Logical Functions


In [2]:
# Two small arrays that are reused by the next few cells.
x = np.array([9, 5])
y = np.array([16, 12])

# Element-wise OR of the two boolean masks: True wherever x < 5 or y > 15.
np.logical_or(x < 5, y > 15)

array([ True, False])

In [3]:
# First operand of the logical_or: element-wise comparison, one boolean per element of x.
x < 5

array([False, False])

In [4]:
# Second operand of the logical_or: element-wise comparison, one boolean per element of y.
y > 15

array([ True, False])

In [None]:
# Python's bitwise OR operator `|` works element-wise on boolean arrays and
# gives the same result as np.logical_or here.
# The parentheses are required: `|` binds tighter than the comparison operators.
(x < 5) | (y > 15)

array([ True, False])

#### pandas.DataFrame.to_numpy


In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_numpy.html
# Signature: DataFrame.to_numpy(dtype=None, copy=False, na_value=<no_default>)

# Column A holds ints and column B holds floats; the resulting array is all floats
# (see the printed output). Writing to the array leaves df unchanged in this case.
df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
arr = df.to_numpy()
arr[0, 1] = 100
print(arr)
display(df)

[[  1.  100. ]
 [  2.    4.5]]


Unnamed: 0,A,B
0,1,3.0
1,2,4.5


In [None]:
# Same mixed int/float frame, this time requesting an explicit copy.
# As before, the edit made through the array does not show up in df.
df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
arr = df.to_numpy(copy=True)
arr[0, 1] = 100
print(arr)
display(df)

[[  1.  100. ]
 [  2.    4.5]]


Unnamed: 0,A,B
0,1,3.0
1,2,4.5


In [None]:
# Both columns are ints here, and the edit made through the array shows up in df
# (see output): the numpy array still references the dataframe's data.
# This behavior is not guaranteed according to the docs.
# NOTE(review): with pandas Copy-on-Write enabled (the default from pandas 3.0) the
# returned view is expected to be read-only, so the assignment below would raise
# instead of modifying df; confirm against the installed pandas version.
df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
arr = df.to_numpy()
arr[0, 1] = 100
print(arr)
display(df)

[[  1 100]
 [  2   4]]


Unnamed: 0,A,B
0,1,100
1,2,4


In [None]:
# Make an explicit copy so that edits to the array can never reach df.
df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
arr = df.to_numpy(copy=True)  # copy=True states the intent explicitly: the array gets its own data
arr[0, 1] = 100
print(arr)
display(df)

[[  1 100]
 [  2   4]]


Unnamed: 0,A,B
0,1,3
1,2,4


In [10]:
# Mixed int/float frame: the array returned by to_numpy() does not share memory
# with df.values (result: False), consistent with the edit not reaching df earlier.
df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
arr = df.to_numpy()
np.shares_memory(arr, df.values)

False

In [11]:
# All-int frame: the array returned by to_numpy() shares memory with df.values
# (result: True), which is why the earlier in-place edit was visible in df.
df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
arr = df.to_numpy()
np.shares_memory(arr, df.values)

True

#### .copy()


In [12]:
# Take a copy of the array from the previous cell; arr1 holds the same values as arr right now.
arr1 = arr.copy()
arr1

array([[1, 3],
       [2, 4]])

In [13]:
# Modify the original array only; the copy taken earlier (arr1) keeps its old value.
arr[0, 1] = 33
print(arr)
print(arr1)

[[ 1 33]
 [ 2  4]]
[[1 3]
 [2 4]]


In [14]:
# Same idea for DataFrames: df1 is a copy of df and starts with identical content.
df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
df1 = df.copy()
df1

Unnamed: 0,A,B
0,1,3
1,2,4


In [15]:
# Overwrite one cell of df by position (row 0, column 1); the copy df1 is unaffected.
df.iloc[0, 1] = 33
display(df)
display(df1)

Unnamed: 0,A,B
0,1,33
1,2,4


Unnamed: 0,A,B
0,1,3
1,2,4


#### Pandas filtering and Dataframe creation


In [None]:
# Example dictionary: each key becomes a column name and each list holds that
# column's values (all lists have the same length).
my_dict = {
    "course": ["dv", "ds", "de", "de"],
    "level": ["beginner", "intermediate", "intermediate", "advanced"],
    "lesson": [5, 10, 15, 20],
}

In [17]:
datacamp = pd.DataFrame(
    my_dict
)  # practice question: which pandas function builds a DataFrame from a dict? -> pd.DataFrame
datacamp

Unnamed: 0,course,level,lesson
0,dv,beginner,5
1,ds,intermediate,10
2,de,intermediate,15
3,de,advanced,20


In [18]:
# Show the full DataFrame again as the reference for the filters that follow.
datacamp

Unnamed: 0,course,level,lesson
0,dv,beginner,5
1,ds,intermediate,10
2,de,intermediate,15
3,de,advanced,20


In [19]:
# Boolean-mask filtering: keep only the rows whose course equals "ds".
datacamp[(datacamp["course"] == "ds")]

Unnamed: 0,course,level,lesson
1,ds,intermediate,10


In [20]:
# Combine two masks with `|` (element-wise OR); each comparison needs its own parentheses.
datacamp[(datacamp["course"] == "ds") | (datacamp["course"] == "de")]

Unnamed: 0,course,level,lesson
1,ds,intermediate,10
2,de,intermediate,15
3,de,advanced,20


In [21]:
# `&` binds tighter than `|`, so it is evaluated first. This filter reads as:
#   course == "ds"  OR  (course == "de" AND lesson >= 15)
# which is why the "ds" row with lesson 10 is still included.
datacamp[
    (datacamp["course"] == "ds")
    | (datacamp["course"] == "de") & (datacamp["lesson"] >= 15)
]

Unnamed: 0,course,level,lesson
1,ds,intermediate,10
2,de,intermediate,15
3,de,advanced,20


In [22]:
# Explicit parentheses force the OR to be evaluated first:
#   (course == "ds" OR course == "de")  AND  lesson >= 15
# so the "ds" row with lesson 10 is now filtered out.
datacamp[
    ((datacamp["course"] == "ds") | (datacamp["course"] == "de"))
    & (datacamp["lesson"] >= 15)
]

Unnamed: 0,course,level,lesson
2,de,intermediate,15
3,de,advanced,20


In [23]:
keys = ["a", "b", "c"]
values = [[1, 2, 3], [11, 22, 33], [10, 20, 30]]

# Pair every key with the list at the same position, then build the
# key -> list mapping from those (key, list) pairs.
zipped = zip(keys, values)
data = {key: column for key, column in zipped}

data

{'a': [1, 2, 3], 'b': [11, 22, 33], 'c': [10, 20, 30]}

In [24]:
# A dict of lists becomes a DataFrame with one column per key.
pd.DataFrame(data)

Unnamed: 0,a,b,c
0,1,11,10
1,2,22,20
2,3,33,30


In [25]:
keys = ["a", "b", "c"]
values = [[1, 2, 3], [11, 22, 33], [10, 20, 30]]

# Materialise the zipped pairs as a list of (key, list) tuples.
[(key, column) for key, column in zip(keys, values)]

[('a', [1, 2, 3]), ('b', [11, 22, 33]), ('c', [10, 20, 30])]

In [26]:
# A list of (key, list) tuples is treated as rows, not columns: column 0 holds the
# key and column 1 holds the whole list as a single cell (compare with the dict version).
pd.DataFrame(list(zip(keys, values)))

Unnamed: 0,0,1
0,a,"[1, 2, 3]"
1,b,"[11, 22, 33]"
2,c,"[10, 20, 30]"


#### Pandas Dataframe Iterrows


- The `iterrows()` method returns an iterator over the rows of the DataFrame, so we can loop over it one row at a time.
- Each iteration yields an `(index, row)` tuple: the row's index label and the row itself as a pandas Series.


In [None]:
# pandas is already imported in the first cell; this re-import is redundant but harmless.
import pandas as pd

# Small example frame used by the iterrows() cells below.
data = {"firstname": ["Sally", "Mary", "John"], "age": [50, 40, 30]}

df = pd.DataFrame(data)
df

Unnamed: 0,firstname,age
0,Sally,50
1,Mary,40
2,John,30


In [None]:
# iterrows() hands back one (index, row) pair per DataFrame row; both parts
# are printed, each on its own line.
for index, row in df.iterrows():
    print(
        f"This is the index: {index}",
        f"This is the row: {row}",
        sep="\n",
    )

This is the index: 0
This is the row: firstname    Sally
age             50
Name: 0, dtype: object
This is the index: 1
This is the row: firstname    Mary
age            40
Name: 1, dtype: object
This is the index: 2
This is the row: firstname    John
age            30
Name: 2, dtype: object


In [None]:
# After unpacking the (index, row) pair, `row` is a Series, so a column value
# can be read by its column name.
for index, row in df.iterrows():
    print(row["firstname"])

Sally
Mary
John


In [None]:
# iterrows() yields (index, row) tuples. Without unpacking, `row` is the whole
# tuple, and indexing a tuple with a column name raises TypeError.
# The error is caught and printed so that the demonstration does not stop
# "Restart Kernel -> Run All".
try:
    for row in df.iterrows():
        print(row["firstname"])
except TypeError as exc:
    print(f"TypeError: {exc}")

TypeError: tuple indices must be integers or slices, not str

#### Use of loc and iloc


In [None]:
# Column-oriented data for the BRICS countries: each key becomes a column.
# Named `brics_data` rather than `dict` so the built-in `dict` type is not
# shadowed for the rest of the kernel session (re-running the earlier
# `dict(zipped)` cell would otherwise fail with "'dict' object is not callable").
brics_data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, 3.286, 9.597, 1.221],
    "population": [200.4, 143.5, 1252, 1357, 52.98],
}

brics = pd.DataFrame(brics_data)
brics

In [None]:
brics.values  # returns the data as a NumPy array (row and column labels are not included)

In [None]:
# (number of rows, number of columns): 5 countries x 4 columns for this frame.
brics.shape

In [None]:
# A single column label returns that column as a Series.
brics["country"]

In [None]:
# A list of labels (note the double brackets) returns a DataFrame, here with one column.
brics[["country"]]

In [None]:
# Several labels in the list select several columns, in the order given.
brics[["country", "area"]]

In [None]:
# .loc[rows, columns] selects by label: `:` keeps all rows; a single column label gives a Series.
brics.loc[:, "country"]

In [None]:
# Same selection with the column label in a list: the result is a one-column DataFrame.
brics.loc[:, ["country"]]

In [None]:
# Lists on both axes: the rows labelled 0 and 2, and the columns "country" and "area".
brics.loc[[0, 2], ["country", "area"]]

In [None]:
# Slice syntax (`0:2`) is only valid directly inside the indexing brackets;
# wrapping it in a list literal (`[0:2]`) is a SyntaxError.
# A SyntaxError makes the whole cell fail at parse time and would stop
# "Restart Kernel -> Run All", so the invalid statement is kept as a string and
# compiled here purely to show the error without breaking the notebook.
invalid_indexing = "brics.loc[[0:2], ['country', 'area']]"
syntax_error_message = None
try:
    compile(invalid_indexing, "<invalid-loc-demo>", "eval")
except SyntaxError as exc:
    syntax_error_message = exc.msg
    print(f"SyntaxError: {syntax_error_message}")

In [None]:
# The valid form: a label slice written directly in .loc. Label slices include
# BOTH endpoints, so this selects the rows labelled 0, 1 and 2.
brics.loc[0:2, ["country", "area"]]

In [None]:
# .iloc selects by integer position: all rows of the first column, returned as a Series.
brics.iloc[:, 0]

In [None]:
# Position given as a list: the same column, returned as a one-column DataFrame.
brics.iloc[:, [0]]

In [None]:
# Positional slices exclude the end position (unlike .loc label slices):
# rows at positions 1 and 2, columns at positions 0 and 1.
brics.iloc[1:3, 0:2]

#### Quantiles


https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.quantile.html


In [None]:
# Passing a list of quantiles returns one value per requested quantile.
brics["area"].quantile([0.1, 0.5, 0.9])  # 10%, 50%, 90%

#### Reindex


In [None]:
# Example frame built without an explicit index, so rows get the default integer labels.
data = {"age": [50, 40, 30, 40], "qualified": [True, False, False, False]}

df = pd.DataFrame(data)
df

- https://www.w3schools.com/python/pandas/ref_df_reindex.asp
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reindex.html


In [None]:
# pandas is already imported in the first cell; this re-import is redundant but harmless.
import pandas as pd

data = {"age": [50, 40, 30, 40], "qualified": [True, False, False, False]}

# Row labels to use instead of the default integer index (one label per row).
idx = ["Sally", "Mary", "John", "Monica"]

df = pd.DataFrame(data, index=idx)
df

In [None]:
# Note: reindex returns a NEW DataFrame conformed to the given labels; it does NOT
# change the original DataFrame. The result here is only displayed, not stored.
newidx = ["Robert", "Cindy", "John", "Monica"]
df.reindex(newidx)

In [None]:
# Check: df still has its original index and values after the reindex call above.
df

In [None]:
# Note: labels that are not in the original index ("Robert", "Cindy") get rows filled
# with NaN; labels that already exist ("John", "Monica") keep their values.

newidx = ["Robert", "Cindy", "John", "Monica"]

# Keep the reindexed result by assigning it to a new name.
newdf = df.reindex(newidx)
newdf

In [None]:
newidx = ["Robert", "Cindy", "John", "Monica"]

# fill_value replaces the default NaN in rows whose label was not in the original index.
newdf = df.reindex(newidx, fill_value="missing")
newdf