About pandas.DataFrame.iloc


In [5]:
import pandas as pd

# Build a small demo frame: one row per person, default 0..6 integer index.
data = pd.DataFrame(
    {
        "Name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace"],
        "Age": [25, 30, 35, 40, 45, 24, 29],
        "Gender": ["F", "M", "M", "M", "F", "M", "F"],
    }
)

# Show the whole frame first, followed by a blank separator line.
print(data, end=" \n\n")

# Each entry pairs the iloc expression (as text) with its evaluated result.
# iloc is purely position-based: rows first, then columns.
for label, selection in [
    # First row, returned as a Series keyed by column name
    ("data.iloc[0]", data.iloc[0]),
    # Scalar at the third row and second column
    ("data.iloc[2, 1]", data.iloc[2, 1]),
    # Every row of the first column
    ("data.iloc[:, 0]", data.iloc[:, 0]),
    # Rows at positions 0, 2 and 4 (data.iloc[[0, 2, 4], :] is equivalent)
    ("data.iloc[[0, 2, 4]]", data.iloc[[0, 2, 4]]),
]:
    print(label, selection, sep=" \n ", end=" \n\n")

      Name  Age Gender
0    Alice   25      F
1      Bob   30      M
2  Charlie   35      M
3    David   40      M
4      Eve   45      F
5    Frank   24      M
6    Grace   29      F 

data.iloc[0] 
 Name      Alice
Age          25
Gender        F
Name: 0, dtype: object 

data.iloc[2, 1] 
 35 

data.iloc[:, 0] 
 0      Alice
1        Bob
2    Charlie
3      David
4        Eve
5      Frank
6      Grace
Name: Name, dtype: object 

data.iloc[[0, 2, 4]] 
       Name  Age Gender
0    Alice   25      F
2  Charlie   35      M
4      Eve   45      F 



In [6]:
import numpy as np

# Fix the legacy global seed so the permutation below is reproducible.
np.random.seed(42)
shuffled_indices = np.random.permutation(100)

# The first `test_set_size` shuffled positions form the test split,
# everything after that boundary is the training split.
test_set_size = 20
test_indices, train_indices = np.split(shuffled_indices, [test_set_size])

# Each line prints a header (type or length), a line break, then the array.
print(type(shuffled_indices), shuffled_indices, sep=" \n ")
print(len(test_indices), test_indices, sep=" \n ")
print(len(train_indices), train_indices, sep=" \n ")

<class 'numpy.ndarray'> 
 [83 53 70 45 44 39 22 80 10  0 18 30 73 33 90  4 76 77 12 31 55 88 26 42
 69 15 40 96  9 72 11 47 85 28 93  5 66 65 35 16 49 34  7 95 27 19 81 25
 62 13 24  3 17 38  8 78  6 64 36 89 56 99 54 43 50 67 46 68 61 97 79 41
 58 48 98 57 75 32 94 59 63 84 37 29  1 52 21  2 23 87 91 74 86 82 20 60
 71 14 92 51]
20 
 [83 53 70 45 44 39 22 80 10  0 18 30 73 33 90  4 76 77 12 31]
80 
 [55 88 26 42 69 15 40 96  9 72 11 47 85 28 93  5 66 65 35 16 49 34  7 95
 27 19 81 25 62 13 24  3 17 38  8 78  6 64 36 89 56 99 54 43 50 67 46 68
 61 97 79 41 58 48 98 57 75 32 94 59 63 84 37 29  1 52 21  2 23 87 91 74
 86 82 20 60 71 14 92 51]


In [7]:
from zlib import crc32  # Import the crc32 function from the zlib module
import numpy as np     # Import NumPy for handling numerical operations

# Print the CRC32 checksum of two sample integers (22, then 44).
# Each integer is first cast to a 64-bit NumPy integer and serialised to
# its raw bytes, because crc32 operates on bytes rather than on numbers.
for value in (22, 44):
    print(crc32(np.int64(value).tobytes()))

# Print 2 to the power of 32 (4294967296): the number of distinct values
# available in a 32-bit space, for comparison with the checksums above.
print(2**32)

3757500357
3422267504
4294967296


In [154]:
# Show the object's type, a line break, and then the frame itself.
print(type(data), data, sep=" \n ")

<class 'pandas.core.frame.DataFrame'> 
       Name  Age Gender
0    Alice   25      F
1      Bob   30      M
2  Charlie   35      M
3    David   40      M
4      Eve   45      F
5    Frank   24      M
6    Grace   29      F


In [155]:
# Turn the row index into an ordinary "index" column (drop=False keeps it),
# giving every row an identifier column that can be hashed later.
data_with_id = data.reset_index(drop=False)
data_with_id

Unnamed: 0,index,Name,Age,Gender
0,0,Alice,25,F
1,1,Bob,30,M
2,2,Charlie,35,M
3,3,David,40,M
4,4,Eve,45,F
5,5,Frank,24,M
6,6,Grace,29,F


In [156]:
def is_id_in_test_set(identifier, test_ratio):
    """Return True when the hashed identifier lands in the test-set bucket.

    The identifier is cast to a 64-bit integer and hashed with CRC32, which
    returns a number in the range 0 to 2**32 - 1. The identifier belongs to
    the test set when its checksum is strictly below ``test_ratio * 2**32``.

    Args:
        identifier: Value convertible to ``np.int64``.
        test_ratio: Fraction of the 32-bit checksum range assigned to the
            test set.

    Returns:
        bool: True if the identifier is assigned to the test set.
    """
    checksum = crc32(np.int64(identifier))
    cutoff = test_ratio * 2**32
    return checksum < cutoff


def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into train and test parts based on a hash of each row id.

    Every value in ``data[id_column]`` is passed to ``is_id_in_test_set``;
    rows for which it returns True go to the test part, the rest to the
    train part. The id Series and the resulting boolean mask are printed
    along the way so the intermediate steps can be inspected.

    Args:
        data: DataFrame to split.
        test_ratio: Ratio forwarded to ``is_id_in_test_set``.
        id_column: Name of the column holding the row identifiers.

    Returns:
        tuple: ``(train_part, test_part)``, both selected with ``data.loc``.
    """

    def _belongs_to_test(row_id):
        # Bind the ratio so Series.apply only has to supply the id.
        return is_id_in_test_set(row_id, test_ratio)

    id_values = data[id_column]
    print(type(id_values), "\n", id_values)

    test_mask = id_values.apply(_belongs_to_test)
    print(type(test_mask), "\n", test_mask)

    # Inverting the mask selects the rows that were not assigned to test.
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [157]:
# Send the ids whose checksum falls in the lowest 40% of the hash range to
# the test set, using the "index" column as the row identifier.
train_set, test_set = split_data_with_id_hash(
    data=data_with_id,
    test_ratio=0.4,
    id_column="index",
)

# Training rows first, test rows on the following lines.
print(train_set, test_set, sep="\n")

<class 'pandas.core.series.Series'> 
 0    0
1    1
2    2
3    3
4    4
5    5
6    6
Name: index, dtype: int64
<class 'pandas.core.series.Series'> 
 0     True
1    False
2     True
3    False
4    False
5     True
6    False
Name: index, dtype: bool
   index   Name  Age Gender
1      1    Bob   30      M
3      3  David   40      M
4      4    Eve   45      F
6      6  Grace   29      F
   index     Name  Age Gender
0      0    Alice   25      F
2      2  Charlie   35      M
5      5    Frank   24      M
