<h1 align="center" style="color: orange"> Pandas I </h1>

In [2]:
import numpy as np
import pandas as pd

### Series

In [5]:
# From list (default integer index)
s1 = pd.Series([10, 20, 30, 40])
print(f"Series s1:\n{s1}")
print("--------------------------------\n")


# With custom index
s2 = pd.Series([10, 20, 30, 40], 
               index=['a', 'b', 'c', 'd'],
               name='values')
print(f"Series s2 with custom index:\n{s2}")
print("--------------------------------\n")

# From dictionary (keys become index)
data_dict = {'US': 46, 'China': 38, 'Japan': 27, 'GB': 22}
medals = pd.Series(data_dict, name='Gold_Medals')
print(f"Series medals (from dict):\n{medals}")
print("--------------------------------\n")

# From scalar (broadcasts to all indices)
s3 = pd.Series(5.0, index=['a', 'b', 'c', 'd'])
print(f"Series s3 (from scalar):\n{s3}")
print("--------------------------------\n")

# From NumPy array
arr = np.random.randn(5)
s4 = pd.Series(arr, index=['v', 'w', 'x', 'y', 'z'])
print(f"Series s4 (from Numpy array):\n{s4}")

Series s1:
0    10
1    20
2    30
3    40
dtype: int64
--------------------------------

Series s2 with custom index:
a    10
b    20
c    30
d    40
Name: values, dtype: int64
--------------------------------

Series medals (from dict):
US       46
China    38
Japan    27
GB       22
Name: Gold_Medals, dtype: int64
--------------------------------

Series s3 (from scalar):
a    5.0
b    5.0
c    5.0
d    5.0
dtype: float64
--------------------------------

Series s4 (from Numpy array):
v    1.598080
w   -0.710631
x    0.210101
y    0.351109
z   -0.294023
dtype: float64


### Series Attributes

In [None]:
medals = pd.Series({'US': 46, 'China': 38, 'Japan': 27, 'GB': 22}, 
                   name='Gold_Medals')

# Core attributes
print(f"Values: {medals.values}")       # NumPy array
print(f"\nIndex: {medals.index}")       # Index object
print(f"\nName: {medals.name}")         # 'Gold_Medals'
print(f"\nDtype: {medals.dtype}")       # int64
print(f"\nSize: {medals.size}")         # 4 (total elements including NaN)
print(f"\nCount: {medals.count()}")     # 4 (non-NaN values)
print(f"\nShape: {medals.shape}")       # (4,)
print(f"\nMemory: {medals.memory_usage()} bytes")

Values: [46 38 27 22]

Index: Index(['US', 'China', 'Japan', 'GB'], dtype='object')

Name: Gold_Medals

Dtype: int64

Size: 4

Count: 4

Shape: (4,)

Memory: 64 bytes


### Series Operations

In [8]:
# Arithmetic operations (element-wise)
s = pd.Series([1, 2, 3, 4])
print(f"Adding Scalar :\n{s + 10}")
print(f"\nMultiply with Scalar :\n{s * 2}")
print(f"\nPower :\n{s ** 2}")
print("--------------------------------\n")

# Operations between Series (aligns on index)
s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
result = s1 + s2
print(f"Result :\n{s3}")
print("--------------------------------\n")
# a     11
# b     22
# c     33
# d    NaN  (no matching index in s1)

# Aggregations
data = pd.Series([1, 2, 3, 4, 5])
print(f"Sum: {data.sum()}")
print(f"\nMean: {data.mean()}")
print(f"\nStd: {data.std()}")
print(f"\nMin: {data.min()}")
print(f"\nMax: {data.max()}")
print(f"\nMedian: {data.median()}")
print("--------------------------------\n")

# Statistical summary
print(f"Summary Statistic :\n{data.describe()}")

Adding Scalar :
0    11
1    12
2    13
3    14
dtype: int64

Multiply with Scalar :
0    2
1    4
2    6
3    8
dtype: int64

Power :
0     1
1     4
2     9
3    16
dtype: int64
--------------------------------

Result :
a    5.0
b    5.0
c    5.0
d    5.0
dtype: float64
--------------------------------

Sum: 15

Mean: 3.0

Std: 1.5811388300841898

Min: 1

Max: 5

Median: 3.0
--------------------------------

Summary Statistic :
count    5.000000
mean     3.000000
std      1.581139
min      1.000000
25%      2.000000
50%      3.000000
75%      4.000000
max      5.000000
dtype: float64


### Series Indexing

In [None]:
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])

# By label (explicit indexing)
print("Explicit Indexing")
print(f"s['b']: {s['b']}")  # 20
print(f"s[['a', 'c']]: \n{s[['a', 'c']]}")  # Multiple labels
print("--------------------------------\n")

# By position (implicit indexing)
print("Implicit Indexing")
print(f"s.iloc[1]: {s.iloc[1]}")    # 20
print(f"s.iloc[0:2]: \n{s.iloc[0:2]}")  # First two elements
print("--------------------------------\n")

# Boolean indexing
print("Boolean Indexing")
print(f"s[s > 20]: \n{s[s > 20]}")  # Values > 20
print("--------------------------------\n")

# Fancy indexing
print("Fancy Indexing")
indices = [0, 2]
print(f"s.iloc[indices]: {s.iloc[indices]}")

Explicit Indexing
s['b']: 20
s[['a', 'c']]: 
a    10
c    30
dtype: int64
--------------------------------

Implicit Indexing
s.iloc[1]: 20
s.iloc[0:2]: 
a    10
b    20
dtype: int64
--------------------------------

Boolean Indexing
s[s > 20]: 
c    30
d    40
dtype: int64
--------------------------------

Fancy Indexing
s.iloc[indices]: a    10
c    30
dtype: int64


### Series Methods

In [None]:
s = pd.Series([3, 1, 4, 1, 5, 9, 2, 6], name='numbers')
print(f"Series s :\n{s}")
print("--------------------------------\n")

# Sorting
print("\nSorting")
print(f"s.sort_values(): {s.sort_values()}")        # Sort by values
print(f"s.sort_index(): {s.sort_index()}")      # Sort by index
print("--------------------------------\n")

# Unique values
print("Unique values")
print(f"s.unique(): {s.unique()}")      # Array of unique values
print(f"s.nunique(): {s.nunique()}")        # Count of unique values
print(f"s.value_counts(): {s.value_counts()}")      # Frequency of each value
print("--------------------------------\n")

# Ranking
print("Ranking")
print(f"s.rank(): {s.rank()}")      # Assign ranks to values
print("--------------------------------\n")

# Checking for duplicates
print("Checking for duplicates")
print(f"s.duplicated(): {s.duplicated()}")      # Boolean mask
print(f"s.drop_duplicates(): {s.drop_duplicates()}")        # Remove duplicates
print("--------------------------------\n")

# String operations (if dtype is object/string)
print("String Operations")
s_str = pd.Series(['apple', 'banana', 'cherry'])
print(f"s_str.str.upper():\n{s_str.str.upper()}")       # 'APPLE', 'BANANA', 'CHERRY'
print(f"s_str.str.contains('an'):\n{s_str.str.contains('an')}")     # Boolean mask
print("--------------------------------\n")

# Apply custom function
def square_if_even(x):
    return x**2 if x % 2 == 0 else x

print(f"s.apply(square_if_even): {s.apply(square_if_even)}")

# Map values (dictionary-based transformation)
mapping = {1: 'one', 2: 'two', 3: 'three'}
print(f"s.map(mapping): {s.map(mapping)}")      # NaN for unmapped values

Series s :
0    3
1    1
2    4
3    1
4    5
5    9
6    2
7    6
Name: numbers, dtype: int64
--------------------------------


Sorting
s.sort_values(): 1    1
3    1
6    2
0    3
2    4
4    5
7    6
5    9
Name: numbers, dtype: int64
s.sort_index(): 0    3
1    1
2    4
3    1
4    5
5    9
6    2
7    6
Name: numbers, dtype: int64
--------------------------------

Unique values
s.unique(): [3 1 4 5 9 2 6]
s.nunique(): 7
s.value_counts(): numbers
1    2
3    1
4    1
5    1
9    1
2    1
6    1
Name: count, dtype: int64
--------------------------------

Ranking
s.rank(): 0    4.0
1    1.5
2    5.0
3    1.5
4    6.0
5    8.0
6    3.0
7    7.0
Name: numbers, dtype: float64
--------------------------------

Checking for duplicates
s.duplicated(): 0    False
1    False
2    False
3     True
4    False
5    False
6    False
7    False
Name: numbers, dtype: bool
s.drop_duplicates(): 0    3
1    1
2    4
4    5
5    9
6    2
7    6
Name: numbers, dtype: int64
----------------------------

### DataFrame

In [3]:
# Conceptual representation of DataFrame internals
df_concept = {
    'index': [0, 1, 2],
    'columns': [
        {'name': 'growth', 'data': pd.Series([0.5, 0.7, 1.2])},
        {'name': 'revenue', 'data': pd.Series([100, 150, 200])}
    ]
}

print(df_concept)

{'index': [0, 1, 2], 'columns': [{'name': 'growth', 'data': 0    0.5
1    0.7
2    1.2
dtype: float64}, {'name': 'revenue', 'data': 0    100
1    150
2    200
dtype: int64}]}


In [5]:
# From dictionary of lists (most common)
data = {
    'Country': ['US', 'China', 'Japan', 'GB'],
    'Gold': [46, 38, 27, 22],
    'Silver': [37, 32, 14, 21],
    'Bronze': [38, 18, 17, 22]
}
df = pd.DataFrame(data)

# From list of dictionaries
records = [
    {'Country': 'US', 'Gold': 46, 'Silver': 37},
    {'Country': 'China', 'Gold': 38, 'Silver': 32},
    {'Country': 'Japan', 'Gold': 27, 'Silver': 14}
]
df = pd.DataFrame(records)

print(df)
print("--------------------------------------------\n")

# From NumPy array with column names
arr = np.random.randn(4, 3)
df = pd.DataFrame(arr, 
                  columns=['A', 'B', 'C'],
                  index=['w', 'x', 'y', 'z'])
print(df)
print("--------------------------------------------\n")

# From dictionary of Series
s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
df = pd.DataFrame({'col1': s1, 'col2': s2})
print(df)
print("--------------------------------------------\n")

# From nested lists
data = [
    ['US', 46, 37, 38],
    ['China', 38, 32, 18],
    ['Japan', 27, 14, 17]
]
df = pd.DataFrame(data, columns=['Country', 'Gold', 'Silver', 'Bronze'])
print(df)
print("--------------------------------------------\n")

# With custom index
df = pd.DataFrame(data, 
                  columns=['Country', 'Gold', 'Silver', 'Bronze'],
                  index=['first', 'second', 'third'])
print(df)

  Country  Gold  Silver
0      US    46      37
1   China    38      32
2   Japan    27      14
--------------------------------------------

          A         B         C
w  0.857905 -1.705285 -0.010104
x -0.534062  0.580203 -0.402063
y -1.316870 -0.496442 -1.261982
z  1.459550  0.471603 -0.444002
--------------------------------------------

   col1  col2
a     1    10
b     2    20
c     3    30
--------------------------------------------

  Country  Gold  Silver  Bronze
0      US    46      37      38
1   China    38      32      18
2   Japan    27      14      17
--------------------------------------------

       Country  Gold  Silver  Bronze
first       US    46      37      38
second   China    38      32      18
third    Japan    27      14      17


In [7]:
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4.0, 5.0, 6.0],
    'C': ['x', 'y', 'z']
})
print(df)
print("--------------------------------------------\n")

# Core attributes
print(f"Shape: {df.shape}")              # (3, 3) - (rows, cols)
print(f"\nSize: {df.size}")                # 9 - total elements
print(f"\nColumns: {df.columns}")          # Column labels
print(f"\nIndex: {df.index}")              # Row labels
print(f"\nDtypes:\n{df.dtypes}")           # Data type per column
print(f"\nValues:\n{df.values}")           # NumPy array (2D)
print(f"\nMemory usage:\n{df.memory_usage()}")
print(f"\nMemory (deep): {df.memory_usage(deep=True).sum()} bytes")

# Axes (dimension names)
print(f"Axes: {df.axes}")                # [index, columns]

   A    B  C
0  1  4.0  x
1  2  5.0  y
2  3  6.0  z
--------------------------------------------

Shape: (3, 3)

Size: 9

Columns: Index(['A', 'B', 'C'], dtype='object')

Index: RangeIndex(start=0, stop=3, step=1)

Dtypes:
A      int64
B    float64
C     object
dtype: object

Values:
[[1 4.0 'x']
 [2 5.0 'y']
 [3 6.0 'z']]

Memory usage:
Index    132
A         24
B         24
C         24
dtype: int64

Memory (deep): 330 bytes
Axes: [RangeIndex(start=0, stop=3, step=1), Index(['A', 'B', 'C'], dtype='object')]


### DataFrame Methods

In [8]:
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4.0, 5.0, 6.0],
})

df2 = pd.DataFrame(
    {
        'A': [5, 6, 7],
        'B': [5.0, 12.1, 9.6],
    }
)

print(f"df1 + df2: {df1 + df2}")          # __add__
print(f"\ndf1 - df2: {df1 - df2}")          # __sub__
print(f"\ndf1 * df2: {df1 * df2}")          # __mul__
print(f"\ndf1 / df2: {df1 / df2}")          # __truediv__
print("--------------------------------------------\n")

# Iteration
for col in df:     # __iter__ (iterates over column names)
    print(col)

# Indexing
print(df['A'] )           # __getitem__
df['A'] = [1,2,3]  # __setitem__

df1 + df2:     A     B
0   6   9.0
1   8  17.1
2  10  15.6

df1 - df2:    A    B
0 -4 -1.0
1 -4 -7.1
2 -4 -3.6

df1 * df2:     A     B
0   5  20.0
1  12  60.5
2  21  57.6

df1 / df2:           A         B
0  0.200000  0.800000
1  0.333333  0.413223
2  0.428571  0.625000
--------------------------------------------

A
B
C
0    1
1    2
2    3
Name: A, dtype: int64


**Categorical Data**

In [30]:
# categorical data
shirt_size = pd.Series(['m', 'l', 'xs', 's', 'xl'], dtype='category')
shirt_size

0     m
1     l
2    xs
3     s
4    xl
dtype: category
Categories (5, object): ['l', 'm', 's', 'xl', 'xs']

In [36]:
# ordering categories and astype()
size_type = pd.api.types.CategoricalDtype(
    categories=['s', 'm', 'l', 'xl'], 
    ordered=True
)

ordered_shirt_size = shirt_size.astype(size_type)
print(ordered_shirt_size.cat.ordered)
ordered_shirt_size

True


0      m
1      l
2    NaN
3      s
4     xl
dtype: category
Categories (4, object): ['s' < 'm' < 'l' < 'xl']

In [5]:
# creating a dataFrame
car_speed = {'car': ['BMW', 'Tata', 'Ford'],
             'speed': [120, 90, 70]
            }

df = pd.DataFrame(car_speed, index=['Germany', 'India', 'US'])
df

Unnamed: 0,car,speed
Germany,BMW,120
India,Tata,90
US,Ford,70


**Iloc & Loc**

In [None]:
# iloc : Integer based indexing
df.iloc[0]

car      BMW
speed    120
Name: German, dtype: object

In [None]:
# iloc[row_index, col_index]
df.iloc[0, 1]

np.int64(120)

In [None]:
# slicing also work with iloc
df.iloc[1:, 1:]

Unnamed: 0,speed
India,90
US,70


In [8]:
# .loc : Label based indexing
df.loc['India']

car      Tata
speed      90
Name: India, dtype: object

In [10]:
df.loc['India', 'car']

'Tata'

In [11]:
df.loc[:, 'speed']

German    120
India      90
US         70
Name: speed, dtype: int64

In [20]:
# accessing elements
df['car']

German     BMW
India     Tata
US        Ford
Name: car, dtype: object

In [23]:
print(type(df['car']), type(df), sep='\n')

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


**Slicing**

In [None]:
# adding a single row to DF
df.loc['Japan'] = ["Toyota", 110] 

# adding multiple rows with concat
temp_df = pd.DataFrame({"car": ['Volvo', 'Lamborghini'],"speed": [100, 200]}, 
                       index=['Sweden', 'Italy'])
df = pd.concat([df, temp_df])

# adding a single column to DF
df['color'] = ['Violet', 'Indigo', 'Blue', 'Green', 'Yellow', 'Orange']

print("DataFrame :")
print(df)

DataFrame :
                 car  speed   color
Germany          BMW    120  Violet
India           Tata     90  Indigo
US              Ford     70    Blue
Japan         Toyota    110   Green
Sweden         Volvo    100  Yellow
Italy    Lamborghini    200  Orange


In [None]:
# df[[]] gives the col as df
speed_df = df[['speed']]

# df[] gives the col as series
speed_series = df['speed']

if isinstance(speed_df, pd.DataFrame):
    print(speed_df, "\nis a DataFrame\n")
    
if isinstance(speed_series, pd.Series):
    print(speed_series, "\nis a Series")

         speed
Germany    120
India       90
US          70
Japan      110
Sweden     100
Italy      200 
is a DataFrame

Germany    120
India       90
US          70
Japan      110
Sweden     100
Italy      200
Name: speed, dtype: int64 
is a Series


**Pandas axes**

In [None]:
data = {
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
}
df = pd.DataFrame(data)

print("DataFrame : ")
print(df)

# Summing along Axis 0 (column-wise)
print("\nColumn-wise sum (axis=0):")
print(df.sum(axis=0))

# Summing along Axis 1 (row-wise)
print("\nRow-wise sum (axis=1):")
print(df.sum(axis=1))

DataFrame : 
   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

Column-wise sum (axis=0):
A     6
B    15
C    24
dtype: int64

Row-wise sum (axis=1):
0    12
1    15
2    18
dtype: int64


In [14]:
np_arr = np.random.default_rng().integers(low=1, high=100, size=120)
random_series = pd.Series(np_arr, name='random series')
random_series

0      38
1      42
2      43
3      63
4      19
       ..
115     2
116     5
117    76
118    99
119    77
Name: random series, Length: 120, dtype: int64

In [4]:
# loading a dataset
df = pd.read_csv('./Data/economy.csv')

In [7]:
df.head()

Unnamed: 0,Model,Displ,Cyl,Trans,Drive,Fuel,Cert Region,Stnd,Stnd Description,Underhood ID,Veh Class,Air Pollution Score,City MPG,Hwy MPG,Cmb MPG,Greenhouse Gas Score,SmartWay,Comb CO2
0,ACURA Integra,1.5,4.0,SCV-7,2WD,Gasoline,CA,L3SULEV30,California LEV-III SULEV30,RHNXV01.54EC,large car,7,30,37,33,6,No,269
1,ACURA Integra,1.5,4.0,SCV-7,2WD,Gasoline,FA,T3B30,Federal Tier 3 Bin 30,RHNXV01.54EC,large car,7,30,37,33,6,No,269
2,ACURA Integra,2.0,4.0,Man-6,2WD,Gasoline,CA,L3ULEV50,California LEV-III ULEV50,RHNXV02.0TDC,large car,6,21,28,24,5,No,371
3,ACURA Integra,2.0,4.0,Man-6,2WD,Gasoline,FA,T3B50,Federal Tier 3 Bin 50,RHNXV02.0TDC,large car,6,21,28,24,5,No,371
4,ACURA Integra A-Spec,1.5,4.0,Man-6,2WD,Gasoline,CA,L3ULEV50,California LEV-III ULEV50,RHNXV01.55DC,large car,6,26,36,30,6,No,293


In [10]:
df.columns

Index(['Model', 'Displ', 'Cyl', 'Trans', 'Drive', 'Fuel', 'Cert Region',
       'Stnd', 'Stnd Description', 'Underhood ID', 'Veh Class',
       'Air Pollution Score', 'City MPG', 'Hwy MPG', 'Cmb MPG',
       'Greenhouse Gas Score', 'SmartWay', 'Comb CO2'],
      dtype='object')

In [11]:
city_mpg, highway_mpg = df['City MPG'], df['Hwy MPG']

---

**Concatenation and Merging**

In [8]:
country_1_df = pd.DataFrame(
    {
        "Country": ["India", "Japan", "US"],
        "Currency": ["Indian Rupee ", "Japanese yen", "United States dollar"],
        "Temperature": [37, 26, 18],
        "humidity": [90.1, 34.2, 47.5]
    }
)

country_2_df = pd.DataFrame(
    {
        "Country": ["Korea", "Australia", "UK"],
        "Currency": ["Won", "Dollar", "UK Pound"],
        "Temperature": [17, 16, 18],
        "humidity": [40.1, 24.2, 27.5]
    }
)

country_1_df

Unnamed: 0,Country,Currency,Temperature,humidity
0,India,Indian Rupee,37,90.1
1,Japan,Japanese yen,26,34.2
2,US,United States dollar,18,47.5


In [9]:
country_combined_df = pd.concat([country_1_df, country_2_df])

print(country_combined_df)

country_combined_df_one_idx = pd.concat([country_1_df, country_2_df], ignore_index=True)

     Country              Currency  Temperature  humidity
0      India         Indian Rupee            37      90.1
1      Japan          Japanese yen           26      34.2
2         US  United States dollar           18      47.5
0      Korea                   Won           17      40.1
1  Australia                Dollar           16      24.2
2         UK              UK Pound           18      27.5


In the above cell, we concatenated the two dataframes in the row direction. We can also concatinate the dataframes in the column direction. For that we need to pass the `axis=1` as an argument to the `concat()` function.

In [10]:
side_by_side_df = pd.concat([country_1_df, country_2_df], ignore_index=True, axis=1)
side_by_side_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,India,Indian Rupee,37,90.1,Korea,Won,17,40.1
1,Japan,Japanese yen,26,34.2,Australia,Dollar,16,24.2
2,US,United States dollar,18,47.5,UK,UK Pound,18,27.5


In [11]:
country_3_df = pd.DataFrame(
    {
        "Country": ["India", "Japan", "US"],
        "Air Quality": [6, 4, 5]
    }
) 

merged_df = pd.merge( country_2_df, country_3_df, on='Country', how='outer')
merged_df

Unnamed: 0,Country,Currency,Temperature,humidity,Air Quality
0,Korea,Won,17.0,40.1,
1,Australia,Dollar,16.0,24.2,
2,UK,UK Pound,18.0,27.5,
3,India,,,,6.0
4,Japan,,,,4.0
5,US,,,,5.0


In the above df, we can see that those values which are not present in both the input data frames were not dropped as it happened in inner join (merge without specifying outer).

**loc and iloc**

`loc` and `iloc` are the two functions which are used to access the data from the dataframe. `loc` is used to access the data using the labels of the rows and columns. `iloc` is used to access the data using the index of the rows and columns.

In [22]:
print(f'loc[2]  = {merged_df.loc[2]}\n') # Using indexing for accessing data
print(f"iloc[2] = {merged_df.iloc[2]}")  # Using the row number

loc[2]  = Country              UK
Currency       UK Pound
Temperature        18.0
humidity           27.5
Air Quality         NaN
Name: 2, dtype: object

iloc[2] = Country              UK
Currency       UK Pound
Temperature        18.0
humidity           27.5
Air Quality         NaN
Name: 2, dtype: object


Now let us come back to our main dataset and see some more functions of pandas.

In [24]:
print(df.describe())

# including summaries of object datatypes also
print(df.describe(include='object'))

       culmen_length_mm  culmen_depth_mm  flipper_length_mm  body_mass_g
count        342.000000       342.000000         342.000000   342.000000
mean          43.921930        17.151170         200.915205  4201.754386
std            5.459584         1.974793          14.061714   801.954536
min           32.100000        13.100000         172.000000  2700.000000
25%           39.225000        15.600000         190.000000  3550.000000
50%           44.450000        17.300000         197.000000  4050.000000
75%           48.500000        18.700000         213.000000  4750.000000
max           59.600000        21.500000         231.000000  6300.000000
       species  island   sex
count      344     344   334
unique       3       3     3
top     Adelie  Biscoe  MALE
freq       152     168   168


In [6]:
species_series = df['species']
species_series

0      Adelie
1      Adelie
2      Adelie
3      Adelie
4      Adelie
        ...  
339    Gentoo
340    Gentoo
341    Gentoo
342    Gentoo
343    Gentoo
Name: species, Length: 344, dtype: object

In [9]:
# changing the index (2, end, 2)
species_series.index = np.arange(2, (df.shape[0]+1)*2, 2)
species_series

2      Adelie
4      Adelie
6      Adelie
8      Adelie
10     Adelie
        ...  
680    Gentoo
682    Gentoo
684    Gentoo
686    Gentoo
688    Gentoo
Name: species, Length: 344, dtype: object

In [10]:
species_series.iloc[[1, 10, 100]]

4      Adelie
22     Adelie
202    Adelie
Name: species, dtype: object

In [12]:
# species_series.loc[[1, 10, 100]]  # error
species_series.loc[[2, 10, 100]]

2      Adelie
10     Adelie
100    Adelie
Name: species, dtype: object

In [14]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
339    False
340    False
341    False
342    False
343    False
Length: 344, dtype: bool

In [15]:
# finding duplicates
df.loc[df.duplicated()] 

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex


In [17]:
df.iloc[0:5, 0:3 ]

Unnamed: 0,species,island,culmen_length_mm
0,Adelie,Torgersen,39.1
1,Adelie,Torgersen,39.5
2,Adelie,Torgersen,40.3
3,Adelie,Torgersen,
4,Adelie,Torgersen,36.7


In [20]:
df.loc[0:5, ['species', 'body_mass_g', 'island']]

Unnamed: 0,species,body_mass_g,island
0,Adelie,3750.0,Torgersen
1,Adelie,3800.0,Torgersen
2,Adelie,3250.0,Torgersen
3,Adelie,,Torgersen
4,Adelie,3450.0,Torgersen
5,Adelie,3650.0,Torgersen


In [22]:
df.sort_values(['culmen_length_mm'], ascending=False)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
253,Gentoo,Biscoe,59.6,17.0,230.0,6050.0,MALE
169,Chinstrap,Dream,58.0,17.8,181.0,3700.0,FEMALE
321,Gentoo,Biscoe,55.9,17.0,228.0,5600.0,MALE
215,Chinstrap,Dream,55.8,19.8,207.0,4000.0,MALE
335,Gentoo,Biscoe,55.1,16.0,230.0,5850.0,MALE
...,...,...,...,...,...,...,...
70,Adelie,Torgersen,33.5,19.0,190.0,3600.0,FEMALE
98,Adelie,Dream,33.1,16.1,178.0,2900.0,FEMALE
142,Adelie,Dream,32.1,15.5,188.0,3050.0,FEMALE
3,Adelie,Torgersen,,,,,


In [23]:
df.sort_values(['culmen_length_mm', 'body_mass_g'], ascending=[True, False])

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
142,Adelie,Dream,32.1,15.5,188.0,3050.0,FEMALE
98,Adelie,Dream,33.1,16.1,178.0,2900.0,FEMALE
70,Adelie,Torgersen,33.5,19.0,190.0,3600.0,FEMALE
92,Adelie,Dream,34.0,17.1,185.0,3400.0,FEMALE
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
...,...,...,...,...,...,...,...
321,Gentoo,Biscoe,55.9,17.0,228.0,5600.0,MALE
169,Chinstrap,Dream,58.0,17.8,181.0,3700.0,FEMALE
253,Gentoo,Biscoe,59.6,17.0,230.0,6050.0,MALE
3,Adelie,Torgersen,,,,,


In [None]:
df.to_csv('data_without_index.csv', index=False)