Load a DataFrame from a CSV file

In [5]:
import pandas as pd

# Read the plant dataset from the CSV file into a DataFrame.
df = pd.read_csv('plant.csv')

# Preview both ends of the table: the first five rows, then the last five
# (five is the default row count for both head() and tail()).
print(df.head(n=5))
print(df.tail(n=5))


  Soil_Type  Sunlight_Hours Water_Frequency Fertilizer_Type  Temperature  \
0      loam        5.192294       bi-weekly        chemical    31.719602   
1     sandy        4.033133          weekly         organic    28.919484   
2      loam        8.892769       bi-weekly            none    23.179059   
3      loam        8.241144       bi-weekly            none    18.465886   
4     sandy        8.374043       bi-weekly         organic    18.128741   

    Humidity  Growth_Milestone  
0  61.591861                 0  
1  52.422276                 1  
2  44.660539                 0  
3  46.433227                 0  
4  63.625923                 0  
    Soil_Type  Sunlight_Hours Water_Frequency Fertilizer_Type  Temperature  \
188     sandy           5.652           daily            none         28.0   
189      clay           7.528          weekly        chemical         30.5   
190      loam           4.934       bi-weekly            none         24.5   
191     sandy           8.273    

Set a specific column as the index

In [7]:
# Build a re-indexed copy under its own name instead of mutating `df` in place.
# The later cells still read df['Temperature'] as a regular column and slice
# rows by the default integer labels (df.loc[1:5]); both stop working on a
# fresh top-to-bottom run if 'Temperature' is moved out of the columns and
# into the index of `df` itself.
df_by_temperature = df.set_index('Temperature')

Select a specific column and display its values

In [11]:
# List the column labels of the DataFrame.
column_labels = df.columns
print(column_labels)

Index(['Soil_Type', 'Sunlight_Hours', 'Water_Frequency', 'Fertilizer_Type',
       'Temperature', 'Humidity', 'Growth_Milestone'],
      dtype='object')


In [15]:
# Pull out the 'Temperature' column (all rows) as a Series and show it.
column_values = df.loc[:, 'Temperature']
print(column_values)

0      31.719602
1      28.919484
2      23.179059
3      18.465886
4      18.128741
         ...    
188    28.000000
189    30.500000
190    24.500000
191    27.900000
192    21.700000
Name: Temperature, Length: 193, dtype: float64


Select multiple columns and display the resulting DataFrame

In [16]:
# Indexing with a list of labels returns a DataFrame holding just those
# columns, in the order they are listed.
wanted_columns = ['Humidity', 'Soil_Type']
selected_columns = df[wanted_columns]
print(selected_columns)

      Humidity Soil_Type
0    61.591861      loam
1    52.422276     sandy
2    44.660539      loam
3    46.433227      loam
4    63.625923     sandy
..         ...       ...
188  70.200000     sandy
189  60.100000      clay
190  61.700000      loam
191  69.500000     sandy
192  56.900000      clay

[193 rows x 2 columns]


Select a subset of rows using the .loc method

In [17]:
# .loc slices by index label and includes both endpoints, so labels 1
# through 5 are all returned; every column is kept.
subset_rows_loc = df.loc[1:5, :]
print(subset_rows_loc)

  Soil_Type  Sunlight_Hours Water_Frequency Fertilizer_Type  Temperature  \
1     sandy        4.033133          weekly         organic    28.919484   
2      loam        8.892769       bi-weekly            none    23.179059   
3      loam        8.241144       bi-weekly            none    18.465886   
4     sandy        8.374043       bi-weekly         organic    18.128741   
5     sandy        8.627622       bi-weekly            none    20.004858   

    Humidity  Growth_Milestone  
1  52.422276                 1  
2  44.660539                 0  
3  46.433227                 0  
4  63.625923                 0  
5  67.618726                 0  


Select a subset of rows and columns using the .iloc method

In [18]:
# .iloc selects by integer position: row positions 1 up to (but not
# including) 5, and the columns at positions 0, 2 and 3.
row_positions = slice(1, 5)
column_positions = [0, 2, 3]
subset_rows_cols_iloc = df.iloc[row_positions, column_positions]
print(subset_rows_cols_iloc)

  Soil_Type Water_Frequency Fertilizer_Type
1     sandy          weekly         organic
2      loam       bi-weekly            none
3      loam       bi-weekly            none
4     sandy       bi-weekly         organic


Filter rows based on a condition

In [21]:
# Build a boolean mask that is True where 'Temperature' exceeds 20, then
# keep only the rows where the mask holds.
is_above_20 = df['Temperature'] > 20
filtered_rows = df[is_above_20]
print(filtered_rows)

    Soil_Type  Sunlight_Hours Water_Frequency Fertilizer_Type  Temperature  \
0        loam        5.192294       bi-weekly        chemical    31.719602   
1       sandy        4.033133          weekly         organic    28.919484   
2        loam        8.892769       bi-weekly            none    23.179059   
5       sandy        8.627622       bi-weekly            none    20.004858   
6        loam        4.444268           daily         organic    25.984533   
..        ...             ...             ...             ...          ...   
188     sandy        5.652000           daily            none    28.000000   
189      clay        7.528000          weekly        chemical    30.500000   
190      loam        4.934000       bi-weekly            none    24.500000   
191     sandy        8.273000           daily         organic    27.900000   
192      clay        6.732000          weekly            none    21.700000   

      Humidity  Growth_Milestone  
0    61.591861              

Group the DataFrame by specific columns (here Temperature and Humidity) and calculate the mean of each group

In [26]:
# Small hand-made dataset for the grouping example. Rows 1-2 and rows 3-4
# share the same (Temperature, Humidity) pair, so grouping on those two
# columns yields groups with more than one row.
# Note: this rebinds `df`, replacing the frame loaded from the CSV file.
data = {}
data['Temperature'] = [25, 30, 30, 35, 35, 40]
data['Humidity'] = [30, 45, 45, 50, 50, 60]
data['Value'] = [10, 20, 30, 40, 50, 60]

df = pd.DataFrame(data)




In [27]:
# Show the input first so it can be compared with the grouped result.
print("Original DataFrame:\n", df)

# Rows sharing the same (Temperature, Humidity) pair form one group; mean()
# then averages the remaining column(s) within each group.
group_keys = ['Temperature', 'Humidity']
grouped_mean = df.groupby(group_keys).mean()
print("\nGrouped Mean DataFrame:\n", grouped_mean)

Original DataFrame:
    Temperature  Humidity  Value
0           25        30     10
1           30        45     20
2           30        45     30
3           35        50     40
4           35        50     50
5           40        60     60

Grouped Mean DataFrame:
                       Value
Temperature Humidity       
25          30         10.0
30          45         25.0
35          50         45.0
40          60         60.0


Use the agg method to apply multiple aggregation functions to grouped data:

In [34]:
# Hand-made dataset for the aggregation example: the same Temperature and
# Humidity values as the previous example, now with two value columns so
# that different aggregations can be applied to each.
# Note: this rebinds `data` and `df`.
data = {}
data['Temperature'] = [25, 30, 30, 35, 35, 40]
data['Humidity'] = [30, 45, 45, 50, 50, 60]
data['Value1'] = [10, 20, 30, 40, 50, 60]
data['Value2'] = [5, 15, 25, 35, 45, 55]

df = pd.DataFrame(data)


In [32]:

print("Original DataFrame:\n", df)

# Map each value column to the aggregation(s) wanted for it: a list gives
# one result column per function, a single name gives one result column.
aggregations = {
    'Value1': ['mean', 'sum'],
    'Value2': 'min'
}
group_keys = ['Temperature', 'Humidity']
grouped_agg = df.groupby(group_keys).agg(aggregations)

print("\nGrouped DataFrame with Multiple Aggregations:\n", grouped_agg)

Original DataFrame:
    Temperature  Humidity  Value1  Value2
0           25        30      10       5
1           30        45      20      15
2           30        45      30      25
3           35        50      40      35
4           35        50      50      45
5           40        60      60      55

Grouped DataFrame with Multiple Aggregations:
                      Value1     Value2
                       mean sum    min
Temperature Humidity                  
25          30         10.0  10      5
30          45         25.0  50     15
35          50         45.0  90     35
40          60         60.0  60     55


Calculate the size of each group

In [36]:
# size() counts the rows that fall into each 'Temperature' group and
# returns the counts as a Series indexed by the group key.
by_temperature = df.groupby('Temperature')
group_size = by_temperature.size()
print(group_size)

Temperature
25    1
30    2
35    2
40    1
dtype: int64


Select rows based on multiple conditions

In [37]:
# Build one boolean mask per condition and combine them with & (element-wise
# AND); a row is kept only when both conditions hold.
# With the example data above no Humidity value is below 30, so the result
# is an empty DataFrame.
temperature_above_20 = df['Temperature'] > 20
humidity_below_30 = df['Humidity'] < 30
multiple_conditions = df[temperature_above_20 & humidity_below_30]
print(multiple_conditions)

Empty DataFrame
Columns: [Temperature, Humidity, Value1, Value2]
Index: []


Use the query method to filter rows:


In [38]:
# query() takes the filter as an expression string that refers to columns by
# name; this is the same two-condition filter as the boolean-mask version.
condition = 'Temperature > 20 and Humidity < 30'
filtered_query = df.query(condition)
print(filtered_query)


Empty DataFrame
Columns: [Temperature, Humidity, Value1, Value2]
Index: []


Use isin to filter rows based on a list of values

In [40]:
# isin() marks the rows whose 'Temperature' equals any value in the list;
# values in the list that never occur in the column simply match nothing.
allowed_temperatures = [20, 30, 40]
is_allowed = df['Temperature'].isin(allowed_temperatures)
filtered_isin = df[is_allowed]
print(filtered_isin)


   Temperature  Humidity  Value1  Value2
1           30        45      20      15
2           30        45      30      25
5           40        60      60      55


Select specific columns and rename them

In [42]:
# Pick the two columns first, then relabel them; rename() returns a new
# frame and leaves `df` itself unchanged.
# NOTE(review): 'humity' looks like a misspelling of "humidity" - confirm
# the intended label before changing it, since it appears in the output.
new_names = {'Temperature': 'Temp', 'Humidity': 'humity'}
selected_renamed = df[['Temperature', 'Humidity']].rename(columns=new_names)
print(selected_renamed)


   Temp  humity
0    25      30
1    30      45
2    30      45
3    35      50
4    35      50
5    40      60
