## Modifying Data: Adding and Dropping Columns and Rows

In [1]:
# NOTE(review): MIMEAudio is not referenced by any visible cell; this looks
# like an accidental IDE auto-import — confirm and remove.
from email.mime.audio import MIMEAudio

import pandas as pd
# NOTE(review): `ascending` is never used as a name in the visible cells (it
# only appears as the `ascending=` keyword of sort_values). pandas.conftest is
# presumably pandas' own test configuration, so this import may fail where the
# test dependencies are not installed — confirm and remove.
from pandas.conftest import ascending

# Sample DataFrame: three people, one list per column, aligned by position.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35]
}
# Build the frame from the column-name -> values mapping above.
df = pd.DataFrame(data)

In [2]:
df

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [3]:
# Add a new 'City' column by assigning a list with one value per existing row.
df['City'] = ['New York', 'Los Angeles', 'Chicago']

In [4]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [5]:
# Derive an 'Age Group' label per row: ages under 30 are 'Young', all others
# 'Mature'. The comparison is done on the whole column at once and the
# resulting booleans are translated to labels with .map(), instead of looping
# over the values in Python.
df['Age Group'] = df['Age'].lt(30).map({True: 'Young', False: 'Mature'})

In [6]:
df

Unnamed: 0,Name,Age,City,Age Group
0,Alice,25,New York,Young
1,Bob,30,Los Angeles,Mature
2,Charlie,35,Chicago,Mature


In [7]:
# Remove the 'Age Group' column again; drop() returns a new frame, which is
# rebound to df.
df = df.drop(columns='Age Group')

In [8]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [9]:
# Remove the row whose index label is 1 (Bob); the remaining rows keep their
# original labels 0 and 2.
df = df.drop(index=1)

In [10]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
2,Charlie,35,Chicago


In [11]:
# Keep a copy of the current frame under a separate name before df is
# reassigned in the following cells.
df_copy = df.copy()

In [12]:
df_copy

Unnamed: 0,Name,Age,City
0,Alice,25,New York
2,Charlie,35,Chicago


In [13]:
# Two rows remain after the earlier drop, so each new column supplies exactly
# two values. assign() returns a new frame with the extra columns appended.
extra_columns = {'Discount': [5, 10], 'Total_Spend': [100, 200]}
df = df.assign(**extra_columns)

In [14]:
df

Unnamed: 0,Name,Age,City,Discount,Total_Spend
0,Alice,25,New York,5,100
2,Charlie,35,Chicago,10,200


## Accessing Data: Using df.iloc[] and df.loc[]

In [15]:
import pandas as pd

# Five-person sample used by the .loc / .iloc examples below: one list per
# column, aligned by position.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[25, 30, 35, 40, 45],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Seattle'],
)
df = pd.DataFrame(data=data)

In [16]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [17]:
# .loc selects by index label: pull the row labelled 2 out as a Series whose
# entries are that row's column values.
row = df.loc[2]
row

Name    Charlie
Age          35
City    Chicago
Name: 2, dtype: object

In [18]:
# A row label plus a column name addresses a single cell value.
cell = df.loc[3, 'City']
cell

'Houston'

In [19]:
# Label slice with .loc: the end label is included, so this returns the rows
# labelled 1, 2 and 3.
subset_rows = df.loc[1:3]
subset_rows

Unnamed: 0,Name,Age,City
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


In [20]:
# Rows labelled 1 through 4 (inclusive), restricted to the Name and City
# columns.
subset = df.loc[1:4, ['Name', 'City']]
subset

Unnamed: 0,Name,City
1,Bob,Los Angeles
2,Charlie,Chicago
3,David,Houston
4,Eva,Seattle


In [21]:
# Boolean mask: True for the rows whose Age is greater than 30.
mask = df['Age'] > 30
# .loc keeps the rows where the mask is True; .iloc[:, 0] then takes the first
# column by position (Name in this frame), giving a Series.
filtered = df.loc[mask].iloc[:, 0]
filtered

2    Charlie
3      David
4        Eva
Name: Name, dtype: object

In [21]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [23]:
# Positional slice with .iloc: the end position is excluded, so 1:4 returns
# the rows at positions 1, 2 and 3.
df.iloc[1:4]

Unnamed: 0,Name,Age,City
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


## Sampling and Previewing Data: Using df.sample() and df.head()

In [22]:
import pandas as pd
# Load the model log data from a CSV file given by relative path.
# NOTE(review): the file is not part of this notebook — confirm that
# model_logs.csv is present where the notebook is run.
df = pd.read_csv('model_logs.csv')

In [23]:
# Preview only the first three rows of the loaded data.
df.head(3)

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
0,2023-01-01,Generate a creative story about space travel,62,287
1,2023-01-02,Write a sci-fi short story set in 2050,45,361
2,2023-01-03,Write a poem about the future of technology,33,221


In [27]:
# With no arguments sample() returns a single randomly chosen row. No
# random_state is given, so the row can differ every time the cell runs.
random_row = df.sample()
random_row

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
44,2023-02-14,Explain quantum computing in simple terms,52,300


In [33]:
# n=3 draws three random rows; again unseeded, so the selection is not
# reproducible between runs.
random_rows = df.sample(n=3)
random_rows

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
95,2023-04-06,Explain quantum computing in simple terms,61,304
24,2023-01-25,Describe the impact of AI on healthcare,97,271
77,2023-03-19,Write a poem about the future of technology,42,318


In [35]:
# Passing random_state seeds the random draw, so re-running this cell on the
# same data returns the same five rows.
random_sample = df.sample(n=5, random_state=67)
random_sample

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
17,2023-01-18,Generate a creative story about space travel,61,484
24,2023-01-25,Describe the impact of AI on healthcare,97,271
64,2023-03-06,Outline the ethical considerations of AI,51,269
93,2023-04-04,Explain quantum computing in simple terms,97,325
97,2023-04-08,Write a poem about the future of technology,38,465


In [36]:
# frac=0.3 draws a random 30% of the rows instead of a fixed count. No
# random_state is given, so the selection can differ between runs.
fraction_sample = df.sample(frac=0.3)
fraction_sample

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
12,2023-01-13,Write a sci-fi short story set in 2050,61,307
90,2023-04-01,Generate a creative story about space travel,71,482
47,2023-02-17,Write a poem about the future of technology,44,204
98,2023-04-09,Generate marketing copy for a new tech product,77,296
13,2023-01-14,Create a recipe using plant-based ingredients,98,257
80,2023-03-22,Explain blockchain for beginners,90,226
10,2023-01-11,Generate a creative story about space travel,98,230
89,2023-03-31,Explain blockchain for beginners,78,490
42,2023-02-12,Outline the ethical considerations of AI,41,499
78,2023-03-20,Generate marketing copy for a new tech product,94,286


## Filtering Data: Masks and pandas.Series.between()

In [1]:
import pandas as pd

# Five-person sample used by the filtering examples below: one list per
# column, aligned by position.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[25, 30, 35, 40, 45],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'],
)
df = pd.DataFrame(data=data)


In [2]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Miami


In [3]:
# Comparing a column with a scalar yields a boolean Series: True where Age is
# above 30, False elsewhere.
mask = df['Age'] > 30

In [4]:
mask

0    False
1    False
2     True
3     True
4     True
Name: Age, dtype: bool

In [5]:
# Indexing the frame with the boolean mask keeps only the rows where it is
# True; the surviving rows keep their original index labels.
filtered_df = df[mask]
filtered_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Miami


In [6]:
# Combine two conditions element-wise with &. Each comparison is wrapped in
# parentheses because & binds more tightly than > and != in Python.
mask = (df['Age'] > 30) & (df['City'] != 'Houston')
# Keep the rows that are over 30 and not in Houston.
filtered_df = df[mask]
filtered_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
4,Eva,45,Miami


In [7]:
mask

0    False
1    False
2     True
3    False
4     True
dtype: bool

### pandas.Series.between()

In [8]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Miami


In [9]:
# between(30, 40) is True for ages from 30 to 40 with both ends included, so
# Bob (30) and David (40) are kept.
filtered_df = df[df['Age'].between(30, 40)]
filtered_df

Unnamed: 0,Name,Age,City
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


In [10]:
# isin() is True where the City value matches any entry of the given list.
mask = df['City'].isin(['Chicago', 'Houston'])
# Keep only the rows located in one of those cities.
filtered_df = df[mask]
filtered_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
3,David,40,Houston


## Sorting Data

In [17]:
import pandas as pd

# Six-person sample for the sorting examples. David and Fred share an age and
# Eva and Fred share a city, so ties show up when sorting.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Fred'],
    Age=[25, 30, 35, 40, 45, 40],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami', 'Miami'],
)
df = pd.DataFrame(data=data)

In [18]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Miami
5,Fred,40,Miami


In [19]:
# Sort by Age, youngest first (ascending order). The result is a new frame;
# rows keep their original index labels.
sorted_df = df.sort_values(by='Age')
sorted_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
5,Fred,40,Miami
4,Eva,45,Miami


In [20]:
# ascending=False reverses the order: oldest first.
sorted_df = df.sort_values(by='Age', ascending=False)
sorted_df

Unnamed: 0,Name,Age,City
4,Eva,45,Miami
3,David,40,Houston
5,Fred,40,Miami
2,Charlie,35,Chicago
1,Bob,30,Los Angeles
0,Alice,25,New York


In [25]:
# Sort on two keys: City from A to Z first, then Age from high to low within
# the same city (Eva, 45, comes before Fred, 40, in Miami). The ascending list
# is matched to the `by` list position by position.
sorted_df = df.sort_values(by=['City', 'Age'], ascending=[True, False])
sorted_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
3,David,40,Houston
1,Bob,30,Los Angeles
4,Eva,45,Miami
5,Fred,40,Miami
0,Alice,25,New York


In [26]:
# Order the rows by their index labels. df is still in its original order
# here, so the result matches df.
sorted_df = df.sort_index()
sorted_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Miami
5,Fred,40,Miami


In [27]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Miami
5,Fred,40,Miami


In [28]:
# Make the descending-Age order the new state of df. The sorted result is
# rebound to the name rather than sorting with inplace=True, so no existing
# object is mutated behind the reader's back and the call can still be chained.
df = df.sort_values(by='Age', ascending=False)

In [29]:
df

Unnamed: 0,Name,Age,City
4,Eva,45,Miami
3,David,40,Houston
5,Fred,40,Miami
2,Charlie,35,Chicago
1,Bob,30,Los Angeles
0,Alice,25,New York


In [30]:
# Leaderboard example: one row per player with that player's score.
data = dict(
    Player=['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    Score=[150, 200, 125, 300, 175],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,Player,Score
0,Alice,150
1,Bob,200
2,Charlie,125
3,Diana,300
4,Eve,175


In [31]:
# Build the ranked leaderboard in one chain:
#   1. order players from highest to lowest score,
#   2. renumber the rows 0..n-1, discarding the old labels,
#   3. derive a 1-based Rank from that fresh row number.
sorted_df = (
    df.sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
    .assign(Rank=lambda ranked: ranked.index + 1)
)
sorted_df

Unnamed: 0,Player,Score,Rank
0,Diana,300,1
1,Bob,200,2
2,Eve,175,3
3,Alice,150,4
4,Charlie,125,5


## Handling Missing Data

In [32]:
import pandas as pd
# Sample frame with gaps: None marks a missing value, so Bob has no age and
# Charlie has no city.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Diana'],
    Age=[25, None, 35, 40],
    City=['New York', 'Los Angeles', None, 'Houston'],
)
df = pd.DataFrame(data=data)


In [36]:
# info() prints a per-column summary; the Non-Null Count column reveals that
# Age and City each have one missing entry.
df.info()
# Display the frame itself to see where the gaps are.
df

<class 'pandas.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      str    
 1   Age     3 non-null      float64
 2   City    3 non-null      str    
dtypes: float64(1), str(2)
memory usage: 228.0 bytes


Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,35.0,
3,Diana,40.0,Houston


In [37]:
# Boolean frame of the same shape: True marks a missing value.
df.isnull()

Unnamed: 0,Name,Age,City
0,False,False,False
1,False,True,False
2,False,False,True
3,False,False,False


In [38]:
# isna() gives the same True/False result as isnull() on this frame.
df.isna()

Unnamed: 0,Name,Age,City
0,False,False,False
1,False,True,False
2,False,False,True
3,False,False,False


In [39]:
# Keep only the rows that have no missing value in any column. The result is a
# new frame; df itself is left as it was.
df_cleaned = df.dropna(axis='index')
df_cleaned

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
3,Diana,40.0,Houston


In [40]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,35.0,
3,Diana,40.0,Houston


In [41]:
# Work column-wise instead: discard every column that contains a missing
# value, which leaves only Name here. df itself is left as it was.
df_cleaned = df.dropna(axis='columns')
df_cleaned

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie
3,Diana


In [42]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,35.0,
3,Diana,40.0,Houston


In [43]:
# Average of the ages that are present; the missing entry does not count
# toward it (25, 35 and 40 give 33.33...).
age_mean = df['Age'].mean()
# Replace the missing age with that average.
df['Age'] = df['Age'].fillna(age_mean)

In [45]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,33.333333,Los Angeles
2,Charlie,35.0,
3,Diana,40.0,Houston


In [46]:
# Replace the missing city with the placeholder text 'Unknown'.
df['City'] = df['City'].fillna('Unknown')

In [48]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,33.333333,Los Angeles
2,Charlie,35.0,Unknown
3,Diana,40.0,Houston


In [49]:
# Re-check the summary: every column should now report 4 non-null values.
df.info()


<class 'pandas.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      str    
 1   Age     4 non-null      float64
 2   City    4 non-null      str    
dtypes: float64(1), str(2)
memory usage: 228.0 bytes


## Aggregations and Grouping Data

In [50]:
import pandas as pd
# Sales sample for the grouping examples: one row per product, with the
# category it belongs to and its sales figure.
data = dict(
    Category=['Electronics', 'Electronics', 'Furniture', 'Furniture', 'Apparel'],
    Product=['Laptop', 'Mouse', 'Chair', 'Table', 'Shoes'],
    Sales=[1200, 100, 400, 300, 50],
)
df = pd.DataFrame(data=data)


In [51]:
df

Unnamed: 0,Category,Product,Sales
0,Electronics,Laptop,1200
1,Electronics,Mouse,100
2,Furniture,Chair,400
3,Furniture,Table,300
4,Apparel,Shoes,50


In [52]:
# Total sales per category. as_index=False keeps Category as an ordinary
# column, so the result is a flat two-column frame with a fresh 0..n-1 index.
grouped = df.groupby('Category', as_index=False)['Sales'].sum()
grouped

Unnamed: 0,Category,Sales
0,Apparel,50
1,Electronics,1300
2,Furniture,700


In [54]:
# Group on two keys at once. The result is a Series of summed Sales whose
# index has two levels: Category, then Product.
grouped = df.groupby(['Category', 'Product'])['Sales'].sum()
grouped

Category     Product
Apparel      Shoes        50
Electronics  Laptop     1200
             Mouse       100
Furniture    Chair       400
             Table       300
Name: Sales, dtype: int64

In [55]:
# Several aggregations in one pass: agg() with a list of function names
# returns one column per aggregation (sum, mean, count), indexed by Category.
summary = df.groupby('Category')['Sales'].agg(['sum', 'mean', 'count'])
summary

Unnamed: 0_level_0,sum,mean,count
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apparel,50,50.0,1
Electronics,1300,650.0,2
Furniture,700,350.0,2
