<a href="https://colab.research.google.com/github/izzat-ai/learning-ai/blob/main/pandas_data_preparation/data_preparation_all_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small sample frame with NaN gaps in the Name, Age and Salary columns.
df1 = pd.DataFrame(
    data={
        "Name": ["Ali", "Vali", np.nan, "Jasur", "Kamol"],
        "Age": [25, np.nan, 35, 28, np.nan],
        "Salary": [5000, 6000, np.nan, 7000, 6500],
    }
)
df1

Unnamed: 0,Name,Age,Salary
0,Ali,25.0,5000.0
1,Vali,,6000.0
2,,35.0,
3,Jasur,28.0,7000.0
4,Kamol,,6500.0


In [3]:
# Boolean mask: True wherever a cell of df1 is missing (isna is the alias of isnull)
df1.isna()

Unnamed: 0,Name,Age,Salary
0,False,False,False
1,False,True,False
2,True,False,True
3,False,False,False
4,False,True,False


In [4]:
# Count the missing values in each column (a zero means the column is complete)
df1.isna().sum()

Unnamed: 0,0
Name,1
Age,2
Salary,1


In [5]:
# Product catalogue with NaN gaps in Price, Stock and Rating.
df2 = pd.DataFrame(
    dict(
        Product=['Laptop', 'Phone', 'Tablet', 'Watch', 'Camera'],
        Price=[1000, np.nan, 300, 200, np.nan],
        Stock=[50, 120, np.nan, 200, 45],
        Rating=[4.5, 4.8, np.nan, 4.2, 4.7],
    )
)
df2

Unnamed: 0,Product,Price,Stock,Rating
0,Laptop,1000.0,50.0,4.5
1,Phone,,120.0,4.8
2,Tablet,300.0,,
3,Watch,200.0,200.0,4.2
4,Camera,,45.0,4.7


In [6]:
# Impute the gaps: column mean for Price and Rating, zero for Stock.
# fillna returns a new frame; df2 itself is left unchanged.
price_mean = df2.loc[:, 'Price'].mean()
rating_mean = df2.loc[:, 'Rating'].mean()
df2.fillna(value={'Stock': 0, 'Price': price_mean, 'Rating': rating_mean})

Unnamed: 0,Product,Price,Stock,Rating
0,Laptop,1000.0,50.0,4.5
1,Phone,500.0,120.0,4.8
2,Tablet,300.0,0.0,4.55
3,Watch,200.0,200.0,4.2
4,Camera,500.0,45.0,4.7


In [7]:
# Student roster with contact e-mail and exam score.
df3 = pd.DataFrame(
    data={
        "Student": ["Ali Khan", "Vali Karimov", "Sardor Toshmatov", "Jasur Aliyev"],
        "Email": ["ali@example.com", "vali@test.com", "sardor@mail.uz", "jasur@inbox.com"],
        "Score": [85, 90, 78, 92],
    }
)
df3

Unnamed: 0,Student,Email,Score
0,Ali Khan,ali@example.com,85
1,Vali Karimov,vali@test.com,90
2,Sardor Toshmatov,sardor@mail.uz,78
3,Jasur Aliyev,jasur@inbox.com,92


In [8]:
# Keep only the domain part of each address (the text after '@'),
# e.g. 'ali@example.com' -> 'example.com'
df3['Email'].str.split('@').str.get(1)

Unnamed: 0,Email
0,example.com
1,test.com
2,mail.uz
3,inbox.com


In [9]:
# Deliberately messy contact data: padded names, mixed-case cities,
# and phone numbers written in several different formats.
df4 = pd.DataFrame(
    dict(
        Name=['   Ali   ', 'Vali', '  Sardor', 'Jasur  '],
        City=['TASHKENT', 'samarkand', 'BuKhArA', 'fergana'],
        Phone=['+998901234567', '998-90-123-45-68', '998 90 123 45 69', '998901234570'],
    )
)
df4

Unnamed: 0,Name,City,Phone
0,Ali,TASHKENT,+998901234567
1,Vali,samarkand,998-90-123-45-68
2,Sardor,BuKhArA,998 90 123 45 69
3,Jasur,fergana,998901234570


In [16]:
# Trim leading/trailing whitespace in the 'Name' column
df4['Name'] = df4['Name'].str.strip()

# Make the 'City' column Title Case (every word capitalized).
# str.title() is required here: str.capitalize() upper-cases only the first
# character of the whole string, so 'new york' would become 'New york'.
df4['City'] = df4['City'].str.title()

# Remove every non-digit character from the 'Phone' column (leave only numbers)
df4['Phone'] = df4['Phone'].str.replace(r'\D', '', regex=True)

In [17]:
# Display df4 to inspect the Name, City and Phone columns after cleaning
df4

Unnamed: 0,Name,City,Phone
0,Ali,Tashkent,998901234567
1,Vali,Samarkand,998901234568
2,Sardor,Bukhara,998901234569
3,Jasur,Fergana,998901234570


In [18]:
# Price list: the same product may be sold by several stores.
df5 = pd.DataFrame(
    data={
        "Product": ["Laptop", "Phone", "Laptop", "Tablet", "Phone", "Laptop"],
        "Store": ["A", "A", "B", "A", "B", "C"],
        "Price": [1000, 500, 950, 300, 480, 1020],
    }
)
df5

Unnamed: 0,Product,Store,Price
0,Laptop,A,1000
1,Phone,A,500
2,Laptop,B,950
3,Tablet,A,300
4,Phone,B,480
5,Laptop,C,1020


In [20]:
# Swap every 'Laptop' value for 'Notebook' (applies to all columns;
# the result is a new frame, df5 is not modified)
df5.replace(to_replace='Laptop', value='Notebook')

Unnamed: 0,Product,Store,Price
0,Notebook,A,1000
1,Phone,A,500
2,Notebook,B,950
3,Tablet,A,300
4,Phone,B,480
5,Notebook,C,1020


In [21]:
# Accounts table: code, display name and activity status.
df6 = pd.DataFrame(
    dict(
        Code=['USR001', 'USR002', 'ADM001', 'USR003', 'ADM002'],
        Name=['Ali', 'Vali', 'Admin_Sardor', 'Jasur', 'Admin_Kamol'],
        Status=['active', 'inactive', 'active', 'active', 'inactive'],
    )
)
df6

Unnamed: 0,Code,Name,Status
0,USR001,Ali,active
1,USR002,Vali,inactive
2,ADM001,Admin_Sardor,active
3,USR003,Jasur,active
4,ADM002,Admin_Kamol,inactive


In [24]:
# Three-row employee table used for the column-renaming exercise.
df7 = pd.DataFrame(
    data={
        "Name": ["Ali", "Vali", "Sardor"],
        "Age": [25, 30, 35],
        "Salary": [5000, 6000, 5500],
    }
)
df7

Unnamed: 0,Name,Age,Salary
0,Ali,25,5000
1,Vali,30,6000
2,Sardor,35,5500


In [25]:
# Rename the 'Name' column to 'Employee' (returns a new frame; df7 keeps its labels)
df7.rename({'Name': 'Employee'}, axis='columns')

Unnamed: 0,Employee,Age,Salary
0,Ali,25,5000
1,Vali,30,6000
2,Sardor,35,5500


In [26]:
# Product table whose column labels all carry a 'product_' prefix.
df8 = pd.DataFrame(
    dict(
        product_name=['Laptop', 'Phone', 'Tablet'],
        product_price=[1000, 500, 300],
        product_stock=[50, 120, 80],
    )
)
df8

Unnamed: 0,product_name,product_price,product_stock
0,Laptop,1000,50
1,Phone,500,120
2,Tablet,300,80


In [30]:
# Remove the 'product_' prefix from all column names.
# The new labels are derived from the existing ones rather than typed out by
# hand, so the cell keeps working if columns are added or reordered. The regex
# is anchored with '^', so only a leading prefix is removed and re-running the
# cell on already-clean names changes nothing.
col_names = df8.columns.str.replace(r'^product_', '', regex=True).tolist()
df8.columns = col_names

In [31]:
# Display df8 to check the column labels after they were reassigned
df8

Unnamed: 0,name,price,stock
0,Laptop,1000,50
1,Phone,500,120
2,Tablet,300,80


In [32]:
# People table with the default 0..3 integer index, used for the row-dropping exercise.
df9 = pd.DataFrame(
    data={
        "Name": ["Ali", "Vali", "Sardor", "Jasur"],
        "Age": [25, 30, 35, 28],
        "City": ["Tashkent", "Samarkand", "Bukhara", "Fergana"],
    }
)
df9

Unnamed: 0,Name,Age,City
0,Ali,25,Tashkent
1,Vali,30,Samarkand
2,Sardor,35,Bukhara
3,Jasur,28,Fergana


In [34]:
# Drop the second row, i.e. the one whose index label is 1
# (returns a new frame; df9 itself still has all four rows)
df9.drop(index=1)

Unnamed: 0,Name,Age,City
0,Ali,25,Tashkent
2,Sardor,35,Bukhara
3,Jasur,28,Fergana


In [35]:
# Inventory table; a Stock of 0 marks a product that is sold out.
df10 = pd.DataFrame(
    dict(
        Product=['Laptop', 'Phone', 'Tablet', 'Watch', 'Camera'],
        Price=[1000, 500, 300, 200, 800],
        Stock=[50, 0, 80, 0, 45],
    )
)
df10

Unnamed: 0,Product,Price,Stock
0,Laptop,1000,50
1,Phone,500,0
2,Tablet,300,80
3,Watch,200,0
4,Camera,800,45


In [37]:
# Index labels of the rows whose Stock equals 0
df10.loc[df10['Stock'].eq(0)].index

Index([1, 3], dtype='int64')

In [39]:
# Delete rows with stock value 0.
# The labels to drop are computed from the Stock == 0 mask instead of being
# hard-coded as [1, 3], so the cell stays correct if the data in df10 changes.
# drop() returns a new frame; df10 itself is not modified.
df10.drop(index=df10[df10['Stock'] == 0].index)

Unnamed: 0,Product,Price,Stock
0,Laptop,1000,50
2,Tablet,300,80
4,Camera,800,45
