<a href="https://colab.research.google.com/github/izzat-ai/learning-ai/blob/main/pandas_data_preparation/practice_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This page will provide practice in data preparation. DataFrames are created using AI.

In [1]:
import numpy as np
import pandas as pd

In [2]:
df_scores = pd.DataFrame({
    "student": ["Ali", "Vali", "Hasan", "Olim", "Zafar"],
    "math": [85, np.nan, 78, np.nan, 90],
    "english": [88, 92, np.nan, 75, np.nan]
})
df_scores

Unnamed: 0,student,math,english
0,Ali,85.0,88.0
1,Vali,,92.0
2,Hasan,78.0,
3,Olim,,75.0
4,Zafar,90.0,


In [4]:
# qaysi ustunlarda nan borligini aniqlash
df_scores.isnull().sum()

Unnamed: 0,0
student,0
math,2
english,2


In [8]:
# matematikadan bahosi yo'qlarni o'zidan oldingi qiymat bn to'ldirish
df_scores['math'] = df_scores['math'].ffill()
df_scores

Unnamed: 0,student,math,english
0,Ali,85.0,88.0
1,Vali,85.0,92.0
2,Hasan,78.0,
3,Olim,78.0,75.0
4,Zafar,90.0,


In [10]:
# englishdan bahosi yo'qlarni o'rtacha qiymat bn to'ldirish
eng_mean = df_scores['english'].mean()
df_scores['english'] = df_scores['english'].fillna(eng_mean)
df_scores

Unnamed: 0,student,math,english
0,Ali,85.0,88.0
1,Vali,85.0,92.0
2,Hasan,78.0,85.0
3,Olim,78.0,75.0
4,Zafar,90.0,85.0


In [11]:
df_users = pd.DataFrame({
    "user_id": [1, 2, 2, 3, 4, 4],
    "username": ["ali", "vali", "vali", "hasan", "olim", "olim"]
})
df_users

Unnamed: 0,user_id,username
0,1,ali
1,2,vali
2,2,vali
3,3,hasan
4,4,olim
5,4,olim


In [14]:
# takrorlangan qatorlarni aniqlash
df_users.duplicated()

Unnamed: 0,0
0,False
1,False
2,True
3,False
4,False
5,True


In [16]:
# takrorlangan qatorlarni o'chirish
df_users = df_users.drop_duplicates()
df_users

Unnamed: 0,user_id,username
0,1,ali
1,2,vali
3,3,hasan
4,4,olim


In [17]:
df_status = pd.DataFrame({
    "name": ["Ali", "Vali", "Hasan", "Olim"],
    "status": ["active", "inactive", "active", "inactive"]
})
df_status

Unnamed: 0,name,status
0,Ali,active
1,Vali,inactive
2,Hasan,active
3,Olim,inactive


In [18]:
# statusini raqamli turga o'tkizish
df_status['new_status'] = df_status['status'].replace({'active':1, 'inactive':0})
df_status

  df_status['new_status'] = df_status['status'].replace({'active':1, 'inactive':0})


Unnamed: 0,name,status,new_status
0,Ali,active,1
1,Vali,inactive,0
2,Hasan,active,1
3,Olim,inactive,0


In [19]:
df_sales = pd.DataFrame({
    "region": ["Toshkent", "Samarqand", "Toshkent", "Buxoro", "Samarqand"],
    "sales": [1200, 900, 1500, 700, 1100]
})
df_sales

Unnamed: 0,region,sales
0,Toshkent,1200
1,Samarqand,900
2,Toshkent,1500
3,Buxoro,700
4,Samarqand,1100


In [21]:
# hududlar bo'yicha umumiy va o'rtacha savdoni hisoblash
df_sales.groupby('region')['sales'].agg(['sum', 'mean'])

Unnamed: 0_level_0,sum,mean
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Buxoro,700,700.0
Samarqand,2000,1000.0
Toshkent,2700,1350.0


In [22]:
df_salary = pd.DataFrame({
    "employee": ["A", "B", "C", "D", "E"],
    "salary": [1200, 1300, 1250, 15000, 1280]
})
df_salary

Unnamed: 0,employee,salary
0,A,1200
1,B,1300
2,C,1250
3,D,15000
4,E,1280


In [23]:
# o'rtacha va standard og'ishni aniqlash
mean = df_salary['salary'].mean()
std = df_salary['salary'].std()

- z-scoreni hisoblash

In [24]:
df_salary['z_score'] = (df_salary['salary']-mean)/std
df_salary

Unnamed: 0,employee,salary,z_score
0,A,1200,-0.456561
1,B,1300,-0.44029
2,C,1250,-0.448426
3,D,15000,1.788821
4,E,1280,-0.443544


- anomal qiymat 1.5 dan katta bo'ladi . Shuning uchun 1.5 dan katta ma'lumot borligini tekshiramiz

In [27]:
# oulier qiymatni aniqlash
outlier = df_salary[df_salary['z_score'].abs() > 1.5]
outlier

Unnamed: 0,employee,salary,z_score
3,D,15000,1.788821


In [29]:
# outlier qiymatni tashavorish
df_salary.drop(3)

Unnamed: 0,employee,salary,z_score
0,A,1200,-0.456561
1,B,1300,-0.44029
2,C,1250,-0.448426
4,E,1280,-0.443544


In [31]:
df_orders = pd.DataFrame({
    "order_id": [1, 2, 3, 4],
    "order_date": ["2024-01-10", "2024-02-15", "2024-02-20", "2024-03-05"],
    "amount": [250, 400, 300, 500]
})
df_orders

Unnamed: 0,order_id,order_date,amount
0,1,2024-01-10,250
1,2,2024-02-15,400
2,3,2024-02-20,300
3,4,2024-03-05,500


In [35]:
import datetime as dt

In [34]:
np.dtype(df_orders['order_date'])

dtype('O')

In [36]:
# order_date ustunini datetime formatiga o'tkizish
df_orders['order_date'] = pd.to_datetime(df_orders['order_date'])
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   order_id    4 non-null      int64         
 1   order_date  4 non-null      datetime64[ns]
 2   amount      4 non-null      int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 228.0 bytes


In [38]:
df_orders['month'] = df_orders['order_date'].dt.month

# oylar bo'yicha buyurtmar sonini hisoblash
df_orders.groupby('month')['amount'].sum()

Unnamed: 0_level_0,amount
month,Unnamed: 1_level_1
1,250
2,700
3,500


In [39]:
df_data = pd.DataFrame({
    "id": range(1, 21),
    "score": [45, 60, 72, 88, 90, 55, 67, 80, 77, 69,
              92, 84, 73, 66, 58, 91, 85, 79, 62, 70]
})
df_data

Unnamed: 0,id,score
0,1,45
1,2,60
2,3,72
3,4,88
4,5,90
5,6,55
6,7,67
7,8,80
8,9,77
9,10,69


In [44]:
# tasodifiy 10 ta qator olish
df_sample = df_data.sample(10)
df_sample

Unnamed: 0,id,score
16,17,85
18,19,62
7,8,80
19,20,70
0,1,45
2,3,72
15,16,91
11,12,84
8,9,77
5,6,55
