In [41]:
import pandas as pd

# Sample data as a dictionary
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'Tonte', 'Eve', 'Grace', 'Ama'],
    'income': [4000, 5000, 6000, 7000, 7500, 8000, 10000000,]
}

df = pd.DataFrame(data)
df

Unnamed: 0,name,income
0,Alice,4000
1,Bob,5000
2,Charlie,6000
3,Tonte,7000
4,Eve,7500
5,Grace,8000
6,Ama,10000000


In [42]:
df.describe()

Unnamed: 0,income
count,7.0
mean,1433929.0
std,3777283.0
min,4000.0
25%,5500.0
50%,7000.0
75%,7750.0
max,10000000.0


In [40]:
df.income.quantile(0.75)
df.income.quantile(0.25, interpolation="higher")

np.int64(6000)

In [43]:
df.income.quantile(1)

np.float64(10000000.0)

In [46]:
percentile_99 = df.income.quantile(0.99)
print(percentile_99)

9400479.999999994


In [48]:
df_no_outlier = df[df.income <= percentile_99]
df_no_outlier

Unnamed: 0,name,income
0,Alice,4000
1,Bob,5000
2,Charlie,6000
3,Tonte,7000
4,Eve,7500
5,Grace,8000


In [54]:
import numpy as np

df['income'][3]=np.nan

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['income'][3]=np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['income'][3]=np.nan


In [55]:
df

Unnamed: 0,name,income
0,Alice,4000.0
1,Bob,5000.0
2,Charlie,6000.0
3,Tonte,
4,Eve,7500.0
5,Grace,8000.0
6,Ama,10000000.0


In [57]:
df.income.mean()

np.float64(1671750.0)

In [58]:
df_new = df.fillna(df.income.mean())
df_new

Unnamed: 0,name,income
0,Alice,4000.0
1,Bob,5000.0
2,Charlie,6000.0
3,Tonte,1671750.0
4,Eve,7500.0
5,Grace,8000.0
6,Ama,10000000.0


In [60]:
df = pd.read_csv("data/AB_NYC_2019.csv")
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [61]:
df.price.describe()

count    48895.000000
mean       152.720687
std        240.154170
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64

In [64]:
min_thresold, max_thresold = df.price.quantile([0.01,0.999])
min_thresold, max_thresold

(30.0, 3000.0)

In [65]:
df[df.price<min_thresold]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
957,375249,Enjoy Staten Island Hospitality,1887999,Rimma & Jim,Staten Island,Graniteville,40.62109,-74.16534,Private room,20,3,80,2019-05-26,0.92,1,226
2675,1428154,"Central, Peaceful Semi-Private Room",5912572,Tangier,Brooklyn,Flatbush,40.63899,-73.95177,Shared room,29,2,5,2014-10-20,0.07,1,321
2860,1620248,Large furnished 2 bedrooms- - 30 days Minimum,2196224,Sally,Manhattan,East Village,40.73051,-73.98140,Entire home/apt,10,30,0,,,4,137
3020,1767037,Small Cozy Room Wifi & AC near JFK,9284163,Antonio,Queens,Woodhaven,40.68968,-73.85219,Private room,29,2,386,2019-06-19,5.53,3,50
3918,2431607,"Bright, Airy Room Share for 2",4973668,Gloria,Brooklyn,Bedford-Stuyvesant,40.68642,-73.93440,Shared room,25,5,76,2019-06-06,1.22,3,258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48486,36280646,"Cable and wfi, L/G included.",272872092,Chris,Queens,Forest Hills,40.73657,-73.85088,Entire home/apt,16,9,1,2019-07-07,1.00,1,322
48647,36354776,Cozy bedroom in diverse neighborhood near JFK,273393150,Liza,Queens,Richmond Hill,40.68639,-73.81847,Private room,28,2,0,,,1,24
48832,36450814,FLATBUSH HANG OUT AND GO,267223765,Jarmel,Brooklyn,Flatbush,40.64922,-73.96078,Shared room,20,1,0,,,3,363
48867,36473044,The place you were dreaming for.(only for guys),261338177,Diana,Brooklyn,Gravesend,40.59080,-73.97116,Shared room,25,1,0,,,6,338


In [66]:
df2 = df[(df.price>min_thresold)&(df.price<max_thresold)]
df2.shape

(48183, 16)

In [67]:
df2.sample(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
22231,17908331,"Big, Bright, Tribeca Studio w/ Doorman & Elevator",4601948,Lily,Manhattan,Tribeca,40.7181,-74.00984,Entire home/apt,157,2,13,2018-09-23,0.49,1,18
25451,20358824,"$45NYCCozy Room with curtain NearJ,G, and M train",145214508,Yonette,Brooklyn,Bedford-Stuyvesant,40.69522,-73.94226,Shared room,45,3,7,2018-10-02,0.31,2,364
29576,22704069,The Paris - Duplex Penthouse with Roof Deck,162575767,Hampton,Manhattan,SoHo,40.72044,-73.9998,Entire home/apt,800,2,0,,,1,2
35589,28253864,"Astoria, NY 1-bedroom apartment",14933692,Fernando,Queens,Astoria,40.76616,-73.91332,Entire home/apt,90,2,8,2019-01-18,0.82,1,0
12990,9860190,Deluxe Furnished 1-Bedroom Midtown West Apartment,30283594,Kara,Manhattan,Theater District,40.75943,-73.98302,Entire home/apt,239,30,0,,,121,363


In [68]:
df2.price.describe()

count    48183.000000
mean       148.772036
std        153.594795
min         31.000000
25%         70.000000
50%        110.000000
75%        179.000000
max       2999.000000
Name: price, dtype: float64