# Sorting and Subsetting
`.sort_values()` is a very powerful method tha can have multiple parameters and attributes

In [3]:
import pandas as pd
import numpy as np

california_housing_test = pd.read_csv('../datasets-from-colab/california_housing_test.csv')
dataset = {
    'column1': np.random.rand(10),  # 10 random floats between 0 and 1
    'column2': np.random.randint(0, 100, 10),  # 10 random integers between 0 and 99
    'column3': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', 'A'] # 10 categorical values
}

sample_df = pd.DataFrame(california_housing_test)

In [None]:
"""
  .sort_values(<COLUMN>)

  SORTS DATAFRAME BY COLUMN, 
  DEFAULT `LOWEST TO HIGHEST VALUE` BASED ON COLUMN
"""
sample_df = pd.DataFrame(dataset)
sample_df.sort_values('column1')

    column1  column2 column3
9  0.104231       94       A
3  0.165953       13       A
6  0.265867       75       A
1  0.376848       35       B
7  0.437907        9       B
5  0.643364       41       C
2  0.759861       45       C
4  0.833786       94       B
0  0.848438       92       A
8  0.929721       40       C


In [None]:
"""
  .sort_values(<COLUMN>, ascending=False)

  SORTS DATAFRAME BY COLUMN, HIGHEST TO LOWEST
"""
sample_df.sort_values(['population', "median_income"], ascending=[True, False])

      longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
2640    -114.62     33.62                26.0         18.0             3.0   
1115    -116.95     33.86                 1.0          6.0             2.0   
740     -117.12     32.66                52.0         16.0             4.0   
2494    -118.44     34.04                49.0         32.0             7.0   
1355    -117.11     32.66                52.0         25.0             5.0   
...         ...       ...                 ...          ...             ...   
1597    -117.12     33.49                 4.0      21988.0          4055.0   
2429    -117.20     33.58                 2.0      30450.0          5033.0   
1146    -117.27     33.15                 4.0      23915.0          4135.0   
2186    -116.14     34.45                12.0       8796.0          1721.0   
978     -121.53     38.48                 5.0      27870.0          5027.0   

      population  households  median_income  median_house_value

Subsetting refers to choosing a specific column in a dataframe:

In [9]:
sample_df['population']

0       1537.0
1        809.0
2       1484.0
3         49.0
4        850.0
         ...  
2995    1258.0
2996    3496.0
2997     693.0
2998      46.0
2999     753.0
Name: population, Length: 3000, dtype: float64

Getting multiple columns at once:

In [10]:
sample_df[["population", "households"]]

Unnamed: 0,population,households
0,1537.0,606.0
1,809.0,277.0
2,1484.0,495.0
3,49.0,11.0
4,850.0,237.0
...,...,...
2995,1258.0,607.0
2996,3496.0,1036.0
2997,693.0,220.0
2998,46.0,14.0


Subsetting ROWS

In [13]:
# sample_df[sample_df["column2"] < 20]
california_housing_test[california_housing_test["housing_median_age"] < 2]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
842,-117.95,35.08,1.0,83.0,15.0,32.0,15.0,4.875,141700.0
1115,-116.95,33.86,1.0,6.0,2.0,8.0,2.0,1.625,55000.0


In [None]:
# Subsetting multiple conditions
# BEST FOR LOOKING ONE SPECIFIC VALUE ONLY
CONDITION1 = sample_df['population'] == 32
CONDITION2 = sample_df['households'] == 15
sample_df[CONDITION1 & CONDITION2]

In [9]:
# 
sample = [32, 8]
sample_df[sample_df['population'].isin(sample)]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
676,-118.25,34.09,52.0,104.0,20.0,32.0,17.0,3.75,241700.0
740,-117.12,32.66,52.0,16.0,4.0,8.0,3.0,1.125,60000.0
842,-117.95,35.08,1.0,83.0,15.0,32.0,15.0,4.875,141700.0
1115,-116.95,33.86,1.0,6.0,2.0,8.0,2.0,1.625,55000.0
1726,-117.05,33.03,16.0,87.0,20.0,32.0,21.0,4.3571,144600.0


In [4]:
# `.isin` method
# BEST FOR LOOKING MULTIPLE SPECIFIC VALUES
sample_df[sample_df['median_income'].isin([3.7500])]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
454,-122.53,37.86,38.0,1183.0,196.0,628.0,205.0,3.75,478600.0
505,-118.18,34.05,41.0,762.0,147.0,817.0,176.0,3.75,123100.0
676,-118.25,34.09,52.0,104.0,20.0,32.0,17.0,3.75,241700.0
841,-117.97,33.75,32.0,1564.0,270.0,973.0,290.0,3.75,190400.0
1478,-122.46,37.77,52.0,1824.0,388.0,799.0,363.0,3.75,435700.0
