# Working with DataFrames

In [None]:
import pandas as pd

In [None]:
df = pd.read_excel("../data/LungCapData.xls")

In [None]:
df.head(10)

In [None]:
df.Gender.head(10)

In [None]:
df.Gender == "male"

In [None]:
df_male = df[df.Gender == "male"] 

In [None]:
df_female = df[df.Gender == "female"]

In [None]:
df.shape

In [None]:
df_male.shape 

In [None]:
df_female.shape

In [None]:
df.loc[df.Gender == "female", "Smoke"]

In [None]:
mask1 = df.Gender == "male"
mask1

In [None]:
# Keep only the rows for which the boolean mask is True.
df_male = df[mask1]
# Preview the filtered frame (not the unfiltered `df`) to confirm the mask was applied.
df_male.head()

In [None]:
df_male = df.loc[mask1] 
df_male.head() 

In [None]:
df.dtypes # == object

In [None]:
mask2 = df.dtypes == object
mask2

In [None]:
df.loc[:, mask2].head() 

In [None]:
df.loc[:, ~mask2].head() 

In [None]:
mask1

In [None]:
df.loc[mask1, ~mask2].head() 

## Filtering DataFrames with many Conditions (AND)

In [None]:
2 == 2 and 2 > 3  

In [None]:
2 == 2 and 3 > 2 

In [None]:
2 == 2 or 2 > 3 

In [None]:
2 < 1 or 3 == 2 

In [None]:
import pandas as pd

In [None]:
df = pd.read_excel("../data/LungCapData.xls")

In [None]:
df.head(10) 

In [None]:
df.Gender == "male"

In [None]:
df.Smoke == "yes"

In [None]:
(df.Gender == "male") & (df.Smoke == "yes")

In [None]:
df[(df.Gender == "male") & (df.Smoke == "yes")]

In [None]:
df.head() 

In [None]:
(df.Gender == "male") | (df.Smoke == "yes")

In [None]:
df[(df.Gender == "male") | (df.Smoke == "yes")]

In [None]:
mask1 = df.Gender == "male"
mask1.head()

In [None]:
df.Age > 14

In [None]:
mask2 = df.Age > 14
mask2.head()

In [None]:
(mask1 & mask2).head()

In [None]:
df.columns

In [None]:
male_adult = df.loc[mask1 & mask2, ["Smoke", "Caesarean"]]
male_adult.head(5) 

In [None]:
male_adult.info()

In [None]:
male_adult.describe()

In [None]:
df.describe()

### Filtering DataFrames with many Conditions (OR)

In [None]:
import pandas as pd

In [None]:
df = pd.read_excel("../data/LungCapData.xls")

In [None]:
df.head() 

In [None]:
df.Gender == 'female'

In [None]:
df.Caesarean == 'yes'

In [None]:
# Element-wise OR: True where Gender is 'female', Caesarean is 'yes', or both.
# This is the mask that the next cell uses to filter the DataFrame.
(df.Gender == 'female') | (df.Caesarean == 'yes')

In [None]:
df[(df.Gender == 'female') | (df.Caesarean == 'yes')]

In [None]:
mask1 = df.Gender == "female"
mask1.head(5)

In [None]:
mask2 = df.Age < 14
mask2.head(5)

In [None]:
(mask1 | mask2).head(10)

In [None]:
df.loc[mask1 | mask2].head()  

In [None]:
female_adult = df.loc[mask1 | mask2, ["Smoke", "Caesarean"]]

In [None]:
female_adult.head()

In [None]:
female_adult.info()

In [None]:
female_adult.describe()

In [None]:
df.describe()

## Advanced Filtering with between(), isin() and ~

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("../data/gapminder.csv")

In [None]:
df.head()

In [None]:
year_1952 = df.loc[df.year == 1952]

In [None]:
year_1952.head()

In [None]:
year_1952.tail()

In [None]:
year_1952.info()

In [None]:
since1952 = df.loc[df.year >= 1952]

In [None]:
since1952.head()

In [None]:
since1952.tail()

In [None]:
df.year.between(1960, 1969).head()

In [None]:
# Keep the rows whose year lies in 1960..1969, both endpoints included.
# `inclusive` takes "both" / "neither" / "left" / "right"; the boolean form
# (inclusive=True) is deprecated and no longer accepted by pandas 2.x.
df_60s = df.loc[df.year.between(1960, 1969, inclusive="both")]

In [None]:
df_60s.head()

In [None]:
df_60s.tail()

In [None]:
selected_year = [1972, 1996]

In [None]:
df.year.isin(selected_year).head()

In [None]:
L = [4, 5, 6]
60 not in L   

In [None]:
year_df = df.loc[df.year.isin(selected_year)]

In [None]:
year_df.head()

In [None]:
year_df.tail()

In [None]:
og_not_72_96 = df.loc[~df.year.isin(selected_year)]

In [None]:
og_not_72_96.head()

In [None]:
og_not_72_96.year.unique()

### any() and all()

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("../data/gapminder.csv")

In [None]:
df.head() 

In [None]:
df.country == "Bangladesh"

In [None]:
(df.country == "Bangladesh").any()

In [None]:
(df.country == "Bangladesh").all()

In [None]:
(df.year == 2022).any()

In [None]:
pd.Series([-1, 0.5 , 1, -0.1, 0]).any()

In [None]:
(df.continent == "Asia").any() 

### Removing Columns

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("../data/gapminder.csv")

In [None]:
df.head() 

In [None]:
df.drop(columns = "country") 
df.head()

In [None]:
df_new = df.drop(columns = "country") 
df_new.head()

In [None]:
df.drop(columns = ["country", "pop"], inplace=True)
df.head() 

In [None]:
df.drop(labels = "continent", axis = "columns", inplace= True)
df.head() 

In [None]:
df.head() 

### Removing Rows

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("../data/covid19.csv", index_col = "Country/Region")

In [None]:
df.head(10) 

In [None]:
df.drop(index = "Mainland China")
df.head()

In [None]:
df.drop(index = ["Mainland China","Bangladesh"], inplace = True)
df.head() 

### Adding new Columns to a DataFrame

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("../data/500_Person_Gender_Height_Weight_Index.csv")

In [None]:
df.head() 

In [None]:
df["Zeros"] = "Zero" 

In [None]:
df.head()

### Creating Columns based on other Columns

In [None]:
import pandas as pd
import numpy as np 

In [None]:
df = pd.read_csv("../data/500_Person_Gender_Height_Weight_Index.csv")

In [None]:
df.head() 

In [None]:
df['HeightLog10'] = np.log10(df['Height'])
df.head() 

In [None]:
df['WeightLog10'] = np.log10(df['Weight'])
df.head() 

In [None]:
df['HeightInM'] = df['Height']/100 
df.head() 

In [None]:
# BMI = weight / height**2, with height in metres (the HeightInM column).
# NOTE(review): assumes Weight is in kilograms — confirm against the dataset.
# The square has to be written as a power: `w / h * h` is evaluated left to
# right and simply gives back `w`.
df['BMI'] = df['Weight'] / df['HeightInM'] ** 2

In [None]:
df.head() 

## Sorting DataFrames 

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("../data/gapminder.csv")
df.head() 

In [None]:
df.year.sort_values() 

In [None]:
df.sort_values(by = "year").head() 

In [None]:
df.sort_values(by = "pop", ascending = False, inplace = True)
df.head() 

In [None]:
df.sort_values(by = ["country", "continent"], ascending = [True, True], inplace= True)

In [None]:
df.sort_index(ascending = True, inplace = True)

In [None]:
df 

In [None]:
df.sort_values(by = "pop").reset_index(drop = True) 

In [None]:
df.sort_values(by = "pop", ignore_index = True)

### Ranking DataFrames with rank()

In [3]:
import pandas as pd

In [4]:
ages = pd.Series([15, 32, 45, 21, 55, 15, 0],  index = ["A", "B", "C", "D", "E", "F", "G"])

In [5]:
ages = pd.Series([15, 32, 45, 21, 55, 15, 0],  index = ["A", "B", "C", "D", "E", "F", "G"])

In [6]:
ages

A    15
B    32
C    45
D    21
E    55
F    15
G     0
dtype: int64

In [7]:
ages.sort_values(ascending = False)

E    55
C    45
B    32
D    21
A    15
F    15
G     0
dtype: int64

In [8]:
ages.rank(ascending=False, method = "min").sort_values(ascending = True)

E    1.0
C    2.0
B    3.0
D    4.0
A    5.0
F    5.0
G    7.0
dtype: float64

In [None]:
ages.rank(ascending=False, method = "min", pct=True).sort_values() * 100 

In [None]:
df = pd.read_csv("../data/500_Person_Gender_Height_Weight_Index.csv")

In [None]:
df.Height.rank(ascending = False).head() 

In [None]:
df["Height_Rank"] = df.Height.rank(ascending = False, method="min")
df.head() 

In [None]:
df.sort_values("Height", ascending= False).head() 

In [None]:
df.drop(columns = "Height_Rank", inplace= True)

### nunique(), nlargest() and nsmallest() with DataFrames

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("../data/gapminder.csv")

In [None]:
df.head() 

In [None]:
df.tail() 

In [None]:
df.country.unique()

In [None]:
df.nunique(axis = 1, dropna=False)

In [None]:
df.nunique(dropna = False) 

In [None]:
df.nlargest(n = 5, columns = "pop") 

In [None]:
# Manual counterpart of nlargest(n = 5, columns = "pop"): sort by "pop" in
# descending order and take the first five rows.
df.sort_values("pop", ascending = False).head(5)

In [None]:
df.nsmallest(n = 5, columns = "pop")

In [None]:
p = pd.to_numeric(df['pop'])
p.idxmax() 

In [None]:
p = pd.to_numeric(df['pop'])
p.idxmin() 

### Summary Statistics and Accumulations

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("../data/500_Person_Gender_Height_Weight_Index.csv")

In [None]:
df.head()  

In [None]:
df.describe() 

In [None]:
df.count(axis = "columns") 

In [None]:
df.count(axis = 1) 

In [None]:
# Row-wise mean over the numeric columns only. Text columns cannot be
# averaged; pandas 2.x raises an error instead of silently skipping them.
df.mean(axis = 1, numeric_only = True)

In [None]:
df.sum(axis = 0) 

In [None]:
df.head() 

In [None]:
df.Height.cumsum(axis = 0) 

In [None]:
df.corr() 

In [None]:
df.cov() 

### The agg() method

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("../data/500_Person_Gender_Height_Weight_Index.csv")

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.mean() 

In [None]:
df.agg("mean")

In [None]:
df.agg(["mean", "std"])

In [None]:
df.agg(["mean", "std", "min", "max", "median"])

In [None]:
df.agg({"Weight": "mean", "Height":["min", "max"]})

### apply()

In [None]:
import pandas as pd
import numpy as np 

In [None]:
df = pd.read_csv("../data/500_Person_Gender_Height_Weight_Index.csv")

In [None]:
df.head() 

In [None]:
df.info() 

In [None]:
df.min(axis = 0) 

In [None]:
df['HeightLog10'] = df['Height'].apply(lambda x : np.log10(x))
df.head() 

In [None]:
df['WeightCat'] = df['Weight'].apply(lambda x : "High" if x > 65 else "Low")
df.head() 

In [None]:
def heightcat(x):
    """Bucket a height value into "Tall", "Medium" or "Small".

    The thresholds are lower bounds, so every value lands in the bucket of
    the highest threshold it reaches.

    Args:
        x: Height as a number. NOTE(review): presumably centimetres, matching
            the "Height" column this function is applied to — confirm.

    Returns:
        "Tall" for x >= 199, "Medium" for 170 <= x < 199, otherwise "Small".
    """
    if x >= 199:
        return "Tall"
    if x >= 170:
        return "Medium"
    return "Small"

In [None]:
df['HeightCat'] = df['Height'].apply(heightcat)
df.head() 

### String Operations Intro / Refresher

In [None]:
"Hello World"

In [None]:
type("Hello World")

In [None]:
hello = "Hello World"
hello

In [None]:
len(hello)

In [None]:
hello.lower()

In [None]:
hello.upper()

In [None]:
hello.title()

In [None]:
hello.split(" ")

In [None]:
hello.replace("Hello", "Hi")

### String Operations in Pandas

In [None]:
import pandas as pd

In [None]:
df = pd.read_excel("../data/LungCapData.xls")

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.columns.str.lower()

In [None]:
df.columns.str.title() 

In [None]:
df.columns.str.contains('s')