01. DataFrame and Series

In [1]:
import pandas as pd

# A Series is a one-dimensional labelled array; with no explicit index,
# pandas labels the values 0, 1, 2, ...
menu = pd.Series(
    data=['비빔밥', '김치찌개', '된장찌개'],
)
menu

0     비빔밥
1    김치찌개
2    된장찌개
dtype: object

In [2]:
# Integer values give the Series an integer dtype.
price = pd.Series(data=[10_000, 9_000, 8_000])
price

0    10000
1     9000
2     8000
dtype: int64

In [4]:
# DataFrame: combine the two Series into one table. Each keyword becomes a
# column name and each Series supplies that column's values.
pd.DataFrame(dict(Menu=menu, Price=price))

Unnamed: 0,Menu,Price
0,비빔밥,10000
1,김치찌개,9000
2,된장찌개,8000


In [5]:
# Build the same table row by row: each tuple is one record and `columns`
# names the fields in order.
df = pd.DataFrame(
    [
        ('Buger', 10000, 'US'),
        ('Pizza', 9000, 'US'),
        ("Corn", 8000, 'US'),
    ],
    columns=["Menu", "Price", "Origin"],
)

df

Unnamed: 0,Menu,Price,Origin
0,Buger,10000,US
1,Pizza,9000,US
2,Corn,8000,US


In [6]:
# Single brackets with one column label select that column as a Series.
df["Menu"]

0    Buger
1    Pizza
2     Corn
Name: Menu, dtype: object

In [7]:
# Double brackets (a list of labels) keep the result as a DataFrame, even for one column.
df[["Menu"]]

Unnamed: 0,Menu
0,Buger
1,Pizza
2,Corn


In [8]:
# A list of several labels selects those columns, in the listed order, as a DataFrame.
df[["Menu", "Price"]]

Unnamed: 0,Menu,Price
0,Buger,10000
1,Pizza,9000
2,Corn,8000


In [10]:
# The list of labels can be stored in a variable first and then used as the selector.
cols=['Menu','Price']
df[cols]

Unnamed: 0,Menu,Price
0,Buger,10000
1,Pizza,9000
2,Corn,8000


In [11]:
# Show which container type each selection style produces:
# the whole frame, a single-bracket selection, and a double-bracket selection.
for label, obj in (
    ("df   :", df),
    ("df['Price'] :", df['Price']),
    ("df[['Price']] :", df[['Price']]),
):
    print(label, type(obj))

df   : <class 'pandas.core.frame.DataFrame'>
df['Price'] : <class 'pandas.core.series.Series'>
df[['Price']] : <class 'pandas.core.frame.DataFrame'>


02. Data Import and Export

In [12]:
# Coffee menu used for the import/export examples, written one drink per row.
df = pd.DataFrame(
    [
        ("Americano", 4500, 10),
        ("Latte", 5000, 110),
        ("Mocha", 5500, 250),
        ("Cappuccino", 5000, 110),
        ("Espresso", 4000, 20),
        ("Milktea", 5900, 210),
        ("Green Tea", 5300, 0),
    ],
    columns=["Menu", "Price", "Calories"],
)

df

Unnamed: 0,Menu,Price,Calories
0,Americano,4500,10
1,Latte,5000,110
2,Mocha,5500,250
3,Cappuccino,5000,110
4,Espresso,4000,20
5,Milktea,5900,210
6,Green Tea,5300,0


In [13]:
# Write the frame to CSV with default settings; the row index is saved as an extra, unnamed first column.
df.to_csv('coffee_menu.csv')

In [15]:
# Reading the file back turns the saved index into a regular column named "Unnamed: 0".
temp_df = pd.read_csv('coffee_menu.csv')
temp_df.head()

Unnamed: 0.1,Unnamed: 0,Menu,Price,Calories
0,0,Americano,4500,10
1,1,Latte,5000,110
2,2,Mocha,5500,250
3,3,Cappuccino,5000,110
4,4,Espresso,4000,20


In [None]:
# Write without the index column (index=False) so the file holds only Menu, Price and Calories.
df.to_csv("cafe.csv", index=False)
# Reload from disk; this rebinds df to the frame read from cafe.csv.
df= pd.read_csv("cafe.csv")
df.head()

Unnamed: 0,Menu,Price,Calories
0,Americano,4500,10
1,Latte,5000,110
2,Mocha,5500,250
3,Cappuccino,5000,110
4,Espresso,4000,20


In [17]:
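## Advanced read_csv options (reference only; 'data.csv' is a placeholder file name)
# index_col: use the given column as the row index instead of the default 0..n-1 index
# pd.read_csv('data.csv', index_col=0)

# usecols: load only the listed columns
# pd.read_csv('data.csv', usecols=['col1', 'col2'])

# parse_dates: parse the listed columns as datetimes
# pd.read_csv('data.csv', parse_dates=['date_col'])

# encoding: text encoding used to decode the file
# pd.read_csv('data.csv', encoding='utf-8')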

03. EDA

In [18]:
# Exploratory Data Analysis (EDA) is an important step before data preprocessing: it builds an understanding of the dataset.
# It involves identifying missing values, outliers, patterns, and variable characteristics.
# It helps you observe and understand the data from multiple perspectives.

In [19]:
# Preview the first 2 rows.
df.head(2)

Unnamed: 0,Menu,Price,Calories
0,Americano,4500,10
1,Latte,5000,110


In [20]:
# Draw 3 random rows. A fixed random_state makes the draw repeatable, so the
# displayed rows stay the same on every Restart & Run All.
df.sample(3, random_state=42)

Unnamed: 0,Menu,Price,Calories
6,Green Tea,5300,0
0,Americano,4500,10
2,Mocha,5500,250


In [21]:
# (number of rows, number of columns)
df.shape

(7, 3)

In [22]:
# Print the index range, each column's non-null count and dtype, and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Menu      7 non-null      object
 1   Price     7 non-null      int64 
 2   Calories  7 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 300.0+ bytes


In [23]:
# Pairwise correlation between the numeric columns; numeric_only=True leaves out the text column Menu.
df.corr(numeric_only=True)

Unnamed: 0,Price,Calories
Price,1.0,0.713227
Calories,0.713227,1.0


In [24]:
# Small categorical dataset: one (car type, size) pair per row.
df_car = pd.DataFrame(
    [
        ('Sedan', 'S'), ('SUV', 'M'), ('Sedan', 'S'), ('SUV', 'S'),
        ('SUV', 'M'), ('SUV', 'M'), ('Sedan', 'L'), ('Sedan', 'S'),
        ('Sedan', 'S'), ('Sedan', 'M'), ('Sedan', 'S'),
    ],
    columns=["car", "size"],
)

df_car.head(3)

Unnamed: 0,car,size
0,Sedan,S
1,SUV,M
2,Sedan,S


In [25]:
# Number of distinct values in each column.
df_car.nunique()

car     2
size    3
dtype: int64

In [26]:
# List the distinct values of each column, in order of first appearance.
for column in ('car', 'size'):
    print(df_car[column].unique())

['Sedan' 'SUV']
['S' 'M' 'L']


In [None]:
# value_counts: how many rows hold each distinct value, most frequent first.
for column in ('car', 'size'):
    print(df_car[column].value_counts())

car
Sedan    7
SUV      4
Name: count, dtype: int64
size
S    6
M    4
L    1
Name: count, dtype: int64


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


Unnamed: 0,Price,Calories
count,7.0,7.0
mean,5028.571429,101.428571
std,631.70216,99.40298
min,4000.0,0.0
25%,4750.0,15.0
50%,5000.0,110.0
75%,5400.0,160.0
max,5900.0,250.0


In [32]:
# include="O" summarizes the object (string) columns instead: count, unique, top (most frequent value), freq.
df_car.describe(include="O")

Unnamed: 0,car,size
count,11,11
unique,2,3
top,Sedan,S
freq,7,6


04. Data Type Conversion

In [36]:
# Same coffee menu, but Price is stored as floats and Calories as strings
# so the dtype conversions that follow have something to fix.
data = dict(
    Menu=["Americano", "Latte", "Mocha", "Cappuccino", "Espresso", "Milktea", "Green Tea"],
    Price=[4500.0, 5000.0, 5500.0, 5000.0, 4000.0, 5900.0, 5300.0],
    Calories=['10', '110', '250', '110', '20', '210', '0'],
)
df = pd.DataFrame(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Menu      7 non-null      object 
 1   Price     7 non-null      float64
 2   Calories  7 non-null      object 
dtypes: float64(1), object(2)
memory usage: 300.0+ bytes


In [None]:
# Convert Price from float64 to an integer dtype with astype.
# NOTE: 'int' maps to the platform's default integer width — int32 in the output
# recorded below; other platforms may report int64. Use 'int64' to pin the width.
df['Price']=df['Price'].astype('int')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Menu      7 non-null      object
 1   Price     7 non-null      int32 
 2   Calories  7 non-null      object
dtypes: int32(1), object(2)
memory usage: 272.0+ bytes


In [38]:
# Calories holds numeric strings; astype('float') parses them into float64.
df["Calories"] = df["Calories"].astype('float')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Menu      7 non-null      object 
 1   Price     7 non-null      int32  
 2   Calories  7 non-null      float64
dtypes: float64(1), int32(1), object(1)
memory usage: 272.0+ bytes


05. Add new columns

In [40]:
# Reload the cafe menu from cafe.csv so this section starts from a fresh frame.
df = pd.read_csv("cafe.csv")
df.head(2)

Unnamed: 0,Menu,Price,Calories
0,Americano,4500,10
1,Latte,5000,110


In [42]:
# Assigning a scalar to a new column label creates the column and fills every row with that value.
df['New'] = 0
df.head(2)

Unnamed: 0,Menu,Price,Calories,New
0,Americano,4500,10,0
1,Latte,5000,110,0


In [43]:
# Derive a new column from an existing one: apply a 20% discount to every Price (element-wise).
discount = 0.2
df['Discounted Price'] = df['Price'] * (1 - discount)
df.head(2)

Unnamed: 0,Menu,Price,Calories,New,Discounted Price
0,Americano,4500,10,0,3600.0
1,Latte,5000,110,0,4000.0


06. Deleting Data

In [44]:
# With drop, axis=0 targets row labels (the index) and axis=1 targets column labels.

# Reload cafe.csv so the deletions start from the full menu.
df = pd.read_csv("cafe.csv")
df.head(3)

Unnamed: 0,Menu,Price,Calories
0,Americano,4500,10
1,Latte,5000,110
2,Mocha,5500,250


In [45]:
# Delete the row whose index label is 1 (axis=0 selects rows).
# inplace=True modifies df itself and returns None, so re-running this cell
# raises KeyError because label 1 is already gone.
df.drop(1, axis=0, inplace=True) # inplace=True means changes are applied to the original DataFrame
df.head(3)

Unnamed: 0,Menu,Price,Calories
0,Americano,4500,10
2,Mocha,5500,250
3,Cappuccino,5000,110


In [47]:
# Deleting a column
# Instead of inplace=True, we can reassign the result: df = df.drop('column_name', axis=1)
df = df.drop("Calories", axis=1)
df.head(3)

Unnamed: 0,Menu,Price
0,Americano,4500
2,Mocha,5500
3,Cappuccino,5000


07. Indexing/Slicing with loc