# Chapter1. Python (Pandas) Basics

# Class 상속 기초

In [1]:
# Class 상속을 이해하기 위한 기초 코드

class Animal:
    """Minimal base class used to demonstrate inheritance.

    Each instance stores the given ``name`` and a ``default_age`` of 10,
    and announces its own creation on stdout.
    """

    def __init__(self, name):
        """Store *name*, set the default age and print a creation notice."""
        self.name, self.default_age = name, 10
        created_msg = f"Animal {self.name} created."
        print(created_msg)

    def speak(self):
        """Return the generic sound as a string (nothing is printed)."""
        return "Some sound"

    def intro(self):
        """Print a one-line self-introduction containing the name and age."""
        intro_msg = f"I'm Animal named {self.name}, and my age is {self.default_age}."
        print(intro_msg)

# Create a base-class instance; Animal.__init__ prints the creation message.
A1 = Animal("Buddy")
# speak() returns the sound as a string (it does not print), so print it here.
A1_speak = A1.speak()
print(A1_speak)
# intro() prints the introduction itself.
A1.intro()
# default_age was set to 10 inside Animal.__init__.
print(A1.default_age)

Animal Buddy created.
Some sound
I'm Animal named Buddy, and my age is 10.
10


In [2]:
class Dog(Animal):
    """Animal subclass that overrides only speak().

    No __init__ is defined here, so creating a Dog runs Animal.__init__
    unchanged, and intro() is inherited as-is.
    """
    def speak(self):
        # Replaces Animal.speak(), which returns "Some sound".
        return "Bark"

# Dog has no __init__ of its own, so Animal.__init__ runs and prints the message.
A2 = Dog("Rex")
# The overridden speak() is used.
A2_speak = A2.speak()
print(A2_speak)
# intro() and default_age come unchanged from Animal.
A2.intro()
print(A2.default_age)

Animal Rex created.
Bark
I'm Animal named Rex, and my age is 10.
10


In [None]:
# Dog2 inherits from Animal and changes an attribute inside its own __init__.
# super().__init__() must be called first: it is what runs Animal.__init__ and
# creates self.name and self.default_age. Without that call the attributes are
# never set, and reading self.name below would raise AttributeError.
# Accepting *args / **kwargs and forwarding them is a convenient (not the only)
# way to hand every constructor argument to the parent without repeating the
# parent's signature.
# *args collects positional arguments into a tuple, e.g.
#   (1, 2, 3, 'hello', [1, 2, 3], (4, 5, 6))
# **kwargs collects keyword arguments into a dict, e.g.
#   {'name': 'Rex', 'age': 5}

class Dog2(Animal):
    """Animal subclass with its own __init__, speak() and intro()."""

    def __init__(self, *args, **kwargs):
        """Initialise through Animal.__init__, then lower default_age to 5."""
        super().__init__(*args, **kwargs)
        # Overwrites the value 10 that Animal.__init__ has just assigned.
        self.default_age = 5
        # NOTE: Animal.__init__ already printed this exact line, so the
        # message appears twice when a Dog2 is created.
        print(f"Animal {self.name} created.")

    def speak(self):
        """Return the dog's sound."""
        return "Bark"

    def intro(self):
        """Print a one-line self-introduction containing the name and age."""
        print(f"I'm Dog named {self.name}, and my age is {self.default_age}.")

# Creating a Dog2 prints the creation message twice: once from
# Animal.__init__ (via super()) and once from Dog2.__init__.
A3 = Dog2("Rex")
A3_speak = A3.speak()
print(A3_speak)
# Dog2 overrides intro(), so the text says "Dog" instead of "Animal".
A3.intro()
# 5, because Dog2.__init__ overwrote the parent's default of 10.
print(A3.default_age)

Animal Rex created.
Animal Rex created.
Bark
I'm Dog named Rex, and my age is 5.
5


# Pandas 기본
> pd.DataFrame(데이터 프레임)
- attribute: data, index, columns

- pd.read_csv(): 파일 로드
- df.info(): 변수 종류, 타입
- df.columns: 칼럼 명
- df.loc, df.iloc: 행, 열 조회 (loc = location: label / boolean 기반, iloc = index location: 정수 위치(index) 기반)
- value_counts()
- sort_values(by= )
- groupby()
- apply()

(더 공부하고 싶다면)  
https://datadoctorblog.com/2023/04/02/Py-Basic-Pandas-DataFrame/

In [4]:
import pandas as pd
import numpy as np

### Data Read

In [5]:
# Monthly production data, loaded straight from the course repository.
s = 'https://raw.githubusercontent.com/jongmoonha/AI-ME-Practice/refs/heads/main/data/production_monthly_1990.csv'
df = pd.read_csv(s) # volumn: production volume (the column is spelled this way in the data)
# (rows, columns)
print(df.shape)
# First three rows; as the last expression of the cell it is displayed by the notebook.
df.head(3)

(332, 3)


Unnamed: 0,year,month,volumn
0,1990,1,1199.87
1,1990,2,1214.54
2,1990,3,1138.17


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of each column.
df.describe()

Unnamed: 0,year,month,volumn
count,332.0,332.0,332.0
mean,2003.337349,6.451807,1309.801446
std,8.000419,3.447592,159.087274
min,1990.0,1.0,954.78
25%,1996.0,3.0,1198.0125
50%,2003.0,6.0,1298.8
75%,2010.0,9.0,1415.9675
max,2017.0,12.0,1678.98


### Sort by Values

In [7]:
# Rows ordered by year, latest year first.
print(df.sort_values(by="year", ascending=False))

     year  month   volumn
331  2017      8  1368.74
330  2017      7  1231.03
329  2017      6  1250.43
328  2017      5  1223.05
327  2017      4  1289.15
..    ...    ...      ...
8    1990      9  1303.67
9    1990     10  1493.49
10   1990     11  1598.42
11   1990     12  1613.31
0    1990      1  1199.87

[332 rows x 3 columns]


### Counts

In [8]:
# Number of rows per year, ordered by the count (largest first).
print(df.value_counts("year"))

year
1990    12
1991    12
2016    12
2015    12
2014    12
2013    12
2012    12
2011    12
2010    12
2009    12
2008    12
2007    12
2006    12
2005    12
2004    12
2003    12
2002    12
2001    12
2000    12
1999    12
1998    12
1997    12
1996    12
1995    12
1994    12
1993    12
1992    12
2017     8
Name: count, dtype: int64


In [9]:
# Same counts, re-ordered by the year label instead of by the count.
print(df.value_counts("year").sort_index(ascending=True))

year
1990    12
1991    12
1992    12
1993    12
1994    12
1995    12
1996    12
1997    12
1998    12
1999    12
2000    12
2001    12
2002    12
2003    12
2004    12
2005    12
2006    12
2007    12
2008    12
2009    12
2010    12
2011    12
2012    12
2013    12
2014    12
2015    12
2016    12
2017     8
Name: count, dtype: int64


### loc

In [10]:
# .loc selects by index label: this returns the row labelled 0 as a Series.
print(df.loc[0])
# Plain df[...] selects by column label, so df[0] would look for a column
# named 0 (presumably a KeyError here, since the columns are year/month/volumn).
# print(df[0])

year      1990.00
month        1.00
volumn    1199.87
Name: 0, dtype: float64


### groupby
- 연도별 생산량 평균 구하기  
1) .loc  
2) for문   
3) groupby   
refer: https://datadoctorblog.com/2023/04/28/Py-Basic-Pandas-Groupby/

In [11]:
# df.sort_values(by='year').value_counts()
# Distinct years as a Python list whose elements are NumPy integers.
list_year = list(df['year'].unique())
print(list_year)

[np.int64(1990), np.int64(1991), np.int64(1992), np.int64(1993), np.int64(1994), np.int64(1995), np.int64(1996), np.int64(1997), np.int64(1998), np.int64(1999), np.int64(2000), np.int64(2001), np.int64(2002), np.int64(2003), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2009), np.int64(2010), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017)]


#### Find Index

In [12]:
# Boolean Series: True for the rows whose year equals the first entry of list_year.
df["year"]==list_year[0]

0       True
1       True
2       True
3       True
4       True
       ...  
327    False
328    False
329    False
330    False
331    False
Name: year, Length: 332, dtype: bool

#### Extract the rows of one year using the boolean index

In [13]:
# Passing the boolean mask to .loc keeps only the rows of the first year.
df_each_year = df.loc[df["year"]==list_year[0]]
print(df_each_year)

    year  month   volumn
0   1990      1  1199.87
1   1990      2  1214.54
2   1990      3  1138.17
3   1990      4  1061.09
4   1990      5  1064.13
5   1990      6  1113.13
6   1990      7  1163.86
7   1990      8  1227.80
8   1990      9  1303.67
9   1990     10  1493.49
10  1990     11  1598.42
11  1990     12  1613.31


#### Volume of a single year

In [14]:
# Select the rows of the first year and the "volumn" column in a single .loc
# call (row mask, column label) instead of chaining two indexing operations.
df_year_volumn = df.loc[df["year"] == list_year[0], "volumn"]
print(df_year_volumn)

0     1199.87
1     1214.54
2     1138.17
3     1061.09
4     1064.13
5     1113.13
6     1163.86
7     1227.80
8     1303.67
9     1493.49
10    1598.42
11    1613.31
Name: volumn, dtype: float64


#### Mean

In [15]:
# Mean production volume of that single year.
print(df_year_volumn.mean())

1265.9566666666667


#### Mean Volume for every year

In [16]:
# 2) for-loop approach: repeat the single-year computation for every year
#    and collect the means in a list (same order as list_year).
yr_volumn_mean = []
for year in list_year:
    yr_volumn_mean.append(
        df.loc[df["year"] == year, "volumn"].mean()
    )
yr_volumn_mean

[np.float64(1265.9566666666667),
 np.float64(1316.8275),
 np.float64(1231.8875),
 np.float64(1255.1999999999998),
 np.float64(1255.1266666666666),
 np.float64(1270.1575),
 np.float64(1322.3174999999999),
 np.float64(1347.9641666666666),
 np.float64(1425.135),
 np.float64(1396.5541666666666),
 np.float64(1462.28),
 np.float64(1446.6299999999999),
 np.float64(1385.381666666667),
 np.float64(1379.4608333333333),
 np.float64(1394.5916666666665),
 np.float64(1439.4350000000002),
 np.float64(1380.4625000000003),
 np.float64(1300.735),
 np.float64(1186.1866666666667),
 np.float64(1133.7708333333335),
 np.float64(1242.165),
 np.float64(1207.1499999999999),
 np.float64(1199.9991666666667),
 np.float64(1251.1333333333332),
 np.float64(1250.7891666666667),
 np.float64(1330.1916666666668),
 np.float64(1301.9941666666666),
 np.float64(1287.5349999999999)]

#### Mean Volume for every year (Using group by)

In [17]:
# Group the volumn column by (year, month) pairs and by year alone.
df_groupby_year_month = df.groupby(["year","month"])["volumn"]
df_groupby_year = df.groupby(["year"])["volumn"]
# Printing a GroupBy object shows only its repr; values appear once an
# aggregation such as .mean() is called on it.
print(df_groupby_year_month)

<pandas.core.groupby.generic.SeriesGroupBy object at 0x1125416a0>


In [18]:
# Mean per (year, month) pair: the result has as many entries as df has rows,
# i.e. every pair holds a single row and its mean is just that row's value.
print(df_groupby_year_month.mean())
# Mean per year: the same numbers as the for-loop version above.
print(df_groupby_year.mean())

year  month
1990  1        1199.87
      2        1214.54
      3        1138.17
      4        1061.09
      5        1064.13
                ...   
2017  4        1289.15
      5        1223.05
      6        1250.43
      7        1231.03
      8        1368.74
Name: volumn, Length: 332, dtype: float64
year
1990    1265.956667
1991    1316.827500
1992    1231.887500
1993    1255.200000
1994    1255.126667
1995    1270.157500
1996    1322.317500
1997    1347.964167
1998    1425.135000
1999    1396.554167
2000    1462.280000
2001    1446.630000
2002    1385.381667
2003    1379.460833
2004    1394.591667
2005    1439.435000
2006    1380.462500
2007    1300.735000
2008    1186.186667
2009    1133.770833
2010    1242.165000
2011    1207.150000
2012    1199.999167
2013    1251.133333
2014    1250.789167
2015    1330.191667
2016    1301.994167
2017    1287.535000
Name: volumn, dtype: float64


#### agg

In [19]:
# Several statistics at once: one output column per name in the list.
print(df_groupby_year.agg(["min", "max", "mean"]))

          min      max         mean
year                               
1990  1061.09  1613.31  1265.956667
1991  1139.07  1551.77  1316.827500
1992  1076.76  1466.11  1231.887500
1993  1061.28  1497.98  1255.200000
1994  1044.15  1499.80  1255.126667
1995  1123.16  1546.87  1270.157500
1996  1137.70  1633.81  1322.317500
1997  1168.54  1632.32  1347.964167
1998  1273.98  1643.83  1425.135000
1999  1275.27  1614.78  1396.554167
2000  1304.64  1664.45  1462.280000
2001  1355.22  1555.92  1446.630000
2002  1222.20  1542.89  1385.381667
2003  1234.22  1593.49  1379.460833
2004  1226.44  1597.74  1394.591667
2005  1281.62  1678.98  1439.435000
2006  1220.54  1584.31  1380.462500
2007  1096.65  1460.84  1300.735000
2008  1021.42  1390.49  1186.186667
2009   954.78  1484.07  1133.770833
2010  1073.90  1515.33  1242.165000
2011  1075.52  1401.91  1207.150000
2012  1024.21  1393.68  1199.999167
2013  1101.10  1478.89  1251.133333
2014  1057.14  1548.74  1250.789167
2015  1161.36  1520.93  1330.191667

#### apply
- apply(func="mean", axis=1)
- apply(lambda x: x.mean()+100, axis=1)
- apply(사용자함수, axis=1)  
ref: https://datadoctorblog.com/2023/05/01/Py-Basic-Pandas-Apply/

In [20]:
# Passing the method name as a string yields the same per-year means as .mean().
df_groupby_year.apply("mean")

year
1990    1265.956667
1991    1316.827500
1992    1231.887500
1993    1255.200000
1994    1255.126667
1995    1270.157500
1996    1322.317500
1997    1347.964167
1998    1425.135000
1999    1396.554167
2000    1462.280000
2001    1446.630000
2002    1385.381667
2003    1379.460833
2004    1394.591667
2005    1439.435000
2006    1380.462500
2007    1300.735000
2008    1186.186667
2009    1133.770833
2010    1242.165000
2011    1207.150000
2012    1199.999167
2013    1251.133333
2014    1250.789167
2015    1330.191667
2016    1301.994167
2017    1287.535000
Name: volumn, dtype: float64

In [21]:
def mean_add_100(x):
    """Return the mean of *x* shifted up by 100.

    *x* only needs to provide a ``mean()`` method; in this notebook it is the
    group of values handed over by ``groupby(...).apply``.
    """
    average = x.mean()
    return average + 100

In [22]:
# apply() also accepts a user-defined function; here each year's mean is raised by 100.
print(df_groupby_year.apply(mean_add_100))

year
1990    1365.956667
1991    1416.827500
1992    1331.887500
1993    1355.200000
1994    1355.126667
1995    1370.157500
1996    1422.317500
1997    1447.964167
1998    1525.135000
1999    1496.554167
2000    1562.280000
2001    1546.630000
2002    1485.381667
2003    1479.460833
2004    1494.591667
2005    1539.435000
2006    1480.462500
2007    1400.735000
2008    1286.186667
2009    1233.770833
2010    1342.165000
2011    1307.150000
2012    1299.999167
2013    1351.133333
2014    1350.789167
2015    1430.191667
2016    1401.994167
2017    1387.535000
Name: volumn, dtype: float64


# 통계값 구하기
- df 중 NaN 값은 제외하고, numeric column 만 계산
- df 기본 통계 기능: .mean(), .median(), .var()
- df.describe()

#### Load data

In [23]:
# Iris data in which some entries are missing (printed as NaN).
s = 'https://raw.githubusercontent.com/jongmoonha/AI-ME-Practice/refs/heads/main/data/iris_missing.csv'
df_iris = pd.read_csv(s)
print(df_iris)
# The "count" row differs between columns because missing values are not counted.
df_iris.describe()

     Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
0             5.1          NaN           NaN          0.2     setosa
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  virginica
146           NaN          2.5           NaN          NaN  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          NaN  virginica

[150 rows x 5 columns]


Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,143.0,137.0,136.0,144.0
mean,5.853846,3.070073,3.811765,1.190972
std,0.823576,0.438642,1.749965,0.768243
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


#### Read column names

In [24]:
# Column labels of the iris frame.
df_iris.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

#### Find NaN Rows

In [25]:
# Element-wise mask: True where a value is present, False where it is NaN.
df_iris_notna = df_iris.notna()
# Row-wise reduction: True only for rows with no missing value in any column.
df_iris_notna_rows = df_iris_notna.all(axis=1)

print(df_iris_notna)
print(df_iris_notna_rows)
# How many rows are complete (True) and how many are not (False).
print("\n\n", df_iris_notna_rows.value_counts())

     Sepal.Length  Sepal.Width  Petal.Length  Petal.Width  Species
0            True        False         False         True     True
1            True         True          True         True     True
2            True         True          True         True     True
3            True         True          True         True     True
4            True         True          True         True     True
..            ...          ...           ...          ...      ...
145          True         True          True         True     True
146         False         True         False        False     True
147          True         True          True         True     True
148          True         True          True         True     True
149          True         True          True        False     True

[150 rows x 5 columns]
0      False
1       True
2       True
3       True
4       True
       ...  
145     True
146    False
147     True
148     True
149    False
Length: 150, dtype: bool


 True     126
False     24
Name: count, dtype: int64

In [26]:
# Keep only the complete rows.
# NOTE: this rebinds df_iris_notna from the boolean mask to the filtered DataFrame.
df_iris_notna = df_iris.loc[df_iris_notna_rows]
print(df_iris_notna)

     Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
5             5.4          3.9           1.7          0.4     setosa
..            ...          ...           ...          ...        ...
143           6.8          3.2           5.9          2.3  virginica
144           6.7          3.3           5.7          2.5  virginica
145           6.7          3.0           5.2          2.3  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica

[126 rows x 5 columns]


In [27]:
# Keep the numeric columns only. Selecting by dtype (rather than "every column
# except the last") does not depend on where the text column sits in the frame.
df_iris_notna_num = df_iris_notna.select_dtypes(include="number")
df_iris_notna_num

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
...,...,...,...,...
143,6.8,3.2,5.9,2.3
144,6.7,3.3,5.7,2.5
145,6.7,3.0,5.2,2.3
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [28]:
# Statistics over the complete rows only, so every column has the same count.
df_iris_notna_num.describe()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,126.0,126.0,126.0,126.0
mean,5.857937,3.080159,3.783333,1.219841
std,0.82349,0.443084,1.750828,0.764515
min,4.3,2.0,1.0,0.1
25%,5.125,2.8,1.6,0.325
50%,5.8,3.0,4.4,1.3
75%,6.4,3.375,5.1,1.8
max,7.9,4.4,6.9,2.5


# Correlation Analysis
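- pandas: `.corr()` (default `method="pearson"`; pass `method=` to choose another, e.g. `"spearman"`)
- `scipy.stats` (`pearsonr`, `spearmanr`, `kendalltau`) can compute it too, and also returns a p-value

reference: https://datadoctorblog.com/2023/08/08/Py-Stat-correlation-analysis/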

In [29]:
# Reload the iris data so that this cell can be run on its own.
s = 'https://raw.githubusercontent.com/jongmoonha/AI-ME-Practice/refs/heads/main/data/iris_missing.csv'
df_iris = pd.read_csv(s)

print(df_iris)

# True for the rows that have no missing value in any column.
df_iris_notna_rows = df_iris.notna().all(axis=1)
# Complete rows only.
df_iris_notna = df_iris.loc[df_iris_notna_rows]
# Numeric columns only, selected by dtype rather than by position.
df_iris_notna_num = df_iris_notna.select_dtypes(include="number")

# Pairwise Pearson correlation (the default method) between the numeric columns.
df_iris_notna_num.corr()

     Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
0             5.1          NaN           NaN          0.2     setosa
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  virginica
146           NaN          2.5           NaN          NaN  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          NaN  virginica

[150 rows x 5 columns]


Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
Sepal.Length,1.0,-0.07137,0.860217,0.808743
Sepal.Width,-0.07137,1.0,-0.406533,-0.338672
Petal.Length,0.860217,-0.406533,1.0,0.96136
Petal.Width,0.808743,-0.338672,0.96136,1.0


In [30]:
# Same correlation table, using Spearman's rank correlation instead of the default.
df_iris_notna_num.corr(method = "spearman")

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
Sepal.Length,1.0,-0.113584,0.873034,0.823997
Sepal.Width,-0.113584,1.0,-0.272651,-0.259357
Petal.Length,0.873034,-0.272651,1.0,0.9406
Petal.Width,0.823997,-0.259357,0.9406,1.0


In [31]:
from scipy.stats import pearsonr, spearmanr, kendalltau
# Each call takes two columns and returns a result holding the correlation
# statistic together with its p-value.
print(pearsonr(df_iris_notna_num["Sepal.Length"], df_iris_notna_num["Petal.Length"]))
print(spearmanr(df_iris_notna_num["Sepal.Length"], df_iris_notna_num["Petal.Length"]))
print(kendalltau(df_iris_notna_num["Sepal.Length"], df_iris_notna_num["Petal.Length"]))

PearsonRResult(statistic=np.float64(0.8602174805176663), pvalue=np.float64(4.46242199286702e-38))
SignificanceResult(statistic=np.float64(0.8730341892821037), pvalue=np.float64(1.7324794500960308e-40))
SignificanceResult(statistic=np.float64(0.7064455853595843), pvalue=np.float64(4.351505558359675e-30))
