In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing

In [7]:
# Notebook-wide pandas display configuration.
# Show at most 8 columns when printing a frame (the rest are elided as "...").
pd.set_option('display.max_columns', 8)
# Use the full option key: abbreviated keys are only resolved by pattern
# matching and stop working as soon as another option shares the prefix.
# The option makes pandas account for East Asian character widths when
# aligning printed columns.
pd.set_option('display.unicode.east_asian_width', True)

In [8]:
# Load seaborn's bundled "titanic" example dataset and preview the first 20 rows.
df = sns.load_dataset(name='titanic')
print(df.head(n=20))

    survived  pclass     sex   age  ...  deck  embark_town  alive  alone
0          0       3    male  22.0  ...   NaN  Southampton     no  False
1          1       1  female  38.0  ...     C    Cherbourg    yes  False
2          1       3  female  26.0  ...   NaN  Southampton    yes   True
3          1       1  female  35.0  ...     C  Southampton    yes  False
4          0       3    male  35.0  ...   NaN  Southampton     no   True
5          0       3    male   NaN  ...   NaN   Queenstown     no   True
6          0       1    male  54.0  ...     E  Southampton     no   True
7          0       3    male   2.0  ...   NaN  Southampton     no  False
8          1       3  female  27.0  ...   NaN  Southampton    yes  False
9          1       2  female  14.0  ...   NaN    Cherbourg    yes  False
10         1       3  female   4.0  ...     G  Southampton    yes  False
11         1       1  female  58.0  ...     C  Southampton    yes   True
12         0       3    male  20.0  ...   NaN  Sout

In [9]:
# Frequency of every `deck` value; dropna=False keeps NaN as its own bucket
# so the number of missing entries is visible.
nan_deck = df.loc[:, 'deck'].value_counts(dropna=False)
print(nan_deck, type(nan_deck), sep='\n')

NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: deck, dtype: int64
<class 'pandas.core.series.Series'>


In [10]:
# Boolean mask of the first five rows: True where a value is missing.
print(df.head().isna())

   survived  pclass    sex    age  ...   deck  embark_town  alive  alone
0     False   False  False  False  ...   True        False  False  False
1     False   False  False  False  ...  False        False  False  False
2     False   False  False  False  ...   True        False  False  False
3     False   False  False  False  ...  False        False  False  False
4     False   False  False  False  ...   True        False  False  False

[5 rows x 15 columns]


In [11]:
# Boolean mask of the first five rows: True where a value is present.
print(df.head().notna())

   survived  pclass   sex   age  ...   deck  embark_town  alive  alone
0      True    True  True  True  ...  False         True   True   True
1      True    True  True  True  ...   True         True   True   True
2      True    True  True  True  ...  False         True   True   True
3      True    True  True  True  ...   True         True   True   True
4      True    True  True  True  ...  False         True   True   True

[5 rows x 15 columns]


In [12]:
# Number of missing values per column (summing the mask down the rows).
print(df.isna().sum())

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [13]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [14]:
# Keep only the columns that have at least 500 non-missing values;
# the frame is modified in place.
df.dropna(axis='columns', thresh=500, inplace=True)
print(df.columns)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')


In [15]:
# New frame without the rows whose `age` is missing; `df` itself is left untouched.
df_age = df.dropna(axis='index', how='any', subset=['age'])
df_age.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     714 non-null    int64   
 1   pclass       714 non-null    int64   
 2   sex          714 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        714 non-null    int64   
 5   parch        714 non-null    int64   
 6   fare         714 non-null    float64 
 7   embarked     712 non-null    object  
 8   class        714 non-null    category
 9   who          714 non-null    object  
 10  adult_male   714 non-null    bool    
 11  embark_town  712 non-null    object  
 12  alive        714 non-null    object  
 13  alone        714 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 69.2+ KB


In [16]:
# Mean imputation: replace missing ages with the mean of the observed ages
# (Series.mean skips NaN by default).
mean_age = df['age'].mean()
# Assign the filled column back instead of calling fillna(inplace=True) on
# the `df['age']` selection: the in-place call mutates an intermediate Series,
# which newer pandas versions warn about and which does not update `df`
# under Copy-on-Write.
df['age'] = df['age'].fillna(mean_age)
print(df.head(10))

   survived  pclass     sex        age  ...  adult_male  embark_town  alive  \
0         0       3    male  22.000000  ...        True  Southampton     no   
1         1       1  female  38.000000  ...       False    Cherbourg    yes   
2         1       3  female  26.000000  ...       False  Southampton    yes   
3         1       1  female  35.000000  ...       False  Southampton    yes   
4         0       3    male  35.000000  ...        True  Southampton     no   
5         0       3    male  29.699118  ...        True   Queenstown     no   
6         0       1    male  54.000000  ...        True  Southampton     no   
7         0       3    male   2.000000  ...       False  Southampton     no   
8         1       3  female  27.000000  ...       False  Southampton    yes   
9         1       2  female  14.000000  ...       False    Cherbourg    yes   

   alone  
0  False  
1  False  
2   True  
3  False  
4   True  
5   True  
6   True  
7  False  
8  False  
9  False  

[10 rows

In [17]:
# Most common embarkation town: count each non-missing value and take the
# label with the highest count.
most_freq = (
    df['embark_town']
    .value_counts(dropna=True)
    .idxmax()
)
print(most_freq)

Southampton


In [18]:
# Filled copy of the column (missing towns -> most frequent town); `df` is not
# modified, which the second print makes visible for rows 825-829.
df_most_freq = df['embark_town'].fillna(value=most_freq)
print(df_most_freq.iloc[825:830])
print(df.iloc[825:830])

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829    Southampton
Name: embark_town, dtype: object
     survived  pclass     sex        age  ...  adult_male  embark_town  alive  \
825         0       3    male  29.699118  ...        True   Queenstown     no   
826         0       3    male  29.699118  ...        True  Southampton     no   
827         1       2    male   1.000000  ...       False    Cherbourg    yes   
828         1       3    male  29.699118  ...        True   Queenstown    yes   
829         1       1  female  62.000000  ...       False          NaN    yes   

     alone  
825   True  
826   True  
827  False  
828   True  
829   True  

[5 rows x 14 columns]


In [19]:
# Forward-fill missing embarkation towns with the previous row's value.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`, and the result
# is assigned back because an in-place call on the `df['embark_town']`
# selection is not guaranteed to write through to `df`.
df['embark_town'] = df['embark_town'].ffill()
print(df['embark_town'][825:831])

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829     Queenstown
830      Cherbourg
Name: embark_town, dtype: object


In [20]:
# Remove the `survived` and `embarked` columns in place, then re-inspect the schema.
# NOTE(review): this cell is not re-runnable — a second run raises KeyError
# because the columns are already gone.
df.drop(columns=['survived', 'embarked'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   pclass       891 non-null    int64   
 1   sex          891 non-null    object  
 2   age          891 non-null    float64 
 3   sibsp        891 non-null    int64   
 4   parch        891 non-null    int64   
 5   fare         891 non-null    float64 
 6   class        891 non-null    category
 7   who          891 non-null    object  
 8   adult_male   891 non-null    bool    
 9   embark_town  891 non-null    object  
 10  alive        891 non-null    object  
 11  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(3), object(4)
memory usage: 65.5+ KB


In [21]:
# Re-count missing values per column after the cleaning steps.
print(df.isna().sum())

pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64


In [22]:
# Small hand-made frame for the duplicate-handling examples; rows 0 and 1 are
# identical. Note that this rebinds `df`.
df = pd.DataFrame(
    data={
        'c1': ['a', 'a', 'b', 'a', 'b'],
        'c2': [1, 1, 1, 2, 2],
        'c3': [1, 1, 2, 2, 2],
    }
)
print(df)

  c1  c2  c3
0  a   1   1
1  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2


In [23]:
# Flag rows that repeat an earlier row across all columns
# (the first occurrence is not flagged).
df_dup = df.duplicated(keep='first')
print(df_dup)

0    False
1     True
2    False
3    False
4    False
dtype: bool


In [24]:
# Flag values of column `c2` that already appeared in an earlier row.
df_dup = df.loc[:, 'c2'].duplicated(keep='first')
print(df_dup)

0    False
1     True
2     True
3    False
4     True
Name: c2, dtype: bool


In [25]:
# Copy of `df` without fully duplicated rows; the first occurrence is kept.
df2 = df.drop_duplicates(keep='first')
print(df2)

  c1  c2  c3
0  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2


In [26]:
# De-duplicate on the (`c2`, `c3`) pair only; `c1` is ignored when comparing rows.
df2 = df.drop_duplicates(subset=['c2', 'c3'], keep='first')
print(df2)

  c1  c2  c3
0  a   1   1
2  b   1   2
3  a   2   2


In [28]:
# Read the auto-mpg CSV, which has no header row, then attach column names.
df = pd.read_csv('./datasets/auto-mpg.csv', header=None)
df.columns = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model year',
    'origin',
    'name',
]
print(df.head())

    mpg  cylinders  displacement horsepower  ...  acceleration  model year  \
0  18.0          8         307.0      130.0  ...          12.0          70   
1  15.0          8         350.0      165.0  ...          11.5          70   
2  18.0          8         318.0      150.0  ...          11.0          70   
3  16.0          8         304.0      150.0  ...          12.0          70   
4  17.0          8         302.0      140.0  ...          10.5          70   

   origin                       name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  

[5 rows x 9 columns]


In [29]:
# Derive a `kpl` column by scaling `mpg` with a fixed conversion factor.
# NOTE(review): 0.425144 presumably converts miles per US gallon to
# kilometres per litre — confirm the unit.
mpg_to_kpl = 0.425144
df['kpl'] = df['mpg'].mul(mpg_to_kpl)
print(df.head(20))

     mpg  cylinders  displacement horsepower  ...  model year  origin  \
0   18.0          8         307.0      130.0  ...          70       1   
1   15.0          8         350.0      165.0  ...          70       1   
2   18.0          8         318.0      150.0  ...          70       1   
3   16.0          8         304.0      150.0  ...          70       1   
4   17.0          8         302.0      140.0  ...          70       1   
5   15.0          8         429.0      198.0  ...          70       1   
6   14.0          8         454.0      220.0  ...          70       1   
7   14.0          8         440.0      215.0  ...          70       1   
8   14.0          8         455.0      225.0  ...          70       1   
9   15.0          8         390.0      190.0  ...          70       1   
10  15.0          8         383.0      170.0  ...          70       1   
11  14.0          8         340.0      160.0  ...          70       1   
12  15.0          8         400.0      150.0  ...  

In [30]:
# Round the derived `kpl` values to two decimal places.
df['kpl'] = df['kpl'].round(decimals=2)
print(df.head())

    mpg  cylinders  displacement horsepower  ...  model year  origin  \
0  18.0          8         307.0      130.0  ...          70       1   
1  15.0          8         350.0      165.0  ...          70       1   
2  18.0          8         318.0      150.0  ...          70       1   
3  16.0          8         304.0      150.0  ...          70       1   
4  17.0          8         302.0      140.0  ...          70       1   

                        name   kpl  
0  chevrolet chevelle malibu  7.65  
1          buick skylark 320  6.38  
2         plymouth satellite  7.65  
3              amc rebel sst  6.80  
4                ford torino  7.23  

[5 rows x 10 columns]


In [31]:
# Inspect column dtypes and non-null counts of the auto-mpg frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
 9   kpl           398 non-null    float64
dtypes: float64(5), int64(3), object(2)
memory usage: 31.2+ KB


In [32]:
# List the distinct raw `horsepower` values to look for non-numeric placeholders.
print(df.loc[:, 'horsepower'].unique())

['130.0' '165.0' '150.0' '140.0' '198.0' '220.0' '215.0' '225.0' '190.0'
 '170.0' '160.0' '95.00' '97.00' '85.00' '88.00' '46.00' '87.00' '90.00'
 '113.0' '200.0' '210.0' '193.0' '?' '100.0' '105.0' '175.0' '153.0'
 '180.0' '110.0' '72.00' '86.00' '70.00' '76.00' '65.00' '69.00' '60.00'
 '80.00' '54.00' '208.0' '155.0' '112.0' '92.00' '145.0' '137.0' '158.0'
 '167.0' '94.00' '107.0' '230.0' '49.00' '75.00' '91.00' '122.0' '67.00'
 '83.00' '78.00' '52.00' '61.00' '93.00' '148.0' '129.0' '96.00' '71.00'
 '98.00' '115.0' '53.00' '81.00' '79.00' '120.0' '152.0' '102.0' '108.0'
 '68.00' '58.00' '149.0' '89.00' '63.00' '48.00' '66.00' '139.0' '103.0'
 '125.0' '133.0' '138.0' '135.0' '142.0' '77.00' '62.00' '132.0' '84.00'
 '64.00' '74.00' '116.0' '82.00']


In [33]:
# Clean `horsepower`: the column is read as strings and uses '?' for unknowns.
# 1) Turn the '?' placeholder into NaN. The result is assigned back rather
#    than using replace(inplace=True) on the `df['horsepower']` selection,
#    which mutates an intermediate Series and is not guaranteed to reach `df`.
df['horsepower'] = df['horsepower'].replace('?', np.nan)
# 2) Drop the rows whose horsepower is unknown.
df.dropna(subset=['horsepower'], axis=0, inplace=True)
# 3) Convert the remaining numeric strings to floats.
df['horsepower'] = df['horsepower'].astype('float')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    float64
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   name          392 non-null    object 
 9   kpl           392 non-null    float64
dtypes: float64(6), int64(3), object(1)
memory usage: 33.7+ KB


In [34]:
# Distinct `horsepower` values after cleaning.
print(df.loc[:, 'horsepower'].unique())

[130. 165. 150. 140. 198. 220. 215. 225. 190. 170. 160.  95.  97.  85.
  88.  46.  87.  90. 113. 200. 210. 193. 100. 105. 175. 153. 180. 110.
  72.  86.  70.  76.  65.  69.  60.  80.  54. 208. 155. 112.  92. 145.
 137. 158. 167.  94. 107. 230.  49.  75.  91. 122.  67.  83.  78.  52.
  61.  93. 148. 129.  96.  71.  98. 115.  53.  81.  79. 120. 152. 102.
 108.  68.  58. 149.  89.  63.  48.  66. 139. 103. 125. 133. 138. 135.
 142.  77.  62. 132.  84.  64.  74. 116.  82.]


In [35]:
# Distinct codes stored in the `origin` column.
print(df.loc[:, 'origin'].unique())

[1 3 2]


In [37]:
# Translate the integer origin codes into readable labels.
# The mapped column is assigned back instead of calling
# replace(inplace=True) on the `df['origin']` selection, which acts on an
# intermediate Series and is not guaranteed to update `df`.
# Re-running the cell is harmless: labels that are already strings match no key.
df['origin'] = df['origin'].replace({1: 'USA', 2: 'EU', 3: 'JP'})
print(df['origin'].unique())
print(df['origin'])
print(df['origin'].value_counts())

['USA' 'JP' 'EU']
0      USA
1      USA
2      USA
3      USA
4      USA
      ... 
393    USA
394     EU
395    USA
396    USA
397    USA
Name: origin, Length: 392, dtype: object
USA    245
JP      79
EU      68
Name: origin, dtype: int64


In [38]:
# Current dtype of the `origin` column.
print(df['origin'].dtype)

object


In [39]:
# Convert `origin` to the pandas categorical dtype and show the result.
df['origin'] = df['origin'].astype(dtype='category')
print(df['origin'].dtype)
print(df['origin'])

category
0      USA
1      USA
2      USA
3      USA
4      USA
      ... 
393    USA
394     EU
395    USA
396    USA
397    USA
Name: origin, Length: 392, dtype: category
Categories (3, object): ['EU', 'JP', 'USA']


In [40]:
# Convert `origin` back from categorical to plain strings and show the result.
df['origin'] = df['origin'].astype(dtype='str')
print(df['origin'].dtype)
print(df['origin'])

object
0      USA
1      USA
2      USA
3      USA
4      USA
      ... 
393    USA
394     EU
395    USA
396    USA
397    USA
Name: origin, Length: 392, dtype: object


In [42]:
# Split the horsepower range into 3 equal-width bins: `count` holds the number
# of rows per bin, `bin_dividers` the 4 bin edges.
count, bin_dividers = np.histogram(a=df['horsepower'], bins=3)
print(count, bin_dividers, sep='\n')

[257 103  32]
[ 46.         107.33333333 168.66666667 230.        ]


In [50]:
# Labels for the three horsepower bins, in ascending order
# (Korean for low / medium / high output).
bin_names = ['저출력', '보통출력', '고출력']
# Assign each row to a bin using the precomputed edges; include_lowest=True
# makes the first interval closed on the left so the minimum value is binned too.
df['hp_bin'] = pd.cut(
    x=df['horsepower'],
    bins=bin_dividers,
    include_lowest=True,
    labels=bin_names,
)
print(df[['horsepower', 'hp_bin']].head(n=20))

    horsepower    hp_bin
0        130.0  보통출력
1        165.0  보통출력
2        150.0  보통출력
3        150.0  보통출력
4        140.0  보통출력
5        198.0    고출력
6        220.0    고출력
7        215.0    고출력
8        225.0    고출력
9        190.0    고출력
10       170.0    고출력
11       160.0  보통출력
12       150.0  보통출력
13       225.0    고출력
14        95.0    저출력
15        95.0    저출력
16        97.0    저출력
17        85.0    저출력
18        88.0    저출력
19        46.0    저출력


In [44]:
# Re-inspect the schema of the frame after the binning step.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           392 non-null    float64 
 1   cylinders     392 non-null    int64   
 2   displacement  392 non-null    float64 
 3   horsepower    392 non-null    float64 
 4   weight        392 non-null    float64 
 5   acceleration  392 non-null    float64 
 6   model year    392 non-null    int64   
 7   origin        392 non-null    object  
 8   name          392 non-null    object  
 9   kpl           392 non-null    float64 
 10  hp_bin        392 non-null    category
dtypes: category(1), float64(6), int64(2), object(2)
memory usage: 34.2+ KB


In [53]:
# Instantiate the scikit-learn encoders used in the following cells.
label_encoder, onehot_encoder = (
    preprocessing.LabelEncoder(),
    preprocessing.OneHotEncoder(),
)


In [54]:
# Integer-encode the bin labels of the first 15 rows. `classes_` lists the
# fitted labels; each encoded value is an index into it.
hp_bin_labeled = label_encoder.fit_transform(
    df['hp_bin'].head(15)
)
print(hp_bin_labeled, type(hp_bin_labeled), label_encoder.classes_, sep='\n')

[1 1 1 1 1 0 0 0 0 0 0 1 1 0 2]
<class 'numpy.ndarray'>
['고출력' '보통출력' '저출력']


In [55]:
# Shape of the encoded label array.
print(np.shape(hp_bin_labeled))

(15,)


In [67]:
# Reshape the labels into a single column (n_samples, 1), the 2-D layout
# passed to the one-hot encoder in the next cell.
hp_bin_labeled = np.reshape(hp_bin_labeled, (-1, 1))
print(hp_bin_labeled, hp_bin_labeled.shape, sep='\n')

[[1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [2]]
(15, 1)


In [68]:
# One-hot encode the integer labels: learn the categories, then transform.
# NOTE(review): `onehot_pitted` looks like a typo for `onehot_fitted`; the
# name is kept in case other cells read it.
onehot_pitted = onehot_encoder.fit(hp_bin_labeled).transform(hp_bin_labeled)
print(onehot_pitted)

  (0, 1)	1.0
  (1, 1)	1.0
  (2, 1)	1.0
  (3, 1)	1.0
  (4, 1)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 1)	1.0
  (12, 1)	1.0
  (13, 0)	1.0
  (14, 2)	1.0


In [71]:
# One-hot encode with pandas: the categorical `hp_bin` column is expanded
# into one indicator column per label, while numeric `horsepower` passes through.
df1 = df[['horsepower', 'hp_bin']]
df2 = pd.get_dummies(data=df1)
df2

Unnamed: 0,horsepower,hp_bin_저출력,hp_bin_보통출력,hp_bin_고출력
0,130.0,0,1,0
1,165.0,0,1,0
2,150.0,0,1,0
3,150.0,0,1,0
4,140.0,0,1,0
...,...,...,...,...
393,86.0,1,0,0
394,52.0,1,0,0
395,84.0,1,0,0
396,79.0,1,0,0


In [72]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler

In [83]:
# Sample values for the min-max scaling walkthrough, plus their bounds.
# NOTE(review): `min` and `max` shadow the Python builtins for the rest of the
# session; the names are kept because a later cell reads them.
data = np.array([4, 5, 6, 7, 8])
min, max = 4, 8

In [84]:
# Step 1 of manual min-max scaling: subtract 4 (the smallest value in `data`)
# so the values start at 0.
data1 = np.subtract(data, 4)
print(data1)

[0 1 2 3 4]


In [85]:
# Step 2 of manual min-max scaling: divide the shifted values by the range
# (8 - 4 = 4) so they span [0, 1].
# Recomputed from `data` rather than from `data1` itself, so re-running this
# cell gives the same result instead of dividing by 4 again.
data1 = (data - 4) / 4
print(data1)

[0.   0.25 0.5  0.75 1.  ]


In [86]:
# Min-max scaling in one expression: (x - min) / (max - min).
# The bounds are taken from `data` itself rather than from globals named
# `min`/`max`: those names shadow the builtins, and if their defining cell has
# not run the expression would try to subtract builtin functions.
# NOTE: a constant `data` (max == min) would divide by zero.
data_low, data_high = data.min(), data.max()
data2 = (data - data_low) / (data_high - data_low)
print(data2)

[0.   0.25 0.5  0.75 1.  ]


In [87]:
# The same scaling with scikit-learn's MinMaxScaler, which is given the
# values as a single column of shape (n_samples, 1).
minmaxscaler = MinMaxScaler()
data3 = minmaxscaler.fit_transform(np.reshape(data, (-1, 1)))
print(data3)

[[0.  ]
 [0.25]
 [0.5 ]
 [0.75]
 [1.  ]]
