In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing # scikit-learn 설치 모듈명

In [2]:
# Load seaborn's example 'titanic' dataset and preview the first five rows.
df = sns.load_dataset('titanic')
titanic_preview = df.head(5)
print(titanic_preview)

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [3]:
# DataFrame.info() writes its report straight to stdout and returns None,
# which is why the captured output ends with a literal "None" line.
info_return = df.info()
print(info_return)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None


## 결측치 처리

In [4]:
# Tally every value of 'deck', keeping NaN as its own bucket so the number of
# missing entries shows up next to the real deck letters.
deck_column = df['deck']
nan_deck = deck_column.value_counts(dropna=False)
for shown in (nan_deck, type(nan_deck)):
    print(shown)

NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: deck, dtype: int64
<class 'pandas.core.series.Series'>


In [5]:
# Boolean mask of missing cells for the first five rows (isna is the same
# method as isnull under its primary name).
print(df.head().isna())

   survived  pclass    sex    age  sibsp  parch   fare  embarked  class  \
0     False   False  False  False  False  False  False     False  False   
1     False   False  False  False  False  False  False     False  False   
2     False   False  False  False  False  False  False     False  False   
3     False   False  False  False  False  False  False     False  False   
4     False   False  False  False  False  False  False     False  False   

     who  adult_male   deck  embark_town  alive  alone  
0  False       False   True        False  False  False  
1  False       False  False        False  False  False  
2  False       False   True        False  False  False  
3  False       False  False        False  False  False  
4  False       False   True        False  False  False  


In [6]:
# Inverse mask: True wherever a value is present in the first five rows
# (notna is the same method as notnull under its primary name).
print(df.head().notna())

   survived  pclass   sex   age  sibsp  parch  fare  embarked  class   who  \
0      True    True  True  True   True   True  True      True   True  True   
1      True    True  True  True   True   True  True      True   True  True   
2      True    True  True  True   True   True  True      True   True  True   
3      True    True  True  True   True   True  True      True   True  True   
4      True    True  True  True   True   True  True      True   True  True   

   adult_male   deck  embark_town  alive  alone  
0        True  False         True   True   True  
1        True   True         True   True   True  
2        True  False         True   True   True  
3        True   True         True   True   True  
4        True  False         True   True   True  


In [7]:
# Count missing values per column: summing the boolean mask down the rows
# (the default axis) gives one total per column.
missing_per_column = df.isna().sum()
print(missing_per_column)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [8]:
# Drop every column that has fewer than 500 non-null values.
# The frame is modified in place, so re-running earlier cells afterwards will
# no longer show the dropped columns.
df.dropna(axis='columns', thresh=500, inplace=True)
remaining_columns = df.columns
print(remaining_columns)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')


In [9]:
# Build a separate frame holding only the rows whose 'age' is present;
# `df` itself is left unchanged.
df_age = df.dropna(axis='index', how='any', subset=['age'])
print(df_age.shape[0])

714


In [10]:
# Impute missing ages with the mean of the observed ages
# (Series.mean skips NaN by default).
mean_age = df['age'].mean()

# Assign the filled column back to the frame instead of calling
# fillna(inplace=True) on the `df['age']` selection. The chained in-place form
# operates on an intermediate object: it emits a FutureWarning on pandas 2.x
# and leaves `df` unchanged once Copy-on-Write is enabled.
df['age'] = df['age'].fillna(mean_age)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  889 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 79.4+ KB
None


In [11]:
# Find the most common embarkation town. value_counts() ignores NaN by
# default, and idxmax() returns the label with the largest count.
town_counts = df['embark_town'].value_counts()
most_freq = town_counts.idxmax()
print(most_freq)

Southampton


In [12]:
# Show rows at positions 825-829; the last one has no embarkation data.
df.iloc[825:830]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
825,0,3,male,29.699118,0,0,6.95,Q,Third,man,True,Queenstown,no,True
826,0,3,male,29.699118,0,0,56.4958,S,Third,man,True,Southampton,no,True
827,1,2,male,1.0,0,2,37.0042,C,Second,child,False,Cherbourg,yes,False
828,1,3,male,29.699118,0,0,7.75,Q,Third,man,True,Queenstown,yes,True
829,1,1,female,62.0,0,0,80.0,,First,woman,False,,yes,True


In [13]:
# Fill missing towns with the most frequent town. fillna returns a new Series
# here, so `df` is not modified by this cell.
embark_town_column = df['embark_town']
df_most_freq = embark_town_column.fillna(value=most_freq)
print(df_most_freq.iloc[825:830])

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829    Southampton
Name: embark_town, dtype: object


In [14]:
# Forward-fill missing towns: each NaN takes the value of the nearest
# preceding row that has one.
# Series.ffill() replaces the deprecated fillna(method='ffill'), and the result
# is assigned back to the frame because an in-place call on the `df[...]`
# selection does not reliably write through to `df` (it warns on pandas 2.x
# and is a no-op under Copy-on-Write).
df['embark_town'] = df['embark_town'].ffill()
print(df['embark_town'][825:830])

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829     Queenstown
Name: embark_town, dtype: object


In [15]:
# Forward-fill the port code column the same way as 'embark_town'.
# Series.ffill() replaces the deprecated fillna(method='ffill'); assigning the
# result back avoids the chained in-place call, which does not reliably update
# `df` on current pandas versions.
df['embarked'] = df['embarked'].ffill()

In [16]:
# Re-check the per-column count of missing values after the imputation steps.
remaining_missing = df.isna().sum()
print(remaining_missing)

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64


## 중복값 처리

In [17]:
# Small hand-built frame for the duplicate-handling examples.
# Rows 0 and 1 are identical; `df` is rebound here and no longer refers to the
# titanic data.
toy_rows = [
    ('a', 1, 1),
    ('a', 1, 1),
    ('b', 1, 2),
    ('a', 2, 2),
    ('b', 2, 2),
]
df = pd.DataFrame(toy_rows, columns=['c1', 'c2', 'c3'])
print(df)

  c1  c2  c3
0  a   1   1
1  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2


In [18]:
# Flag rows that repeat an earlier row across all columns; the first
# occurrence stays False (keep='first' is the default, spelled out here).
df_dup = df.duplicated(keep='first')
print(df_dup)

0    False
1     True
2    False
3    False
4    False
dtype: bool


In [19]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# The original index labels are preserved (no reset).
df2 = df.drop_duplicates(keep='first')
print(df2)

  c1  c2  c3
0  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2


In [20]:
# Treat rows as duplicates when they match on 'c2' and 'c3' only; 'c1' is
# ignored for the comparison.
dedupe_keys = ['c2', 'c3']
df3 = df.drop_duplicates(subset=dedupe_keys)
print(df3)

  c1  c2  c3
0  a   1   1
2  b   1   2
3  a   2   2


## 데이터 형태 변환

In [21]:
# The CSV has no header row, so it is read with header=None and the column
# labels are attached afterwards. `df` is rebound to the auto-mpg data here.
auto_mpg_columns = [
    'mpg', 'cylinders', 'displacement',
    'horsepower', 'weight', 'aceleration',
    'model year', 'origin', 'name',
]
df = pd.read_csv('../datasets/auto-mpg.csv', header=None)
df.columns = auto_mpg_columns
print(df.head())

    mpg  cylinders  displacement horsepower  weight  aceleration  model year  \
0  18.0          8         307.0      130.0  3504.0         12.0          70   
1  15.0          8         350.0      165.0  3693.0         11.5          70   
2  18.0          8         318.0      150.0  3436.0         11.0          70   
3  16.0          8         304.0      150.0  3433.0         12.0          70   
4  17.0          8         302.0      140.0  3449.0         10.5          70   

   origin                       name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  


In [22]:
# Conversion factor from miles per gallon to kilometres per litre
# (0.42514 is about 1.609344 km / 3.785412 L, i.e. a US gallon).
mpg_to_kpl_rate = 0.42514

# Derive the 'kpl' column from 'mpg' and preview the first 30 rows.
df['kpl'] = df['mpg'].mul(mpg_to_kpl_rate)
print(df.head(n=30))

     mpg  cylinders  displacement horsepower  weight  aceleration  model year  \
0   18.0          8         307.0      130.0  3504.0         12.0          70   
1   15.0          8         350.0      165.0  3693.0         11.5          70   
2   18.0          8         318.0      150.0  3436.0         11.0          70   
3   16.0          8         304.0      150.0  3433.0         12.0          70   
4   17.0          8         302.0      140.0  3449.0         10.5          70   
5   15.0          8         429.0      198.0  4341.0         10.0          70   
6   14.0          8         454.0      220.0  4354.0          9.0          70   
7   14.0          8         440.0      215.0  4312.0          8.5          70   
8   14.0          8         455.0      225.0  4425.0         10.0          70   
9   15.0          8         390.0      190.0  3850.0          8.5          70   
10  15.0          8         383.0      170.0  3563.0         10.0          70   
11  14.0          8         

In [23]:
# Round 'kpl' to two decimal places and store it back in the same column.
kpl_rounded = df['kpl'].round(decimals=2)
df['kpl'] = kpl_rounded
print(df.head())

    mpg  cylinders  displacement horsepower  weight  aceleration  model year  \
0  18.0          8         307.0      130.0  3504.0         12.0          70   
1  15.0          8         350.0      165.0  3693.0         11.5          70   
2  18.0          8         318.0      150.0  3436.0         11.0          70   
3  16.0          8         304.0      150.0  3433.0         12.0          70   
4  17.0          8         302.0      140.0  3449.0         10.5          70   

   origin                       name   kpl  
0       1  chevrolet chevelle malibu  7.65  
1       1          buick skylark 320  6.38  
2       1         plymouth satellite  7.65  
3       1              amc rebel sst  6.80  
4       1                ford torino  7.23  


In [24]:
# Inspect dtypes: 'horsepower' shows up as object even though it should be
# numeric. info() prints its report itself and returns None, hence the
# trailing "None" in the output.
info_return = df.info()
print(info_return)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   aceleration   398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
 9   kpl           398 non-null    float64
dtypes: float64(5), int64(3), object(2)
memory usage: 31.2+ KB
None


In [25]:
# List the distinct raw 'horsepower' values to find what blocks numeric
# parsing (the output contains a '?' placeholder).
horsepower_values = df['horsepower'].unique()
print(horsepower_values)

['130.0' '165.0' '150.0' '140.0' '198.0' '220.0' '215.0' '225.0' '190.0'
 '170.0' '160.0' '95.00' '97.00' '85.00' '88.00' '46.00' '87.00' '90.00'
 '113.0' '200.0' '210.0' '193.0' '?' '100.0' '105.0' '175.0' '153.0'
 '180.0' '110.0' '72.00' '86.00' '70.00' '76.00' '65.00' '69.00' '60.00'
 '80.00' '54.00' '208.0' '155.0' '112.0' '92.00' '145.0' '137.0' '158.0'
 '167.0' '94.00' '107.0' '230.0' '49.00' '75.00' '91.00' '122.0' '67.00'
 '83.00' '78.00' '52.00' '61.00' '93.00' '148.0' '129.0' '96.00' '71.00'
 '98.00' '115.0' '53.00' '81.00' '79.00' '120.0' '152.0' '102.0' '108.0'
 '68.00' '58.00' '149.0' '89.00' '63.00' '48.00' '66.00' '139.0' '103.0'
 '125.0' '133.0' '138.0' '135.0' '142.0' '77.00' '62.00' '132.0' '84.00'
 '64.00' '74.00' '116.0' '82.00']


In [26]:
# Frequency of each raw 'horsepower' value, ordered by the value itself.
# The values are still strings at this point, so the ordering is lexical.
horsepower_counts = df['horsepower'].value_counts()
print(horsepower_counts.sort_index())

100.0    17
102.0     1
103.0     1
105.0    12
107.0     1
         ..
95.00    14
96.00     3
97.00     9
98.00     2
?         6
Name: horsepower, Length: 94, dtype: int64


In [27]:
# Turn the '?' placeholder into a real missing value.
# The result is assigned back to the column: calling replace(inplace=True) on
# the `df['horsepower']` selection is a chained in-place update, which warns on
# pandas 2.x and does not modify `df` under Copy-on-Write.
df['horsepower'] = df['horsepower'].replace('?', np.nan)

# Drop the rows that have no horsepower reading. This is an in-place call on
# the frame itself (not on a selection), so it does update `df`.
df.dropna(subset=['horsepower'], axis=0, inplace=True)

# With the placeholder gone, the remaining strings convert cleanly to float.
df['horsepower'] = df['horsepower'].astype('float')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    float64
 5   aceleration   392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   name          392 non-null    object 
 9   kpl           392 non-null    float64
dtypes: float64(6), int64(3), object(1)
memory usage: 33.7+ KB
None


In [28]:
# Show the raw integer codes stored in 'origin'.
origin_column = df.loc[:, 'origin']
print(origin_column)

0      1
1      1
2      1
3      1
4      1
      ..
393    1
394    2
395    1
396    1
397    1
Name: origin, Length: 392, dtype: int64


In [29]:
# Distinct origin codes, in order of first appearance.
origin_codes = df['origin'].unique()
print(origin_codes)

[1 3 2]


In [30]:
# Map the integer origin codes to readable region labels.
origin_labels = {1: 'USA', 2: 'EU', 3: 'JP'}

# Assign the result back instead of calling replace(inplace=True) on the
# `df['origin']` selection: the chained in-place form warns on pandas 2.x and
# does not modify `df` under Copy-on-Write.
# replace() leaves values that are not dictionary keys untouched, so re-running
# this cell on already-labelled data is harmless.
df['origin'] = df['origin'].replace(origin_labels)
print(df['origin'].unique())
print(df['origin'].dtypes)
print(df['origin'])

['USA' 'JP' 'EU']
object
0      USA
1      USA
2      USA
3      USA
4      USA
      ... 
393    USA
394     EU
395    USA
396    USA
397    USA
Name: origin, Length: 392, dtype: object


In [31]:
# Convert the string labels to pandas' categorical dtype and confirm the
# conversion by printing the dtype and the column.
origin_categorical = df['origin'].astype('category')
df['origin'] = origin_categorical
print(df['origin'].dtype)
print(df['origin'])

category
0      USA
1      USA
2      USA
3      USA
4      USA
      ... 
393    USA
394     EU
395    USA
396    USA
397    USA
Name: origin, Length: 392, dtype: category
Categories (3, object): ['EU', 'JP', 'USA']


In [32]:
# Convert the categorical column back to plain strings and confirm the dtype.
origin_strings = df['origin'].astype(str)
df['origin'] = origin_strings
print(df['origin'].dtype)
print(df['origin'])

object
0      USA
1      USA
2      USA
3      USA
4      USA
      ... 
393    USA
394     EU
395    USA
396    USA
397    USA
Name: origin, Length: 392, dtype: object


In [33]:
# Split the horsepower range into three equal-width bins.
# np.histogram returns the per-bin counts and the four bin edges.
horsepower = df['horsepower']
count, bin_dividers = np.histogram(horsepower, bins=3)

# Print counts, edges, then the observed min and max; the outer edges should
# equal the min and max.
for shown in (count, bin_dividers, horsepower.min(), horsepower.max()):
    print(shown)

[257 103  32]
[ 46.         107.33333333 168.66666667 230.        ]
46.0
230.0


In [34]:
# Labels for the three horsepower bins, ordered from lowest to highest range.
bin_names = ['저출력', '중간출력', '고출력']

# Assign each car to a bin using the edges computed by np.histogram.
# include_lowest=True makes the first interval closed on the left so that the
# minimum horsepower value is not left unbinned.
cut_options = {
    'bins': bin_dividers,
    'labels': bin_names,
    'include_lowest': True,
}
df['hp_bin'] = pd.cut(df['horsepower'], **cut_options)

hp_with_bin = df[['horsepower', 'hp_bin']]
print(hp_with_bin.head(20))
print(df['hp_bin'].dtypes)

    horsepower hp_bin
0        130.0   중간출력
1        165.0   중간출력
2        150.0   중간출력
3        150.0   중간출력
4        140.0   중간출력
5        198.0    고출력
6        220.0    고출력
7        215.0    고출력
8        225.0    고출력
9        190.0    고출력
10       170.0    고출력
11       160.0   중간출력
12       150.0   중간출력
13       225.0    고출력
14        95.0    저출력
15        95.0    저출력
16        97.0    저출력
17        85.0    저출력
18        88.0    저출력
19        46.0    저출력
category


In [35]:
# Create the two scikit-learn encoders used below: one to turn category labels
# into integer codes, one to expand integer codes into one-hot columns.
label_encoder, onehot_encoder = (
    preprocessing.LabelEncoder(),
    preprocessing.OneHotEncoder(),
)

In [36]:
# Integer-encode the bin labels of the first 15 rows only.
# The encoder is fitted on this sample, so the codes reflect just the labels
# that occur in it.
hp_bin_sample = df['hp_bin'].head(15)
onehot_labeled = label_encoder.fit_transform(hp_bin_sample)
for shown in (onehot_labeled, type(onehot_labeled), onehot_labeled.shape):
    print(shown)

[2 2 2 2 2 0 0 0 0 0 0 2 2 0 1]
<class 'numpy.ndarray'>
(15,)


In [37]:
# Reshape the 1-D code vector into a single-column 2-D array, which is the
# input shape passed to the one-hot encoder in the next cell.
onehot_reshaped = np.reshape(onehot_labeled, (-1, 1))
for shown in (onehot_reshaped, type(onehot_reshaped), onehot_reshaped.shape):
    print(shown)

[[2]
 [2]
 [2]
 [2]
 [2]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [2]
 [2]
 [0]
 [1]]
<class 'numpy.ndarray'>
(15, 1)


In [38]:
# One-hot encode the integer codes. The printed result lists only the non-zero
# entries as (row, column) value triples.
codes_column = onehot_reshaped
onehot_fitted = onehot_encoder.fit_transform(codes_column)
print(onehot_fitted)

  (0, 2)	1.0
  (1, 2)	1.0
  (2, 2)	1.0
  (3, 2)	1.0
  (4, 2)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 2)	1.0
  (12, 2)	1.0
  (13, 0)	1.0
  (14, 1)	1.0


In [44]:
# Min-max scaling done by hand on a tiny example array.
data = np.array([4, 5, 6, 7, 8])
df_data = pd.DataFrame(data)

# The frame has a single column labelled 0, so [0] picks that column's
# minimum / maximum out of the per-column results.
column_minima, column_maxima = df_data.min(), df_data.max()
min_value = column_minima[0]
max_value = column_maxima[0]
for shown in (min_value, max_value):
    print(shown)

# Step 1: shift so the smallest value becomes 0.
shifted = data - min_value
scaled_data = shifted
print(scaled_data)

# Step 2: divide by the value range so the largest value becomes 1.
value_range = max_value - min_value
scaled_data = shifted / value_range
print(scaled_data)

4
8
[0 1 2 3 4]
[0.   0.25 0.5  0.75 1.  ]


In [56]:
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
from sklearn.preprocessing import *

In [55]:
# Same scaling as the manual version, done with scikit-learn's MinMaxScaler.
# The 1-D array is reshaped into a single column before being passed in.
minmaxscaler = MinMaxScaler()
data_as_column = data.reshape(-1, 1)
minmaxscaled_data = minmaxscaler.fit_transform(data_as_column)
print(minmaxscaled_data)

[[0.  ]
 [0.25]
 [0.5 ]
 [0.75]
 [1.  ]]


In [50]:
# Summary statistics of the cleaned horsepower column before scaling.
horsepower_summary = df['horsepower'].describe()
print(horsepower_summary)

count    392.000000
mean     104.469388
std       38.491160
min       46.000000
25%       75.000000
50%       93.500000
75%      126.000000
max      230.000000
Name: horsepower, dtype: float64


In [51]:
# Shift horsepower so that its minimum becomes 0 (first half of min-max
# scaling).
horsepower_floor = df['horsepower'].min()
min_0 = df['horsepower'].sub(horsepower_floor)
print(min_0.describe())

count    392.000000
mean      58.469388
std       38.491160
min        0.000000
25%       29.000000
50%       47.500000
75%       80.000000
max      184.000000
Name: horsepower, dtype: float64


In [53]:
# Divide the shifted values by their maximum (the original range) so the
# column spans 0 to 1, then show summary statistics and the first rows.
shifted_peak = min_0.max()
horsepower_minmaxscaled = min_0.div(shifted_peak)
for shown in (horsepower_minmaxscaled.describe(), horsepower_minmaxscaled.head()):
    print(shown)

count    392.000000
mean       0.317768
std        0.209191
min        0.000000
25%        0.157609
50%        0.258152
75%        0.434783
max        1.000000
Name: horsepower, dtype: float64
0    0.456522
1    0.646739
2    0.565217
3    0.565217
4    0.510870
Name: horsepower, dtype: float64
