# 라이브러리 로드

In [224]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# 한글 깨짐 방지
from matplotlib import font_manager, rc 
import matplotlib.font_manager as fm

# Use the first installed D2Coding TTF as matplotlib's font so Korean text renders.
font_path = next(
    (path for path in fm.findSystemFonts(fontpaths=None, fontext='ttf') if 'D2Coding' in path),
    None,
)
if font_path is not None:
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font)

# 데이터 로드 및 데이터 확인

In [225]:
DATA_PATH = "../../Data/titanic/"

# Read the Titanic training CSV, then show its shape and column names.
df = pd.read_csv(f"{DATA_PATH}train.csv")

df.shape, df.columns

((891, 12),
 Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'))

In [226]:
df.columns = df.columns.str.lower()  # normalise column names to lowercase
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [227]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [228]:
df.describe(include="all")

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [229]:
df. head(), df.tail()

(   passengerid  survived  pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 name     sex   age  sibsp  \
 0                            Braund, Mr. Owen Harris    male  22.0      1   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                             Heikkinen, Miss. Laina  female  26.0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                           Allen, Mr. William Henry    male  35.0      0   
 
    parch            ticket     fare cabin embarked  
 0      0         A/5 21171   7.2500   NaN        S  
 1      0          PC 17599  71.2833   C85        C  
 2      0  STON/O2. 3101282   7.9250   NaN        S  
 3      0            113803  53.1000  C123        S  
 4      0            373450   8.0500

## 타겟 데이터 확인

In [230]:
# Relabel the 0/1 target as Died/Survived and summarise counts and frequencies.
new_survived = pd.Categorical(df['survived']).rename_categories(['Died', 'Survived'])

new_survived.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,549,0.616162
Survived,342,0.383838


## 데이터 분리

In [231]:
from sklearn.model_selection import train_test_split
SEED = 42

# Hold out 20% of the rows as the test split, then renumber both splits from 0.
X_tr, X_te = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=SEED)
)

In [232]:
X_tr.shape, X_te.shape, X_tr.columns

((712, 12),
 (179, 12),
 Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
        'parch', 'ticket', 'fare', 'cabin', 'embarked'],
       dtype='object'))

In [233]:
# Same Died/Survived summary as before, restricted to the training split.
new_survived = pd.Categorical(X_tr['survived']).rename_categories(["Died", "Survived"])
print(new_survived[:5])
new_survived.describe()

['Died', 'Died', 'Died', 'Died', 'Died']
Categories (2, object): ['Died', 'Survived']


Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,444,0.623596
Survived,268,0.376404


# Data Cleaning

### 필요 없는 데이터

In [234]:
X_tr['passengerid'].nunique(), X_tr.shape[0]

(712, 712)

In [235]:
# passengerid is unique for every training row, so it is dropped from both splits.
# Rebinding with errors='ignore' keeps this cell safe to re-run: the previous
# in-place drop raised KeyError once the column was already gone.
X_tr = X_tr.drop(columns='passengerid', errors='ignore')
X_te = X_te.drop(columns='passengerid', errors='ignore')

### 결측치 처리

In [236]:
(X_tr.isnull().sum() / X_tr.shape[0]).round(4).sort_values(ascending=False) # fraction of missing values per column, largest first

cabin       0.7767
age         0.1966
embarked    0.0028
survived    0.0000
pclass      0.0000
name        0.0000
sex         0.0000
sibsp       0.0000
parch       0.0000
ticket      0.0000
fare        0.0000
dtype: float64

In [237]:
print(f'before: {X_tr.shape} / isnull().sum(): {X_tr.isnull().sum().sum()}')
# cabin is ~78% missing in the training split, so the whole column is dropped
# (the row-wise alternative would be X_tr.dropna(axis=0)).
# errors='ignore' lets the cell be re-run after cabin has already been removed.
X_tr = X_tr.drop(columns='cabin', errors='ignore')
X_te = X_te.drop(columns='cabin', errors='ignore')
print(f'after: {X_tr.shape} / isnull().sum(): {X_tr.isnull().sum().sum()}')

before: (712, 11) / isnull().sum(): 695
after: (712, 10) / isnull().sum(): 142


In [238]:
# Compute the imputation value once, from the training split only, and reuse it
# for both splits (same pattern as the embarked imputation). Previously the test
# split was filled with a median recomputed from the already-imputed train column.
age_median = X_tr['age'].median()

X_tr['age'] = X_tr['age'].fillna(age_median)
X_te['age'] = X_te['age'].fillna(age_median)

In [239]:
# Most frequent embarkation port in the training split; used to fill both splits.
embarked_mode = X_tr['embarked'].mode().iloc[0]

for frame in (X_tr, X_te):
    frame['embarked'] = frame['embarked'].fillna(embarked_mode)

In [240]:
X_tr.isnull().sum().sum(), X_te.isnull().sum().sum()

(0, 0)

# Feature Extraction
> 기존 Feature에 기반하여 새로운 Feature들을 생성

## 데이터 타입

In [241]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  712 non-null    int64  
 1   pclass    712 non-null    int64  
 2   name      712 non-null    object 
 3   sex       712 non-null    object 
 4   age       712 non-null    float64
 5   sibsp     712 non-null    int64  
 6   parch     712 non-null    int64  
 7   ticket    712 non-null    object 
 8   fare      712 non-null    float64
 9   embarked  712 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 55.8+ KB


### 수치형 데이터 타입 변환

In [242]:
df_number = X_tr.select_dtypes(include=np.number)
df_number.columns

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')

In [243]:
df_number.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  712 non-null    int64  
 1   pclass    712 non-null    int64  
 2   age       712 non-null    float64
 3   sibsp     712 non-null    int64  
 4   parch     712 non-null    int64  
 5   fare      712 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 33.5 KB


In [244]:
df_number.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,712.0,712.0,712.0,712.0,712.0,712.0
mean,0.376404,2.330056,29.204129,0.553371,0.379213,32.586276
std,0.484824,0.824584,13.007971,1.176404,0.791669,51.969529
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.925
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,30.5
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [245]:
# survived: 0/1 target, stored as int32
X_tr = X_tr.astype({"survived": "int32"})
X_te = X_te.astype({"survived": "int32"})

In [246]:
# pclass
X_tr['pclass'].unique()

array([1, 2, 3], dtype=int64)

In [247]:
# pclass only takes the values 1, 2 and 3, so store it as a categorical column.
X_tr = X_tr.astype({'pclass': 'category'})
X_te = X_te.astype({'pclass': 'category'})

In [248]:
# age: cast to int32 (fractional ages are truncated, e.g. 0.42 -> 0)
X_tr = X_tr.astype({'age': 'int32'})
X_te = X_te.astype({'age': 'int32'})

In [249]:
# sibsp
X_tr['sibsp'].unique()

array([0, 1, 4, 3, 2, 8, 5], dtype=int64)

In [250]:
# sibsp has only a handful of distinct values, so store it as categorical.
X_tr = X_tr.astype({'sibsp': 'category'})
X_te = X_te.astype({'sibsp': 'category'})

In [251]:
# parch 
X_tr['parch'].unique()

array([0, 2, 1, 6, 4, 3, 5], dtype=int64)

In [252]:
# parch has only a handful of distinct values, so store it as categorical.
X_tr = X_tr.astype({'parch': 'category'})
X_te = X_te.astype({'parch': 'category'})

In [253]:
# fare
X_tr['fare'].unique()

array([ 28.5   ,  13.    ,   7.925 ,   7.8542,  31.275 , 247.5208,
        26.55  ,  27.7208,   7.8958,  35.5   ,  24.15  ,  12.275 ,
         7.0542,   9.5   ,  26.    ,  90.    , 227.525 ,  57.    ,
         6.2375,   8.6625,  26.25  ,   9.5875,   7.2292,  22.3583,
         9.4833, 120.    ,  14.4583,   8.05  , 211.5   ,   7.25  ,
         7.725 ,  25.4667,  21.075 ,  30.    ,  61.3792,  20.2125,
        30.5   ,   7.05  ,  14.5   ,   7.5208, 151.55  ,  21.    ,
       262.375 ,   7.75  ,   7.775 ,  80.    ,   9.8417,  12.35  ,
         0.    ,   7.225 ,   6.4375,  12.475 , 133.65  ,   6.975 ,
        77.9583,  10.5   , 106.425 ,  81.8583,  11.1333,  27.75  ,
       153.4625,   8.3   ,  15.05  , 110.8833,  15.0458,  39.6875,
         7.8792,  23.45  ,   7.65  ,  15.7417,  15.2458,  51.8625,
        15.5   ,  41.5792,  14.4542,  10.5167,  20.525 ,  89.1042,
        36.75  ,  55.4417,  50.    ,  13.8625,  16.7   ,  13.5   ,
        35.    ,  55.9   ,   7.8   ,  34.375 ,  18.    ,  47.1

In [254]:
# fare is continuous; downcast from float64 to float32.
X_tr = X_tr.astype({'fare': 'float32'})
X_te = X_te.astype({'fare': 'float32'})

In [255]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  712 non-null    int32   
 1   pclass    712 non-null    category
 2   name      712 non-null    object  
 3   sex       712 non-null    object  
 4   age       712 non-null    int32   
 5   sibsp     712 non-null    category
 6   parch     712 non-null    category
 7   ticket    712 non-null    object  
 8   fare      712 non-null    float32 
 9   embarked  712 non-null    object  
dtypes: category(3), float32(1), int32(2), object(4)
memory usage: 33.6+ KB


### 범주형 데이터 타입 변경

In [256]:
df_object=X_tr.select_dtypes(include='object')
df_object

Unnamed: 0,name,sex,ticket,embarked
0,"Partner, Mr. Austen",male,113043,S
1,"Berriman, Mr. William John",male,28425,S
2,"Tikkanen, Mr. Juho",male,STON/O 2. 3101293,S
3,"Hansen, Mr. Henrik Juul",male,350025,S
4,"Andersson, Miss. Ebba Iris Alfrida",female,347082,S
...,...,...,...,...
707,"Salkjelsvik, Miss. Anna Kristine",female,343120,S
708,"Cairns, Mr. Alexander",male,113798,S
709,"Hansen, Mr. Claus Peter",male,350026,S
710,"Carter, Miss. Lucile Polk",female,113760,S


In [257]:
df_object.head(), df_object.tail()

(                                 name     sex             ticket embarked
 0                 Partner, Mr. Austen    male             113043        S
 1          Berriman, Mr. William John    male              28425        S
 2                  Tikkanen, Mr. Juho    male  STON/O 2. 3101293        S
 3             Hansen, Mr. Henrik Juul    male             350025        S
 4  Andersson, Miss. Ebba Iris Alfrida  female             347082        S,
                                  name     sex  ticket embarked
 707  Salkjelsvik, Miss. Anna Kristine  female  343120        S
 708             Cairns, Mr. Alexander    male  113798        S
 709           Hansen, Mr. Claus Peter    male  350026        S
 710         Carter, Miss. Lucile Polk  female  113760        S
 711         White, Mr. Richard Frasar    male   35281        S)

In [258]:
df_object.describe()

Unnamed: 0,name,sex,ticket,embarked
count,712,712,712,712
unique,712,2,558,3
top,"Partner, Mr. Austen",male,CA. 2343,S
freq,1,467,7,527


### 문자열

In [259]:
df_object = X_tr.select_dtypes(include='object')
df_object.columns

Index(['name', 'sex', 'ticket', 'embarked'], dtype='object')

#### 공백제거
> 문자열 앞뒤 공백 제거: `strip()`

In [260]:
# Trim leading/trailing whitespace with the vectorised .str accessor. Unlike the
# per-element lambda, it leaves missing values as NaN instead of raising.
X_tr["name"] = X_tr["name"].str.strip()
X_tr["ticket"] = X_tr["ticket"].str.strip()

X_te["name"] = X_te["name"].str.strip()
X_te["ticket"] = X_te["ticket"].str.strip()

#### 문자열 포함 여부

In [261]:
# Honorific as written in the name (trailing dot included) -> Korean label.
dict_designation = {
    'Mr.': '남성',
    'Master.': '남성',
    'Sir.': '남성',
    'Miss.': '미혼 여성',
    'Mrs.': '기혼 여성',
    'Ms.': '미혼/기혼 여성',
    'Lady.': '숙녀',
    'Mlle.': '아가씨',
    # occupations
    'Dr.': '의사',
    'Rev.': '목사',
    'Major.': '계급',
    'Don.': '교수',
    'Col.': '군인',
    'Capt.': '군인',
    # nobility
    'Mme.': '영부인',
    'Countess.': '백작부인',
    'Jonkheer.': '귀족'
}

def add_designation(name):
    """Return the honorific key of ``dict_designation`` that applies to ``name``.

    Names are expected to look like ``"Last, Title. First ..."``. The first
    dot-terminated word after the comma is tried first, so a title that is only
    mentioned later in the name (e.g. ``"... (wife of Mr. X)"``) cannot shadow
    the passenger's own title. If that word is not a known key, the function
    falls back to a substring scan over the keys in dictionary order.

    Args:
        name: Full passenger name string.

    Returns:
        The matching key including its trailing dot (e.g. ``"Mrs."``), or
        ``"unknown"`` when no key is found.
    """
    _, _, after_comma = name.partition(',')
    for word in after_comma.split():
        if word.endswith('.'):
            if word in dict_designation:
                return word
            break  # first dotted word is not a known title; use the fallback scan

    # Fallback: first key (in dictionary order) that occurs anywhere in the name.
    for key in dict_designation:
        if key in name:
            return key
    return "unknown"

# Tag every passenger in both splits with the honorific found in their name.
X_tr['designation'] = X_tr['name'].map(add_designation)
X_te['designation'] = X_te['name'].map(add_designation)

X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.


In [262]:
cond = X_tr['designation'] == "unknown"
X_tr.loc[cond].head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation


In [263]:
cond = X_te['designation'] == "unknown"
X_te.loc[cond].head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation


#### 문자열 분리

In [264]:
# The family name is everything before the first comma. The vectorised .str
# accessor replaces the per-element lambda and passes missing values through as NaN.
X_tr['last_name'] = X_tr['name'].str.split(',', n=1).str[0]
X_te['last_name'] = X_te['name'].str.split(',', n=1).str[0]

In [265]:
def add_ticket_number(ticket):
    """Return the trailing numeric part of a ticket string as an int.

    The ticket is split on single spaces and the last piece is parsed, so
    ``"STON/O 2. 3101293"`` gives ``3101293``. Tickets whose last piece is not
    a number (e.g. ``"LINE"``) and non-string values give ``0``.

    Args:
        ticket: Raw ticket value.

    Returns:
        The parsed ticket number, or ``0`` when it cannot be parsed.
    """
    try:
        return int(ticket.split(' ')[-1])
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: ticket is not a str; ValueError: last piece is
        # not numeric (e.g. "LINE"). Anything else is unexpected and propagates
        # instead of being silently swallowed by a bare except.
        return 0

# Numeric ticket suffix for both splits, stored as int32.
X_tr['ticket_number'] = X_tr['ticket'].map(add_ticket_number).astype("int32")
X_te['ticket_number'] = X_te['ticket'].map(add_ticket_number).astype("int32")

X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,ticket_number
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Partner,113043
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,Berriman,28425
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Tikkanen,3101293
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Hansen,350025
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Andersson,347082


In [266]:
X_tr['ticket'].head()

0               113043
1                28425
2    STON/O 2. 3101293
3               350025
4               347082
Name: ticket, dtype: object

### 집계

#### 피봇 테이블

In [267]:
# Mean fare per passenger class, computed on the training split only.
df_pivot = (
    pd.pivot_table(X_tr, index='pclass', values='fare', aggfunc='mean')
    .reset_index()
    .rename(columns={'fare': 'fare_mean_by_pclass'})
)
df_pivot.head()

Unnamed: 0,pclass,fare_mean_by_pclass
0,1,89.253914
1,2,20.575939
2,3,13.934861


In [268]:
# Attach the per-class mean fare to both splits. how='left' keeps every row of
# the split being merged into. A fare_mean_by_pclass column left over from an
# earlier run is dropped first, so re-running the cell does not create _x/_y
# duplicate columns.
print(f'before: {X_tr.shape}')
X_tr = pd.merge(
    X_tr.drop(columns='fare_mean_by_pclass', errors='ignore'),
    df_pivot, how='left', on='pclass',
)
X_te = pd.merge(
    X_te.drop(columns='fare_mean_by_pclass', errors='ignore'),
    df_pivot, how='left', on='pclass',
)
print(f'after: {X_tr.shape}')
X_tr.head()

beofre: (712, 13)
after: (712, 14)


Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,ticket_number,fare_mean_by_pclass
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Partner,113043,89.253914
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,Berriman,28425,20.575939
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Tikkanen,3101293,13.934861
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Hansen,350025,13.934861
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Andersson,347082,13.934861


#### 그룹

In [269]:
# Per-class survival rate and number of distinct sibsp / parch values (training split).
agg_dict = {'survived': 'mean', 'sibsp': 'nunique', 'parch': 'nunique'}
df_groupby = (
    X_tr.groupby('pclass')
    .agg(agg_dict)
    .reset_index()
    .rename(columns={
        'survived': 'survived_by_pclass',
        'sibsp': 'len_sibsp_by_pclass',
        'parch': 'len_parch_by_pclass',
    })
)
df_groupby

Unnamed: 0,pclass,survived_by_pclass,len_sibsp_by_pclass,len_parch_by_pclass
0,1,0.607362,4,4
1,2,0.483444,4,4
2,3,0.241206,7,7


In [270]:
df['sibsp'].unique()

array([1, 0, 3, 4, 2, 5, 8], dtype=int64)

### 데이터 변환 / 조합

apply(), map() 등 사용

In [271]:
def sub_age(age):
    """Return the age decade as a label, e.g. 45 -> '40대' and 6 -> '0대'."""
    decade_start = (age // 10) * 10
    return f"{decade_start!s}대"
    
# Age-decade label for both splits.
X_tr['sub_age'] = X_tr['age'].map(sub_age)
X_te['sub_age'] = X_te['age'].map(sub_age)
X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,ticket_number,fare_mean_by_pclass,sub_age
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Partner,113043,89.253914,40대
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,Berriman,28425,20.575939,20대
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Tikkanen,3101293,13.934861,30대
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Hansen,350025,13.934861,20대
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Andersson,347082,13.934861,0대


In [272]:
def add_sub_embarked(row):
    """Concatenate a row's embarked, pclass, sibsp and parch values into one string.

    Each value is converted with ``str`` and joined without a separator,
    e.g. embarked 'S', pclass 3, sibsp 4, parch 2 -> 'S342'.
    """
    fields = ('embarked', 'pclass', 'sibsp', 'parch')
    return ''.join(str(row[field]) for field in fields)

# Row-wise combination feature for both splits.
X_tr['sub_embarked'] = X_tr.apply(add_sub_embarked, axis=1)
X_te['sub_embarked'] = X_te.apply(add_sub_embarked, axis=1)
X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,ticket_number,fare_mean_by_pclass,sub_age,sub_embarked
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Partner,113043,89.253914,40대,S100
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,Berriman,28425,20.575939,20대,S200
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Tikkanen,3101293,13.934861,30대,S300
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Hansen,350025,13.934861,20대,S310
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Andersson,347082,13.934861,0대,S342


### 날짜

In [273]:
# DATA_PATH is rebound here: it now points at the data root, not the titanic folder.
DATA_PATH = "../../Data/"

df_cinemaTicket = pd.read_csv(f"{DATA_PATH}cinemaTicket_Ref.csv")
df_cinemaTicket.shape

(142524, 14)

In [274]:
df_cinemaTicket.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142524 entries, 0 to 142523
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   film_code     142524 non-null  int64  
 1   cinema_code   142524 non-null  int64  
 2   total_sales   142524 non-null  int64  
 3   tickets_sold  142524 non-null  int64  
 4   tickets_out   142524 non-null  int64  
 5   show_time     142524 non-null  int64  
 6   occu_perc     142399 non-null  float64
 7   ticket_price  142524 non-null  float64
 8   ticket_use    142524 non-null  int64  
 9   capacity      142399 non-null  float64
 10  date          142524 non-null  object 
 11  month         142524 non-null  int64  
 12  quarter       142524 non-null  int64  
 13  day           142524 non-null  int64  
dtypes: float64(3), int64(10), object(1)
memory usage: 15.2+ MB


#### datetime 적용

In [275]:
df_cinemaTicket['date'] = pd.to_datetime(df_cinemaTicket['date'])
df_cinemaTicket.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142524 entries, 0 to 142523
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   film_code     142524 non-null  int64         
 1   cinema_code   142524 non-null  int64         
 2   total_sales   142524 non-null  int64         
 3   tickets_sold  142524 non-null  int64         
 4   tickets_out   142524 non-null  int64         
 5   show_time     142524 non-null  int64         
 6   occu_perc     142399 non-null  float64       
 7   ticket_price  142524 non-null  float64       
 8   ticket_use    142524 non-null  int64         
 9   capacity      142399 non-null  float64       
 10  date          142524 non-null  datetime64[ns]
 11  month         142524 non-null  int64         
 12  quarter       142524 non-null  int64         
 13  day           142524 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(10)
memory usage: 15.2 MB


In [276]:
df_cinemaTicket['date'][:5]

0   2018-05-05
1   2018-05-05
2   2018-05-05
3   2018-05-05
4   2018-05-05
Name: date, dtype: datetime64[ns]

In [277]:
df_cinemaTicket['date'].dt.year[:5]

0    2018
1    2018
2    2018
3    2018
4    2018
Name: date, dtype: int64

In [278]:
# Tour of the .dt accessors. Only the last expression of a cell is displayed,
# so the output shown for this cell is the dayofyear result.
df_cinemaTicket['date'].dt.month[:5]
df_cinemaTicket['date'].dt.day[:5]
df_cinemaTicket['date'].dt.quarter[:5]
df_cinemaTicket['date'].dt.weekday[:5] # 0-6 : Monday through Sunday
df_cinemaTicket['date'].dt.dayofyear[:5] # ordinal day within the year

0    125
1    125
2    125
3    125
4    125
Name: date, dtype: int64

### 진행바

In [279]:
from tqdm.auto import tqdm
import numpy as np 

In [280]:
# Demo: wrapping an iterable in tqdm shows a progress bar while it is consumed.
# The dead `i=0` initialiser is gone and the loop variable is no longer
# overwritten inside its own loop.
for value in tqdm(np.random.rand(10000000)):
    _ = value ** 2

  0%|          | 0/10000000 [00:00<?, ?it/s]

### with pandas

In [281]:
tqdm.pandas() # 판다스에서 progress_apply 메소드를 사용할수 있게 된다.

In [282]:
import time

def do_apply(x):
    """Return ``x`` unchanged after sleeping for 0.01 seconds."""
    pause_seconds = 0.01
    time.sleep(pause_seconds)
    return x
tmp = df.progress_apply(do_apply,axis = 1)

  0%|          | 0/891 [00:00<?, ?it/s]