# Base Setting

## 라이브러리

In [None]:
# Library import
import pandas as pd

import numpy as np
from numpy import linalg

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib.patches as mpatches
from matplotlib import cm
from matplotlib.ticker import LinearLocator

import seaborn as sns

import math

from scipy import stats

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.graphics.factorplots import interaction_plot

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import datasets


import missingno as msno
import os
import urllib
import re
import random
from openpyxl import load_workbook

from datetime import datetime


import sympy
# Needed so that math expressions are rendered as LaTeX (via MathJax) in Jupyter notebooks
sympy.init_printing(use_latex='mathjax')


# from vega_datasets import data
# from tabulate import tabulate

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline



## 구글드라이브

In [None]:
# Mount Google Drive at /content/drive (relies on the google.colab package)
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Print the current working directory
print(os.getcwd())

# List the current directory to confirm the mounted Google Drive is visible
!ls

# Change into the shared drive (left disabled).
# Original author's note: "My Drive" is treated as a local rather than a remote
# server, so the command is run without a leading "!".
# cd /content/drive/Shareddrives/coding/

/content
drive  sample_data


In [None]:
# Common prefix shared by the project directories on the shared drive.
# NOTE(review): the prefix ends in "Colab_git_" with no trailing slash, so the
# directories below resolve to ".../Colab_git_data/" and
# ".../Colab_git_results/" — confirm that this is the intended layout.
path = '/content/drive/Shareddrives/coding/Colab_git_'

# Data directory and results directory, both derived from the prefix above
path_data, path_results = (path + suffix for suffix in ('data/', 'results/'))

## 기타설정

In [None]:
# 그래프 한글폰트 개선
# plt.rcParams['font.family'] ='Malgun Gothic'
# plt.rcParams['axes.unicode_minus'] =False
# plt.rc('font', family='NanumGothic')

In [None]:
# 그래프 한글폰트 개선
# !apt -qq -y install fonts-nanum > /dev/null
 
# fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
# font = fm.FontProperties(fname=fontpath, size=10)
# fm._rebuild()
 
# # 그래프에 retina display 적용
# %config InlineBackend.figure_format = 'retina'
 
# # Colab 의 한글 폰트 설정
# plt.rc('font', family='NanumBarunGothic') 

In [None]:
# pandas float display format
# None clears any custom formatter so floats are shown with pandas' default formatting
pd.set_option('display.float_format', None)
# Alternatives (disabled):
# pd.options.display.float_format = '{:.2f}'.format      # two decimal places
# pd.options.display.float_format = '{:,.2f}'.format     # thousands separator, two decimal places

In [None]:
# 그래프 스타일 설정
# plt.style.use('seaborn')
# plt.style.use('default')
# sns.set(font_scale=1)

# 1. EDA


In [None]:
# Load the red and white wine-quality tables from the data directory.
# Both CSV files use ';' as the field separator.
df_red = pd.read_csv(f'{path_data}winequality-red.csv', sep=';')
df_white = pd.read_csv(f'{path_data}winequality-white.csv', sep=';')

In [None]:
# Show the (rows, columns) shape of each dataset, one per line:
# red wine first, then white wine
print(df_red.shape, df_white.shape, sep='\n')

(1599, 12)
(4898, 12)


In [None]:
# Count the duplicated rows in the red wine dataset
df_red.duplicated().sum()

240

In [None]:
# Remove duplicated rows, keeping the first occurrence of each.
# The original index labels are kept at this point, so gaps can appear.
df_red = df_red.drop_duplicates(keep='first')
df_red.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur-dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5


In [None]:
# Renumber the index from 0 after the duplicate removal; drop=True discards the
# old index instead of keeping it as a column
df_red = df_red.reset_index(drop=True)
df_red.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur-dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5


In [None]:
# Count the duplicated rows in the white wine dataset
df_white.duplicated().sum()

937

In [None]:
# Remove duplicated rows, then renumber the index from 0
# (drop=True discards the old index instead of keeping it as a column)
df_white = df_white.drop_duplicates().reset_index(drop=True)
df_white.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,6


In [None]:
# Label array for the red wine DataFrame: the string 'red' once per row
red_color = np.full(len(df_red), 'red')
red_color

array(['red', 'red', 'red', ..., 'red', 'red', 'red'], dtype='<U3')

In [None]:
# Label array for the white wine DataFrame: the string 'white' once per row
white_color = np.full(len(df_white), 'white')
white_color

array(['white', 'white', 'white', ..., 'white', 'white', 'white'],
      dtype='<U5')

In [None]:
# Store the label arrays created earlier as a 'color' column on each DataFrame
df_red['color'] = red_color
df_white['color'] = white_color

In [None]:
# Rename the hyphenated red-wine column so its name matches the white-wine
# column 'total_sulfur_dioxide'; the rename is applied in place
df_red.rename(columns = {'total_sulfur-dioxide' : 'total_sulfur_dioxide'}, inplace=True)

In [None]:
# Check that the two datasets about to be combined have the same column names.
# The names are compared as sets, so column order is ignored; the result is
# True when no name appears in only one of the two DataFrames.
set(df_red.columns).symmetric_difference(df_white.columns) == set()

True

# 1. 제목