# 기타

## 자주 사용하는 단축키

* ctrl + m h : 단축키 표시
* ctrl + shift + enter : 섹션 셀 삽입
* ctrl + m d : 선택 셀 삭제
* ctrl + enter : 코드셀 실행
* shift + enter : 셀을 실행하고 다음 셀 선택
* ctrl + m b : 아래에 코드 셀 삽입
* ctrl + m v : 아래에 텍스트 셀 삽입
* ctrl + . : 모든 출력 지우기
* ctrl + shift + a : 모든 셀 선택




## 학습 라이브러리

- numpy
- scipy
- pandas
- scikit-learn



# Chapter2 데이터 전처리

## 2.1 데이터 탐색

### 2.1.3 파이썬 데이터 탐색

타이타닉 데이터

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Titanic data: read the CSV directly from the datasciencedojo GitHub URL
df_raw = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
print(df_raw)

In [None]:
# Variable type conversion: store the two label-like columns as strings so
# they are handled as categorical values. astype() returns a new frame, so
# df_raw itself is left unchanged.
df_type = df_raw.astype({"Survived": str, "Pclass": str})

print(df_type.info())

In [None]:
# Descriptive statistics; include='all' asks describe() to report every
# column, not only the numeric ones
df1 = df_type.copy()

print(df1.describe(include='all'))

In [None]:
# Passenger frequency per Pclass, computed two ways
df1 = df_type.copy()
print(df1.groupby("Pclass").size())  # number of rows in each group
print(df1.groupby("Pclass").count()["PassengerId"])  # non-null PassengerId values in each group

In [None]:
# Histogram of the fare over all passengers
df1 = df_type.copy()
plt.hist(df1["Fare"])
plt.show()

# Split the fares by outcome. The labels are the strings '0' and '1'
# because Survived was converted with astype(str) when df_type was built.
data_0 = df1.loc[df1["Survived"] == '0', "Fare"]
data_1 = df1.loc[df1["Survived"] == '1', "Fare"]

# Box plots of the two groups side by side
fig, ax = plt.subplots()
ax.boxplot([data_0, data_1])
plt.show()


In [None]:
# Difference in survival between the sexes
df_a = df_type.copy()

df_male = df_a[df_a["Sex"]=="male"]
df_female = df_a[df_a["Sex"]=="female"]

# Survived was cast to str when df_type was built, so the group labels are
# the strings "0" (dead) and "1" (survived). Look them up by label with
# .loc: an integer key such as size()[0] on a string-labelled Series only
# works through the deprecated positional fallback of Series.__getitem__.
male_counts = df_male.groupby("Survived").size()
male_dead = male_counts.loc["0"]
male_survive = male_counts.loc["1"]
male_survive_rate = male_survive / (male_survive + male_dead)

female_counts = df_female.groupby("Survived").size()
female_dead = female_counts.loc["0"]
female_survive = female_counts.loc["1"]
female_survive_rate = female_survive / (female_survive + female_dead)

print("male survive rate : %0.3f \nfemale survive rate : %0.3f" %(male_survive_rate, female_survive_rate))



## 2.2 데이터 전처리 개요

### 2.2.1 데이터 전처리 유형

데이터 전처리는 데이터 가공(data manipulation), 데이터 핸들링(data handling) 등으로도 불린다.

### 2.2.2 데이터 변환

In [None]:
import numpy as np
import pandas as pd


# Generate synthetic data: 1000 standard-normal draws per country, scaled
# and shifted to mean 53.9 / std 5 (korean) and mean 32.7 / std 4 (japanese)
meat_consumption_korean = np.random.randn(1000) * 5 + 53.9
meat_consumption_japan = np.random.randn(1000) * 4 + 32.7

meat_consumption_df = pd.DataFrame(
    {
        "korean": meat_consumption_korean,
        "japanese": meat_consumption_japan,
    }
)
print(meat_consumption_df)

In [None]:
# Z-score standardisation computed three ways: scipy, plain numpy, scikit-learn
import scipy.stats as ss
from sklearn.preprocessing import StandardScaler

df = meat_consumption_df.copy()

# 1) scipy.stats.zscore
for col in ("korean", "japanese"):
    df[col + "_zscore"] = ss.zscore(df[col])

# 2) (x - mean) / std written out with numpy
for col in ("korean", "japanese"):
    df[col + "_zscore_np"] = (df[col] - np.mean(df[col])) / np.std(df[col])

# 3) StandardScaler: the input must be 2-D, so pass a one-column DataFrame
#    (df[["korean"]]) rather than a Series (df["korean"])
scaler = StandardScaler()
for col in ("korean", "japanese"):
    df[col + "_zscore_skl"] = scaler.fit_transform(df[[col]])

# Show the two types side by side: Series vs DataFrame
print(type(df["korean"]))
print(type(df[["korean"]]))

print(df)


In [None]:
# Min-max scaling, once with scikit-learn and once by hand
from sklearn.preprocessing import MinMaxScaler

df = meat_consumption_df.copy()
scaler = MinMaxScaler()

# scikit-learn version (one-column DataFrame as 2-D input)
for col in ("korean", "japanese"):
    df[col + "_mm"] = scaler.fit_transform(df[[col]])

# Manual version: (x - min) / (max - min)
for col in ("korean", "japanese"):
    df[col + "_mm_np"] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
print(df)



USJudgeRatings data


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as ss

# Load the USJudgeRatings sample CSV (the path looks like a Colab-mounted Google Drive — confirm)
df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/USJudgeRatings.csv")

In [None]:
df1 = df.copy()

# Skewness of the two columns
for column in ("CONT", "PHYS"):
    print(ss.skew(df1[column]))

# One histogram per column, each drawn as its own figure
for column in ("CONT", "PHYS"):
    plt.hist(df1[column])
    plt.show()


In [None]:
# Variable transformation driven by skewness.
# Work on the copy (df1) throughout: the original cell created df1 but then
# wrote the new columns into df, mutating the loaded data.
df1 = df.copy()

print(df1.head(5))

# CONT: plain log10 transform
df1["CONT2"] = np.log10(df1["CONT"])
# PHYS: reflect the values about (max + 1) before taking log10, so the
# argument of the log is always >= 1
df1["PHYS2"] = np.log10(df1["PHYS"].max() + 1 - df1["PHYS"])

# Skewness after the transformation
print(ss.skew(df1["CONT2"]))
print(ss.skew(df1["PHYS2"]))




In [None]:
# Binning with a histogram: ten (name, score) records
data = [
    ["철수", 52],
    ["영희", 92],
    ["미영", 84],
    ["시완", 71],
    ["미경", 65],
    ["영환", 81],
    ["숙경", 66],
    ["부영", 77],
    ["민섭", 73],
    ["보연", 74],
]
df = pd.DataFrame(data, columns=["이름", "성적"])

# Five equal-width bins over the range 50-100
plt.hist(df["성적"], bins=5, range=(50, 100), rwidth=0.9)
plt.show()

In [None]:
# Splitting into intervals with explicit conditions (kept as a reference
# for the grade rule: F < 60 <= D < 70 <= C < 80 <= B < 90 <= A)
# df2 = df.copy()
# df2["등급"] =0

# df2.loc[(df["성적"]<60),"등급"] = "F"
# df2.loc[(df["성적"]>=60)&(df["성적"]<70),"등급"] = "D"
# df2.loc[(df["성적"]>=70)&(df["성적"]<80),"등급"] = "C"
# df2.loc[(df["성적"]>=80)&(df["성적"]<90),"등급"] = "B"
# df2.loc[(df["성적"]>=90),"등급"] = "A"
# # print(df2)


# Same rule with pd.cut.
# right=False makes every bin left-closed / right-open, i.e. [0, 60),
# [60, 70), ... so a score of exactly 60/70/80/90 gets the HIGHER grade,
# matching the conditions above. (With the default right=True those scores
# would fall into the lower grade; include_lowest only affects the first
# bin.) The open upper edge keeps a score of 100 inside the "A" bin.
df4 = df.copy()
df4["등급"] = pd.cut(x=df4["성적"], bins=[0, 60, 70, 80, 90, float("inf")],
                   labels=["F", "D", "C", "B", "A"], right=False)
# print(df4)

# qcut: quantile-based split, so each category receives (about) the same
# number of rows
df5 = df.copy()
df5["등급"] = pd.qcut(x=df5["성적"], q=5, labels=["F","D","C","B","A"])
print(df5)
plt.hist(df5["등급"])
plt.show()


### 2.2.3 차원축소 : PCA(주성분분석)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the iris sample CSV (the path looks like a Colab-mounted Google Drive — confirm)
iris = pd.read_csv("/content/drive/MyDrive/bigdata_sample/iris.csv")

In [None]:
# Separate the categorical column: keep the labels in df_species and the
# remaining columns in df
df_species = iris["species"]
df = iris.drop(columns="species")
print(df_species)

In [None]:
# Standardise the variables
df2 = df.copy()
print(df2.columns)

# Each column is fitted and transformed on its own (one-column DataFrame
# as the 2-D input the scaler expects)
x = StandardScaler()
for col in ("sepal_length", "sepal_width", "petal_length", "petal_width"):
    df2[col] = x.fit_transform(df2[[col]])

# print(df2)

# Run PCA keeping four components
pca = PCA(n_components=4)
p_score = pca.fit_transform(df2)
print(p_score.shape)
# Share of the total variance explained by each component
print(pca.explained_variance_ratio_)


### 2.2.4 결측치 처리

In [None]:
import pandas as pd

# Load the Titanic sample CSV used for the missing-value examples
df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/titanic.csv")

In [None]:
# Missing values per column, then the total number of missing values
print(df.isnull().sum())
print(df.isnull().sum().sum())

# Drop every row that contains at least one missing value
df1 = df.dropna(axis=0)
print(df1.shape)

# Drop only the rows whose Age is missing
df2 = df.dropna(subset=["Age"], axis=0)
print(df2.shape)

# Mean imputation.
# Assign the filled column back instead of calling fillna(inplace=True) on
# df3["Age"]: the chained in-place form operates on an intermediate object,
# triggers a FutureWarning in pandas 2.x and does not update df3 under
# Copy-on-Write.
avg_age = df["Age"].mean()
df3 = df.copy()
df3["Age"] = df3["Age"].fillna(avg_age)
print(df3.isnull().sum())


In [None]:
# Mode imputation: fill missing Embarked values with the most frequent one
df4 = df.copy()
print(df4.isnull().sum())
mode = df4["Embarked"].mode()[0]  # mode() returns a Series; take its first entry
# Assign back rather than using chained fillna(inplace=True), which does
# not update df4 under pandas Copy-on-Write
df4["Embarked"] = df4["Embarked"].fillna(mode)
print(df4.isnull().sum())

In [None]:
# Imputation from neighbouring values.
# DataFrame.ffill()/bfill() replace fillna(method=...), whose `method`
# argument is deprecated in recent pandas releases.
data = [1,2,3,None,None,None,5,6,7]
df = pd.DataFrame(data, columns=["test"])
print(df.ffill())  # fill with the previous (preceding) value
print(df.bfill())  # fill with the next value



### 2.2.5 이상치 처리

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 200 draws from a normal distribution with mean 50 and std 10 ...
data = np.random.randn(200) * 10 + 50
df = pd.DataFrame({"data": data})

# ... followed by four hand-picked extreme values appended as rows 200-203
for row_label, outlier in zip(range(200, 204), (2, 100, 10, 110)):
    df.loc[row_label] = outlier



In [None]:
# Spotting outliers with a histogram (20 bins, bars drawn at 80% width)

plt.hist(df, bins=20, rwidth=0.8)
plt.show()

In [None]:
# Using the IQR: box plot first, then the quartiles themselves
df2 = df.copy()
plt.boxplot(df2["data"])
plt.show()

# First and third quartile, and the interquartile range between them
Q1, Q3 = (df2["data"].quantile(q) for q in (.25, .75))
IQR = Q3 - Q1
print(Q1, Q3, IQR)

print(df2.describe())


In [None]:
# Find the outliers: everything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Q1, Q3 and IQR come from the previous cell.
df3 = df.copy()

low_con = Q1 - 1.5*IQR
upper_con = Q3 + 1.5*IQR
low_out = df3.loc[df3["data"].lt(low_con)]
upper_out = df3.loc[df3["data"].gt(upper_con)]
print(low_out, upper_out, sep="\n")

### 2.2.6 평활화(smoothing)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the lynx sample CSV used for the smoothing examples
df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/lynx.csv")

In [None]:
# Keep every column except the first one (presumably an exported index
# column — confirm against the CSV); .copy() keeps df2 independent of df
df2 = df.iloc[:, 1:].copy()

print(df2.head(5))
print(df2.info())

In [None]:
# Simple moving average over a 10-row window; assign() returns a new frame,
# so df2 is left untouched
df3 = df2.assign(sma=df2["value"].rolling(window=10).mean())

# Raw series and its smoothed version on the same axes
plt.plot(df3["value"])
plt.plot(df3["sma"])
plt.show()

In [None]:
# Exponentially weighted moving average; the first positional argument of
# ewm() is `com` (centre of mass), spelled out here
df4 = df2.assign(ewma=df2["value"].ewm(com=10).mean())

plt.plot(df4["value"])
plt.plot(df4["ewma"])
plt.show()

# Chapter3 데이터 분석

## 3.1 빅데이터 분석 과정

### 3.1.1 빅데이터 분석과정의 이해

### 3.1.2 사이킷런 패키지

### 3.1.3 분석모델 성능 평가 방법


#### 지도학습-회귀(예측)

In [None]:
# 지도학습-회귀(예측)
## mse(mean squared error)
## rmse(root mse)
## rmsle(root mean squared logarithmic error)
## mae(mean absolute error)
## r2 (r-squared)
## adjusted r2
## mspe(mean squared percentage error)
## mape(mean absolute percentage error)
## aic (akaike information criterion)
## bic (bayes information criterion)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Data for the linear regression examples (column notes):
#   mpg          : fuel efficiency
#   cylinders    : number of cylinders
#   displacement : engine displacement
#   acceleration : acceleration
#   model year   : release year
auto_mpg = pd.read_csv("/content/drive/MyDrive/bigdata_sample/auto-mpg.csv")

# Preprocessing: missing horsepower values are replaced by the column mean
auto_mpg = auto_mpg.fillna({"horsepower": auto_mpg["horsepower"].mean()})

# Simple linear regression: mpg explained by weight alone
y = auto_mpg["mpg"]
x = auto_mpg[["weight"]]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
model = LinearRegression().fit(x_train, y_train)
pred = model.predict(x_test)

# Multiple linear regression: four explanatory variables
y_mul = auto_mpg["mpg"]
x_mul = auto_mpg[["weight","horsepower","cylinders","displacement"]]
x_train_mul, x_test_mul, y_train_mul, y_test_mul = train_test_split(x_mul, y_mul, test_size=0.2, random_state=10)
model_mul = LinearRegression().fit(x_train_mul, y_train_mul)
pred_mul = model_mul.predict(x_test_mul)


In [None]:
# r2_score is used below and must be imported here; the original import
# line omitted it, so running the notebook top to bottom raised NameError.
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    mean_squared_log_error,
    r2_score,
)
import numpy as np
import statsmodels.api as sm

## mse (mean squared error): library value next to the hand computation
mse = mean_squared_error(y_test, pred)
mse_cal = sum((y_test-pred)**2)/len(y_test)
print("mse : ", mse, mse_cal)

## rmse (root mse)
rmse = np.sqrt(mse)
print("rmse : ", rmse)

## rmsle (root mean squared logarithmic error)
## mean_squared_log_error returns the MSLE; RMSLE is its square root
msle = mean_squared_log_error(y_test, pred)
rmsle = np.sqrt(msle)
print("msle : ", msle)
print("rmsle : ", rmsle)

## mae (mean absolute error)
mae = mean_absolute_error(y_test, pred)
mae_cal = sum(abs(y_test-pred))/len(y_test)
print("mae : ", mae, mae_cal)

## r2 (r-squared)
r2 = r2_score(y_test, pred)
def r2_cal(y_test, pred):
  """Return the coefficient of determination R^2 computed by hand.

  R^2 = 1 - SS_res / SS_tot, where SS_res is the sum of squared residuals
  (the part the model does not explain) and SS_tot is the total sum of
  squares around the mean of ``y_test``.

  Parameters
  ----------
  y_test : array-like with a ``mean()`` method (e.g. pandas Series)
      Observed target values.
  pred : array-like
      Predicted values, same length as ``y_test``.

  Returns
  -------
  The R^2 value. No guard is applied when ``y_test`` is constant
  (SS_tot == 0); the division then follows numpy's rules.
  """
  y_mean = y_test.mean()
  ss_total = np.sum((y_test - y_mean) ** 2)
  ss_residual = np.sum((y_test - pred) ** 2)  # unexplained part
  # For an OLS fit evaluated on its own training data the explained part,
  # sum((pred - y_mean) ** 2) / ss_total, gives the same value; on held-out
  # data only the residual form below is the usual definition.
  return 1 - ss_residual / ss_total
# Library R^2 next to the hand-computed value from r2_cal
print("r2 : ", r2, r2_cal(y_test, pred))

## adjusted r2
## Computed for the multiple-regression model: n = number of test rows,
## k = number of explanatory variables
n = len(y_test_mul)
k = x_test_mul.shape[1]
r2_mul = r2_score(y_test_mul, pred_mul)
r2_adj = 1-(n-1)*(1-r2_mul)/(n-k-1)
print("r2_adj : ", r2_adj)


## mspe(mean squared percentage error)
## Mean of the squared relative errors; computed by hand
mspe = sum(((y_test-pred)/y_test)**2)/len(y_test)
print("mspe : ", mspe)

## mape(mean absolute percentage error)
## Library value next to the hand computation
mape = mean_absolute_percentage_error(y_test, pred)
mape_cal = sum(abs((y_test-pred)/y_test))/len(y_test)
print("mape : ", mape, mape_cal)

## aic (akaike information criterion)
## bic (bayes information criterion)




#### 지도학습-분류

In [None]:
# confusion matrix(오차행렬)
# accuracy 정확도
# precision 정밀도
# recall 재현율
# f1 score
# roc
# auc


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
titanic = pd.read_csv("/content/drive/MyDrive/bigdata_sample/titanic.csv")


# Preprocessing: drop Cabin, impute Age with the mean and Embarked with the
# most frequent value, then derive the family size
titanic = titanic.drop(columns="Cabin")
titanic = titanic.fillna(
    {
        "Age": titanic["Age"].mean(),
        "Embarked": titanic["Embarked"].mode()[0],
    }
)
titanic["FamilySize"] = titanic["SibSp"].add(titanic["Parch"])

# Label-encode the two text columns (a fresh encoder per column)
for col in ("Sex", "Embarked"):
    titanic[col] = LabelEncoder().fit_transform(titanic[col])

x = titanic[["Pclass","Sex","Age","Fare","Embarked","FamilySize"]]
y = titanic["Survived"]
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=10)


# Model fitting; fit() returns the estimator itself
model = DecisionTreeClassifier().fit(x_train, y_train)
pred = model.predict(x_test)


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.metrics import RocCurveDisplay
import matplotlib.pyplot as plt

# Text summary of precision / recall / f1 per class
rept = classification_report(y_test, pred)
print(rept)

# Confusion matrix
confu = confusion_matrix(y_test, pred)
print("confusion matrix : \n", confu)

# Rows are the actual classes and columns the model's predictions, so
# flattening the 2x2 matrix row by row yields TN, FP, FN, TP.
TN, FP, FN, TP = confu.ravel()

# Accuracy: library value next to the hand computation
acc = accuracy_score(y_test, pred)
acc_cal = (TP + TN) / (TN + FN + FP + TP)
print("accuracy : ", acc, acc_cal)

# Precision, by hand for the positive (_p) and the negative (_n) class
prec = precision_score(y_test, pred)
prec_cal_p = TP / (TP + FP)
prec_cal_n = TN / (TN + FN)
print("precision : ", prec, prec_cal_p, prec_cal_n)

# Recall, by hand for both classes
rec = recall_score(y_test, pred)
rec_cal_p = TP / (TP + FN)
rec_cal_n = TN / (TN + FP)
print("recall : ", rec, rec_cal_p, rec_cal_n)

# F1 score: harmonic mean of precision and recall, per class
f1 = f1_score(y_test, pred)
f1_cal_p = 2 * (prec_cal_p * rec_cal_p) / (prec_cal_p + rec_cal_p)
f1_cal_n = 2 * (prec_cal_n * rec_cal_n) / (prec_cal_n + rec_cal_n)
print("f1 score : ", f1, f1_cal_p, f1_cal_n)


# ROC and AUC.
# NOTE(review): roc_curve receives the hard 0/1 predictions here, not
# scores or probabilities, so the curve has very few points; that makes the
# hand-computed rates directly comparable — confirm this is intended.
fpr_cal = FP / (FP + TN)
tpr_cal = TP / (TP + FN)
fpr, tpr, thresholds = roc_curve(y_test, pred)
print("fpr: ", fpr, fpr_cal)
print("tpr: ", tpr, tpr_cal)

# Area under the curve, obtained two ways
auc_score = auc(fpr, tpr)
auc_score2 = roc_auc_score(y_test, pred)
print("auc_score : ", auc_score, auc_score2)

display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc_score)
display.plot()
plt.show()


#### 비지도학습

In [None]:
#silhouette(실루엣 계수)
#Dunn Index


In [221]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

iris = pd.read_csv("/content/drive/MyDrive/bigdata_sample/iris.csv")

# Preprocessing: encode the species names as integers
iris["species"] = LabelEncoder().fit_transform(iris["species"])
# NOTE(review): the encoded species column stays in the clustering input;
# the commented line below is the variant that excludes it.
# x = iris.drop("species", axis=1)
x = iris.copy()

# Fitting (fit() returns the estimator itself)
model = KMeans(n_clusters=3, n_init=10, max_iter=500, random_state=42).fit(x)

cluster_center = model.cluster_centers_
cluster_prediction = model.predict(x)

print(cluster_center)
print(cluster_prediction)
# print(pd.DataFrame(cluster_center))
# print(cluster_prediction)

# inertia_ and score() are printed together for comparison; in the recorded
# output below score() is the negative of inertia_
inertia = model.inertia_
print("inertia : ", inertia)
sco = model.score(iris)
print(sco)

[[6.62244898 2.98367347 5.57346939 2.03265306 2.        ]
 [5.006      3.428      1.462      0.246      0.        ]
 [5.91568627 2.76470588 4.26470588 1.33333333 1.01960784]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 2 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
inertia :  87.22062785114048
-87.22062785114046


In [231]:
from sklearn.metrics import silhouette_score, silhouette_samples

# Features only: the species column is removed before scoring
iris_property = iris.drop("species", axis=1)
labels = cluster_prediction
# print(iris_property)

# Overall silhouette coefficient of the clustering
silhouette = silhouette_score(iris_property, labels)
print(silhouette)

# Per-sample silhouette coefficients
score_samples = silhouette_samples(iris_property, labels)

# The overall score is the mean of the per-sample coefficients. Take it
# from score_samples: the original called silhouette_score AFTER the
# coefficient column had been added to iris_property, so the coefficient
# itself was used as a clustering feature.
average_score = score_samples.mean()

# Attach the coefficients for inspection only
iris_property['silhouette_coeff'] = score_samples

iris_property.head(3)

0.5111496981090736


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,silhouette_coeff
0,5.1,3.5,1.4,0.2,0.846713
1,4.9,3.0,1.4,0.2,0.807609
2,4.7,3.2,1.3,0.2,0.822507


## 3.2 지도학습 - 분류

In [None]:
import pandas as pd
import numpy as np

# 전처리용 모듈
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

# 분류 모델 모듈
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# 평가 지표
import sklearn.metrics as metric



In [None]:
# Data: Titanic and the bank personal-loan data set
titanic = pd.read_csv("/content/drive/MyDrive/bigdata_sample/titanic.csv")

df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/Bank_Personal_Loan_Modelling.csv")
# df = pd.read_csv("")
# Quick look at the loan data: contents, column info, summary statistics
print(df)
print(df.info())
print(df.describe(include="all"))


In [None]:
# Preprocessing
## Decision tree
### Categorical variables have to be converted to numeric values
titanic = pd.read_csv("/content/drive/MyDrive/bigdata_sample/titanic.csv")
titanic.drop("Cabin", axis=1, inplace=True)
# Assign the filled columns back instead of the chained
# titanic["Age"].fillna(..., inplace=True) form, which works on an
# intermediate object and does not update the frame under pandas
# Copy-on-Write (FutureWarning in pandas 2.x)
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].mean())
titanic["Embarked"] = titanic["Embarked"].fillna(titanic["Embarked"].mode()[0])
print(titanic.info())

# One encoder object re-fitted per column
le = LabelEncoder()
titanic["Sex"] =le.fit_transform(titanic["Sex"])
titanic["Embarked"]= le.fit_transform(titanic["Embarked"])
y = titanic["Survived"]
x = titanic[["Pclass","Sex","Age","Fare","Embarked"]]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=11)

model = DecisionTreeClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)
acc = metric.accuracy_score(y_test, pred)
print(acc)
# print(dir(metric))
rept = metric.classification_report(y_test,pred)
conf = metric.confusion_matrix(y_test,pred)

print(rept)
print(conf)

# NOTE(review): hard-coded counts, presumably copied from one run's
# confusion matrix; the tree has no random_state, so they may not match
# the matrix printed above — confirm.
print(98/(98+16))



In [None]:
# Common steps
## Split the data
## Fit the model
## Predict with the model
## Evaluate the model
# 2*a*b/(a+b) is the harmonic mean of 0.84 and 0.82 (the form of the F1
# formula); the two values are hard-coded — presumably precision and
# recall read from an earlier report, confirm
print(2*0.84*0.82/(0.84+0.82))


### 3.2.1 의사결정나무

In [None]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Load the Titanic sample CSV for the decision-tree section
df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/titanic.csv")

In [None]:
# Work on a copy; info() prints the column summary and returns None, which
# print() then shows as well
df2 = df.copy()
print(df2.info())

In [None]:
# Data preprocessing
from sklearn.preprocessing import LabelEncoder

df2 = df.copy()

# Age -> mean imputation, Cabin -> dropped, Embarked -> mode imputation
avg_age = df2["Age"].mean()
df2["Age"] = df2["Age"].fillna(avg_age)
df2 = df2.drop(columns="Cabin")
df2["Embarked"] = df2["Embarked"].fillna(df2["Embarked"].mode()[0])
print(df2.info())

print(df2.describe(include='all'))

# Label encoding: turn the categorical variables into numbers
# (a fresh encoder per column)
for col in ("Sex", "Embarked"):
    df2[col] = LabelEncoder().fit_transform(df2[col])
print(df2.head(10))

# Derived variable: siblings/spouses plus parents/children
df2["FamilySize"] = df2["SibSp"].add(df2["Parch"])
print(df2.info())

In [None]:
# Prepare the data set for analysis: target first, then the six features
df3 = df2.copy()
y = df3["Survived"]
x = df3[["Pclass","Sex","Age","Fare","Embarked","FamilySize"]]

# Train / test split (80% / 20%, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=11)
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))


In [None]:
# Analysis: fit a decision tree with a fixed seed and predict the test set
from sklearn.metrics import accuracy_score

dt = DecisionTreeClassifier(random_state=11).fit(x_train, y_train)
pred = dt.predict(x_test)

# Performance evaluation: share of correctly classified test rows
acc = accuracy_score(y_test, pred)
print(acc)

### 3.2.2 KNN

In [None]:
import pandas as pd
import numpy as np

# Load the iris sample CSV for the KNN section
df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/iris.csv")

In [None]:
# Data exploration: first rows, column info, summary statistics
df2 = df.copy()
print(df2.head())
print(df2.info())
print(df2.describe(include="all"))

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Data preprocessing: min-max scale each measurement column on its own
# (the scaler is re-fitted per column on a one-column DataFrame)
minmax = MinMaxScaler()
for col in ("sepal_length", "sepal_width", "petal_length", "petal_width"):
    df2[col] = minmax.fit_transform(df2[[col]])

print(df2)

In [None]:
# Prepare the data set: species is the target, the four measurements are
# the features
from sklearn.model_selection import train_test_split

df3 = df2.copy()
y = df3["species"]
x = df3[["sepal_length", "sepal_width", "petal_length", "petal_width"]]

# Split the data (80% train / 20% test, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=11)
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Analysis: performance changes with n_neighbors, so tune this value
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
pred = knn.predict(x_test)

# Performance measurement
acc = accuracy_score(y_test, pred)
print(acc)


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/titanic.csv")
# print(df.head(5))
# print(df.info())
# print(df.describe(include='all'))

# Imputation: assign the filled columns back. The chained
# df["Age"].fillna(..., inplace=True) form works on an intermediate object,
# raises a FutureWarning in pandas 2.x and does not update df under
# Copy-on-Write.
df["Age"] = df["Age"].fillna(df["Age"].mean())
df.drop("Cabin", axis=1, inplace=True)
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
df["FamilySize"] = df["SibSp"] + df["Parch"]

# Label-encode the text columns (one encoder object re-fitted per column)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["Sex"]=le.fit_transform(df["Sex"])
df["Embarked"] = le.fit_transform(df["Embarked"])
# print(df.groupby("Sex").size())
# print(df.groupby("Embarked").size())

# Manual conversion (alternative to LabelEncoder)
# df["Sex"]= df["Sex"].replace("female",1)
# df["Sex"]= df["Sex"].replace("male",2)
# df["Embarked"]= df["Embarked"].replace("C",1)
# df["Embarked"]= df["Embarked"].replace("Q",2)
# df["Embarked"]= df["Embarked"].replace("S",3)
# print(df.groupby("Sex").size())
# print(df.groupby("Embarked").size())

from sklearn.model_selection import train_test_split
y = df["Survived"]
x = df[["Pclass","Sex","Age","Fare","Embarked","FamilySize"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=11)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

# KNN with 15 neighbours on the unscaled features
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(x_train, y_train)
pred = knn.predict(x_test)


# Evaluation: accuracy, per-class report, confusion matrix
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test,pred)
print(acc)

from sklearn.metrics import classification_report
rpt = classification_report(y_test, pred)
print(rpt)

from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_test,pred)
print(mat)


### 3.2.3 SVM


In [None]:
import pandas as pd

# Load the Titanic sample CSV for the SVM section
df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/titanic.csv")


In [None]:
# Data exploration (disabled)
# print(df.info())
# print(df.describe(include="all"))

# Preprocessing: fillna() with a per-column mapping imputes Age with its
# mean and Embarked with its most frequent value and returns a new frame;
# Cabin is dropped afterwards
df2 = df.fillna(
    {
        "Age": df["Age"].mean(),
        "Embarked": df["Embarked"].mode()[0],
    }
).drop(columns="Cabin")
df2["FamilySize"] = df2["SibSp"].add(df2["Parch"])

print(df2.info())

In [None]:
# One-hot encoding: one indicator column per distinct value
onehot_sex, onehot_embarked = (pd.get_dummies(df2[col]) for col in ("Sex", "Embarked"))

# Append the indicator columns side by side — mind the axis: it must be the
# column axis, not the row axis
df3 = pd.concat([df2, onehot_sex, onehot_embarked], axis="columns")
print(df3.info())


In [None]:
# Prepare the analysis data set
from sklearn.model_selection import train_test_split

# Numeric features plus the one-hot columns created from Sex and Embarked
x = df3[["Pclass", "Age", "Fare", "FamilySize", "female", "male", "C", "Q", "S"]]
y = df3["Survived"]

# 70% train / 30% test, fixed seed
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

In [None]:
# Analysis: SVC with default settings
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC

svm = SVC().fit(x_train, y_train)
pred = svm.predict(x_test)

# Accuracy, confusion matrix and per-class report, printed in that order
acc = accuracy_score(y_test, pred)
mat = confusion_matrix(y_test, pred)
rep = classification_report(y_test, pred)
for result in (acc, mat, rep):
    print(result)



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def svm_test(kerner, c, gamma):
  """Train an SVC on the Titanic data and print its classification report.

  The function reloads the CSV, repeats the preprocessing and one-hot
  encoding of the previous cells, fits the model on a 70/30 split and
  prints the report for the test part. It returns None.

  Parameters
  ----------
  kerner : value passed to SVC as ``kernel`` (parameter name kept as is
      for compatibility with existing calls)
  c : value passed to SVC as ``C``
  gamma : value passed to SVC as ``gamma``
  """
  df2 = pd.read_csv("/content/drive/MyDrive/bigdata_sample/titanic.csv")

  # Preprocessing
  df2["Age"] = df2["Age"].fillna(df2["Age"].mean())
  df2["Embarked"] = df2["Embarked"].fillna(df2["Embarked"].mode()[0])
  df2.drop("Cabin", axis=1,inplace=True)
  df2["FamilySize"] = df2["SibSp"] + df2["Parch"]

  # One-hot encoding; concatenate along the column axis
  onehot_sex = pd.get_dummies(df2["Sex"])
  onehot_embarked = pd.get_dummies(df2["Embarked"])
  df3 = pd.concat([df2, onehot_sex, onehot_embarked], axis=1)

  # Prepare the analysis data set
  y = df3["Survived"]
  x = df3[["Pclass", "Age", "Fare", "FamilySize", "female", "male", "C", "Q", "S"]]

  x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.3, random_state=10)

  # Analysis. Only the classification report is printed; the accuracy and
  # confusion matrix that used to be computed here were never used.
  svm = SVC(kernel=kerner, C=c, gamma=gamma)
  svm.fit(x_train,y_train)
  pred = svm.predict(x_test)
  rep = classification_report(y_test, pred)
  print(rep)


svm_test('rbf', 1, 0.1)


### 3.2.4 로지스틱 회귀

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/iris.csv")


# NOTE(review): open question from the author — is normalisation actually
# needed as preprocessing for logistic regression?
from sklearn.preprocessing import MinMaxScaler
mx = MinMaxScaler()
for col in ("sepal_length", "sepal_width", "petal_length", "petal_width"):
    df[col] = mx.fit_transform(df[[col]])

# print(df.info())
x = df[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
y = df["species"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=11)
# print(x_train.shape, x_test.shape)

import sklearn
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression().fit(x_train, y_train)
pred = lg.predict(x_test)

# Evaluation: the accuracy is printed first, then the per-class report
from sklearn.metrics import classification_report, accuracy_score
acc = accuracy_score(y_test, pred)
rpt = classification_report(y_test, pred)
print(acc)
print(rpt)

### 3.2.5 랜덤 포레스트

In [None]:
import sklearn
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/titanic.csv")

# Preprocessing. The imputed columns are assigned back: the chained
# df["Age"].fillna(..., inplace=True) form works on an intermediate object
# and does not update df under pandas Copy-on-Write (FutureWarning in 2.x).
df.drop("Cabin", axis=1, inplace = True)
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
df["FamilySize"] = df["SibSp"] + df["Parch"]

# Label-encode the text columns (one encoder object re-fitted per column)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["Sex"] = le.fit_transform(df["Sex"])
df["Embarked"] = le.fit_transform(df["Embarked"])

y=df["Survived"]
x=df[["Pclass","Sex","Age","Fare","Embarked","FamilySize"]]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=11)



# Random forest: 50 trees of depth at most 3, fixed seed
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, max_depth=3, random_state=20)
rf.fit(x_train, y_train)
pred = rf.predict(x_test)

from sklearn.metrics import classification_report
rpt = classification_report(y_test, pred)
print(rpt)

## 3.3 지도학습 - 회귀(예측)

In [None]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression


# Load the auto-mpg sample CSV for the regression section
auto_mpg = pd.read_csv("/content/drive/MyDrive/bigdata_sample/auto-mpg.csv")


In [None]:
# Simple linear regression
# Missing horsepower values are replaced by the column mean
auto_mpg["horsepower"] = auto_mpg["horsepower"].fillna(auto_mpg["horsepower"].mean())
print(auto_mpg.info())
print(auto_mpg.describe(include="all"))

# mpg : fuel efficiency
# cylinders : number of cylinders
# displacement : engine displacement
# acceleration : acceleration
# model year : release year

In [None]:
# print(auto_mpg.corr())
x = auto_mpg[["weight"]]
y = auto_mpg["mpg"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

# Model fitting
model = LinearRegression()
model.fit(x_train,y_train)
intercept = model.intercept_
coef = model.coef_
pred = model.predict(x_test)

# Reproduce the prediction by hand: intercept + coefficient * weight.
# pred2 is a DataFrame, so its "weight" column holds the manual prediction.
pred2 = intercept + x_test*coef
pred_df = pred2.copy()
pred_df["pred"] = pred
# Compare with a small tolerance: the manual and the library computation
# can differ in the last floating-point bits, which makes an exact ==
# report False for values that are in fact the same prediction.
pred_df["test"] = (pred_df["weight"] - pred_df["pred"]).abs() < 1e-9
# print(pred_df)

# MSE: library value next to the hand computation
from sklearn.metrics import r2_score, mean_squared_error
mse = mean_squared_error(y_test, pred)
# print(mse)
mse_cal = sum((y_test-pred)**2)/len(y_test)
print(mse, mse_cal)


# pred2 = intercept + coef*x_test



### 3.3.1 단순 선형회귀

In [None]:
import pandas as pd

# Load the auto-mpg sample CSV and take a first look at it
df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/auto-mpg.csv")

print(df.head(5))
print(df.info())
print(df.describe(include="all"))

In [None]:
# Visualisation: mpg against two candidate explanatory variables, using the
# plot.scatter accessor (same plot as plot(kind="scatter"))
import matplotlib.pyplot as plt

df.plot.scatter(x="cylinders", y="mpg")
df.plot.scatter(x="displacement", y="mpg")

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Rows with any missing value are removed before modelling
df2 = df.dropna(axis=0)
x = df2[["weight"]]
y = df2["mpg"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

# Fit mpg ~ weight and show the slope and the intercept
lr = LinearRegression().fit(x_train, y_train)
pred = lr.predict(x_test)
print(lr.coef_)
print(lr.intercept_)


# Performance evaluation: R^2 on the held-out rows
score = r2_score(y_test, pred)
print(score)

### 3.3.2 다중 선형회귀

In [None]:
import pandas as pd

# Load the housing sample CSV and take a first look at it
df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/housing.csv")

print(df.info())
print(df.describe(include="all"))
print(df.head(5))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Remove incomplete rows and the ocean_proximity column; both calls return
# new frames, so df itself is not modified
df2 = df.dropna(axis=0).drop(columns="ocean_proximity")


# Multiple regression: every remaining column except the target
y_m = df2["median_house_value"]
x_m = df2.drop(columns="median_house_value")
# Simple regression: median_income only
y = df2["median_house_value"]
x = df2[["median_income"]]

# Same split settings for both feature sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
x_train_m, x_test_m, y_train_m, y_test_m = train_test_split(x_m, y_m, test_size=0.3, random_state=42)


lr = LinearRegression().fit(x_train, y_train)
lr_m = LinearRegression().fit(x_train_m, y_train_m)

# R^2 of the simple model next to R^2 of the multiple model
pred = lr.predict(x_test)
pred_m = lr_m.predict(x_test_m)
r2 = r2_score(y_test, pred)
r2_m = r2_score(y_test_m, pred_m)
print(r2, r2_m)



### 3.3.3 의사결정나무

In [None]:
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/housing.csv")

# Remove incomplete rows and the ocean_proximity column (new frame; df is
# left as loaded)
df2 = df.dropna(axis=0).drop(columns="ocean_proximity")

x = df2.drop(columns="median_house_value")
y = df2["median_house_value"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Regression tree limited to depth 3, fixed seed
dt = DecisionTreeRegressor(max_depth=3, random_state=42).fit(x_train, y_train)
pred = dt.predict(x_test)

# Mean squared error on the test rows
mse = mean_squared_error(y_test, pred)
print(mse)



### 3.3.4 랜덤 포레스트

In [None]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

df = pd.read_csv("/content/drive/MyDrive/bigdata_sample/housing.csv")

# Remove incomplete rows and the ocean_proximity column (new frame; df is
# left as loaded)
df2 = df.dropna(axis=0).drop(columns="ocean_proximity")

x = df2.drop(columns="median_house_value")
y = df2["median_house_value"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Random forest of trees limited to depth 3, fixed seed
rf = RandomForestRegressor(max_depth=3, random_state=42).fit(x_train, y_train)
pred = rf.predict(x_test)

# Mean squared error on the test rows
mse = mean_squared_error(y_test, pred)
print(mse)



## 3.4 비지도학습

### 3.4.1 군집분석 k-means

In [None]:
import pandas as pd


# Load the iris sample CSV for the k-means section
df=pd.read_csv("/content/drive/MyDrive/bigdata_sample/iris.csv")


In [None]:
# Work on a copy and encode the species names as integers
df2 = df.copy()
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2["species"] = le.fit_transform(df2["species"])

# Visualisation (imports only; the pair plot below is disabled)
import seaborn as sns
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d

# sns.pairplot(df, hue="species")
# plt.show()

In [None]:
from sklearn.cluster import KMeans

# Three clusters, 10 initialisations, fixed seed. df2 still contains the
# encoded species column, so it takes part in the clustering.
cluster1 = KMeans(n_clusters=3, n_init=10, max_iter=500, random_state=42).fit(df2)

cluster_center = cluster1.cluster_centers_
cluster_prediction = cluster1.predict(df2)
# print(pd.DataFrame(cluster_center))
# print(cluster_prediction)

# Attach the assigned cluster to a copy of the data (assign() returns a
# new frame)
df3 = df2.assign(cluster=cluster_prediction)
print(df3)


In [None]:
# Performance visualisation (elbow plot)
scope =range(1,10)
inertias =[]  # within-cluster sum of squared distances, one value per k
for k in scope:
  # Same n_init / random_state as the 3-cluster model fitted earlier, so
  # the curve is reproducible between runs and does not depend on the
  # scikit-learn version's default n_init
  model = KMeans(n_clusters=k, n_init=10, random_state=42)
  model.fit(df2)
  inertias.append(model.inertia_)

plt.figure(figsize=(4,4))
plt.plot(scope, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.show()

### 3.4.2 연관분석

In [None]:
import sklearn.cluster

# dir(sklearn)
# Print the names listed in the sklearn package's __all__
print(sklearn.__all__)
# import sklearn.feature_selection
# dir(sklearn.feature_selection)
# print(sklearn.datasets.__all__)