# 구름 환경에서 수행한 코드
`내점당구매액` 파생변수를 만들어서 이를 예측하는 모델 생성

In [None]:
# Use print() whenever output is wanted,
# e.g. print(df.head())

# Setting the working directory via getcwd()/chdir() is not required.
# Internal drive paths (C: etc.) cannot be accessed from file paths.

# Load the provided data files (train first, then test)
import pandas as pd
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")
# data/y_train.csv is not loaded here.

# User code
# 1. Data exploration
# (inspect with print(X_train.describe()) if needed)


# 2. Preprocessing
## 2.1. Create the derived variable '내점당구매액' = '총구매액' / '내점일수'
# Work on copies so the frames read from disk stay untouched.
All_df = X_train.copy()        # train
All_df_test = X_test.copy()    # test

# Same element-wise division for both frames.
for frame in (All_df, All_df_test):
    frame['내점당구매액'] = frame['총구매액'].div(frame['내점일수'])


## 2.2. Drop the columns that are not wanted as predictors:
##      cust_id and the two columns used to build the derived variable.
dropped_columns = ['cust_id', '총구매액', '내점일수']

##### Train
# drop() returns a new frame, so All_df itself is left unchanged.
All_df_preprocessing = All_df.drop(columns=dropped_columns)

##### Test
All_df_test_preprocessing = All_df_test.drop(columns=dropped_columns)


## 2.3. Check and fill missing values (the original note reports 2295 of them)
# To inspect the counts before/after: print(<frame>.isnull().sum())

##### Train
# Every missing cell, in every column, is replaced with 0.
All_df_preprocessing = All_df_preprocessing.fillna(value=0)

##### Test
All_df_test_preprocessing = All_df_test_preprocessing.fillna(value=0)


## 2.4. Split into features (X) and target (y)
# The features are selected by dropping the target by name rather than by
# slicing the first seven columns, so the split does not depend on how many
# columns the input has or on the target being the last one.
X_train = All_df_preprocessing.drop(columns=['내점당구매액'])
y_train = All_df_preprocessing[['내점당구매액']]

X_test = All_df_test_preprocessing.drop(columns=['내점당구매액'])
# The test target can be derived the same way as the train target; it is only
# needed when scoring a model on the test set.
y_test = All_df_test_preprocessing[['내점당구매액']]

## 2.5. MinMax scaling of the numeric features
from sklearn.preprocessing import MinMaxScaler

# One shared column list so train and test can never drift apart.
minmax_columns = ['환불금액', '내점당구매건수', '주말방문비율', '구매주기']

##### Train
X_train_minmax = X_train[minmax_columns]

# The scaler is fitted on the training data only.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_minmax)
# fit_transform returns a bare array; rebuild the frame with the ORIGINAL row
# index so that a later column-wise pd.concat aligns rows by label even when
# the input index is not the default 0..n-1 range.
X_train_scaled = pd.DataFrame(data=X_train_scaled,
                              columns=X_train_minmax.columns,
                              index=X_train_minmax.index)

##### Test
X_test_minmax = X_test[minmax_columns]

# Reuse the train-fitted scaler (transform only, no re-fit).
X_test_scaled = scaler.transform(X_test_minmax)
X_test_scaled = pd.DataFrame(data=X_test_scaled,
                             columns=X_test_minmax.columns,
                             index=X_test_minmax.index)


## 2.6. One-hot encoding of the categorical features
##### Train
X_train_onehot = X_train[['주구매상품', '주구매지점']]
X_train_onehot_df = pd.get_dummies(X_train_onehot)

##### Test
X_test_onehot = X_test[['주구매상품', '주구매지점']]
X_test_onehot_df = pd.get_dummies(X_test_onehot)

### Align the test dummies with the train dummies.
# A category that occurs only in the train data produces a dummy column that
# the test frame lacks (per the original note this was '주구매상품_소형가전').
# Instead of hard-coding that name, every such column is added as all zeros.
missing_in_test = [col for col in X_train_onehot_df.columns
                   if col not in X_test_onehot_df.columns]
for col in missing_in_test:
    X_test_onehot_df[col] = 0

# Select the train columns in the train ORDER. Appending the missing column at
# the end would leave the test features in a different order than the model
# was fitted on; this also drops any dummy that exists only in the test data.
X_test_onehot_df = X_test_onehot_df[X_train_onehot_df.columns]

## Combine the scaled numeric features (2.5) with the one-hot features (2.6)
# The pieces are placed side by side, i.e. concatenated along the column axis.
train_parts = [X_train_scaled, X_train_onehot_df]
X_train_final = pd.concat(train_parts, axis='columns')

test_parts = [X_test_scaled, X_test_onehot_df]
X_test_final = pd.concat(test_parts, axis='columns')

			

# 3. Model building and evaluation
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import classification_report, f1_score, accuracy_score
from xgboost import XGBRegressor

# Estimators expect a 1-D target; y_train / y_test are single-column frames,
# so flatten them once instead of passing (n, 1) column vectors.
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

# Candidate regressors, evaluated in this order with identical steps.
# RandomForest is seeded so that repeated runs print the same scores.
candidates = [
    ("RF", RandomForestRegressor(random_state=0)),
    ("KNN", KNeighborsRegressor()),
    ("XGB", XGBRegressor()),
]

for name, model in candidates:
    # 3-fold cross-validation on the training data. cross_val_score works on
    # clones, so `model` itself is still unfitted afterwards.
    scores = cross_val_score(model, X_train_final, y_train_1d, cv=3)
    print(scores)
    print("%s model's score: %f (CROSS_VALIDATION)" % (name, scores.mean()))

    # Fit on the full training set and report the estimator's default score
    # (R^2 for regressors) on train and test.
    model.fit(X_train_final, y_train_1d)
    score = model.score(X_train_final, y_train_1d)
    print("%s model's Train set score: %f" % (name, score))
    ### test score
    test_score = model.score(X_test_final, y_test_1d)
    print("%s model's Test set score: %f\n" % (name, test_score))


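# Answer submission reference
# Adapt the predicted variable and the examinee number in the code below before use.
# NOTE: X_test is reassigned during preprocessing and no longer has a cust_id column;
#       take the ids from All_df_test['cust_id'] instead of X_test.cust_id.
# pd.DataFrame({'cust_id': X_test.cust_id, 'gender': pred}).to_csv('003000000.csv', index=False)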
