In [2]:
import pkg_resources 
import pandas as pd
OutputDataSet = pd.DataFrame(sorted([(i.key, i.version) for i in pkg_resources.working_set])) 
print(OutputDataSet)

# 전체 칼럼 출력하기
pd.options.display.max_columns = None

                      0       1
0             alabaster  0.7.12
1       anaconda-client   1.7.2
2    anaconda-navigator   2.0.3
3      anaconda-project   0.9.1
4                 anyio   2.2.0
..                  ...     ...
250                yapf  0.31.0
251                zict   2.0.0
252                zipp   3.4.1
253          zope.event   4.5.0
254      zope.interface   5.3.0

[255 rows x 2 columns]


## 1.  고객 3500명에 대한 학습용 데이터를 이용하여 성별 예측 모형을 만든 후, 평가용 데이터에 적용하여 얻은 예측값(남자일 확률)을 csv 파일로 생성. 데이터 전처리, Feature Engineering, 분류 알고리즘 사용, 하이퍼 파라미터 최적화, 모형 앙상블 등이 수반


In [None]:
## 데이터 불러오기
x_train = pd.read_csv('./bigData/x_train.csv', encoding='CP949')
y_train = pd.read_csv('./bigData/y_train.csv', encoding='CP949')
x_test = pd.read_csv('./bigData/x_test.csv', encoding='CP949')

## 데이터 탐색
x_train.shape, y_train.shape, x_test.shape
x_train.head()
x_train.info()
x_train.describe().T
x_train.isnull().sum()
x_test.isnull().sum()

## 데이터 전처리
### 불필요한 칼럼 삭제
x_test_cust_id = x_test['cust_id']
x_train = x_train.drop(columns = ['cust_id'])
y_train = y_train.drop(columns = ['cust_id'])
x_test = x_test.drop(columns = ['cust_id'])

### 결측값 처리
x_train['환불금액'] = x_train['환불금액'].fillna(0)
x_test['환불금액'] = x_test['환불금액'].fillna(0)

### 범주형 칼럼 Encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
x_train['주구매상품'] = encoder.fit_transform(x_train['주구매상품'])
x_test['주구매상품'] = encoder.fit_transform(x_test['주구매상품'])
encoder.classes_
x_train['주구매지점'] = encoder.fit_transform(x_train['주구매지점'])
x_test['주구매지점'] = encoder.fit_transform(x_test['주구매지점'])
encoder.classes_

### 파생변수 만들기
x_train.loc[x_train['환불금액'] > 0, '환불금액_NEW'] = 1
x_train.loc[x_train['환불금액'] == 0, '환불금액_NEW'] = 0
x_test.loc[x_test['환불금액'] > 0, '환불금액_NEW'] = 1
x_test.loc[x_test['환불금액'] == 0, '환불금액_NEW'] = 0

x_train = x_train.drop(columns = ['환불금액'])
x_test = x_test.drop(columns = ['환불금액'])

### 표준화
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns = x_train.columns)
x_test = pd.DataFrame(scaler.fit_transform(x_test), columns = x_test.columns)

### 상관관계확인
x_train[['총구매액', '최대구매액', '환불금액_NEW']].corr()
x_train = x_train.drop(columns = ['최대구매액'])
x_test = x_test.drop(columns = ['최대구매액'])

## 데이터 학습 및 평가
### 데이터 학습
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
y_test_pred = model.predict(x_test)
pd.DataFrame(y_test_pred)

### 결과 예측
y_test_proba = model.predict_proba(x_test)
y_test_proba_man = pd.DataFrame(y_test_proba)[1]

## 모델 평가
y_train_pred = model.predict(x_train)
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train, y_train_pred)

## 결과 저장
result = pd.concat([x_test_cust_id, y_test_proba_man], axis=1)
result.columns = ['cust_id', 'gender']
result.to_csv('./bigData/result1.csv', index=False)
result_test = pd.read_csv('./bigData/result1.csv')
print(result_test)

## 2. 고객 891명에 대한 학습용 데이터를 이용하여 생존 여부를 예측하는 모형을 만든다. 이후 예측 모형을 평가용 데이터에 적용하여 418명 승객의 생존 여부 예측값을 CSV 파일로 생성

In [70]:
### 데이터 불러오기
x_train = pd.read_csv('./bigData/titanic_x_train.csv', encoding='CP949')
x_test = pd.read_csv('./bigData/titanic_x_test.csv', encoding='CP949')
y_train = pd.read_csv('./bigData/titanic_y_train.csv', encoding='CP949')

### 데이터 탐색 
y_train.columns = ['PassengerId', 'Survived']
# print(x_train.head(), x_test.head(), y_train.head())
# print(x_train.info(), x_test.info(), y_train.info())
pd.concat([x_train, y_train['Survived']], axis=1).corr()['Survived']

### 데이터 정제
x_train = x_train.drop(columns = ['PassengerId', '나이', '승객이름', '티켓번호', '객실번호'])
x_test_passengerId = x_test['PassengerId']
x_test = x_test.drop(columns = ['PassengerId', '나이', '승객이름', '티켓번호', '객실번호'])
y_train = y_train.drop(columns = ['PassengerId'])

# print(x_train['선착장'].mode())

x_train['선착장'] = x_train['선착장'].fillna('S')
x_test['운임요금'] = x_test['운임요금'].fillna(x_test['운임요금'].mean())

# print(x_train.isnull().sum())
# print(x_test.isnull().sum())

x_train['성별'] = x_train['성별'].replace('male', 0).replace('female', 0)
x_test['성별'] = x_test['성별'].replace('male', 0).replace('female', 0)

x_train_선착장 = pd.get_dummies(x_train['선착장'], drop_first = True).rename(columns = {'Q': '선착장Q', 'S': '선착장S'})
x_train = pd.concat([x_train, x_train_선착장], axis=1)
x_train = x_train.drop(columns = ['선착장'])
x_test_선착장 = pd.get_dummies(x_test['선착장'], drop_first = True).rename(columns = {'Q': '선착장Q', 'S': '선착장S'})
x_test = pd.concat([x_test, x_test_선착장], axis=1)
x_test = x_test.drop(columns = ['선착장'])

x_train['가족수'] = x_train['형제자매배우자수']+x_train['부모자식수']
x_test['가족수'] = x_test['형제자매배우자수']+x_test['부모자식수']
x_train = x_train.drop(columns = ['형제자매배우자수', '부모자식수'])
x_test = x_test.drop(columns = ['형제자매배우자수', '부모자식수'])

#x_train.head()

### 데이터 모델링
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_test_pred = pd.DataFrame(model.predict(x_test))

### 모델 평가
from sklearn.metrics import roc_auc_score
y_train_pred = model.predict(x_train)
roc_auc_score(y_train, y_train_pred)

### 결과 저장
result = pd.concat([x_test_passengerId, y_test_pred], axis=1)
result.columns = ['Passenger', 'Survived']
result.to_csv('./bigData/result2.csv', index = False)
print(pd.read_csv('./bigData/result2.csv'))

     Passenger  Survived
0          892         1
1          893         0
2          894         0
3          895         0
4          896         1
..         ...       ...
413       1305         0
414       1306         1
415       1307         0
416       1308         0
417       1309         1

[418 rows x 2 columns]


  model.fit(x_train, y_train)


In [71]:
import sklearn.preprocessing
print(dir(sklearn.preprocessing))

['Binarizer', 'FunctionTransformer', 'KBinsDiscretizer', 'KernelCenterer', 'LabelBinarizer', 'LabelEncoder', 'MaxAbsScaler', 'MinMaxScaler', 'MultiLabelBinarizer', 'Normalizer', 'OneHotEncoder', 'OrdinalEncoder', 'PolynomialFeatures', 'PowerTransformer', 'QuantileTransformer', 'RobustScaler', 'StandardScaler', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_csr_polynomial_expansion', '_data', '_discretization', '_encoders', '_function_transformer', '_label', 'add_dummy_feature', 'binarize', 'label_binarize', 'maxabs_scale', 'minmax_scale', 'normalize', 'power_transform', 'quantile_transform', 'robust_scale', 'scale']


## 3. 고객 10866건에 대한 학습용 데이터를 이용하여 자전거 대여량 예측 모형을 만든다. 생성한 예측 모형으로 평가용 데이터에 해당하는 6493건의 자전거 대여량 예측값을 CSV 파일로 생성

In [63]:
### 데이터 불러오기
x_train = pd.read_csv('./bigData/bike_x_train.csv', encoding='CP949')
x_test = pd.read_csv('./bigData/bike_x_test.csv', encoding='CP949')
y_train = pd.read_csv('./bigData/bike_y_train.csv', encoding='CP949')
y_train.columns = ['datetime', 'count']

### 데이터 탐색
# print(x_train.head())
# print(x_test.head())
# print(y_train.head())
# print(x_train.info())
# print(x_test.info())
# print(y_train.info())
# print(x_train.isnull().sum())
# print(x_test.isnull().sum())
# print(y_train.isnull().sum())

### 데이터 정제
x_train['year'] = pd.to_datetime(x_train['datetime']).dt.year
x_train['month'] = pd.to_datetime(x_train['datetime']).dt.month
x_train['day'] = pd.to_datetime(x_train['datetime']).dt.day
x_train['hour'] = pd.to_datetime(x_train['datetime']).dt.hour
x_test['year'] = pd.to_datetime(x_test['datetime']).dt.year
x_test['month'] = pd.to_datetime(x_test['datetime']).dt.month
x_test['day'] = pd.to_datetime(x_test['datetime']).dt.day
x_test['hour'] = pd.to_datetime(x_test['datetime']).dt.hour

x_test_datetime = x_test['datetime']
x_train = x_train.drop(columns = ['datetime', 'day'])
x_test = x_test.drop(columns = ['datetime', 'day'])
y_train = y_train.drop(columns = ['datetime'])

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns = x_train.columns)
x_test = pd.DataFrame(scaler.fit_transform(x_test), columns = x_test.columns)

### 데이터 학습
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier()
# model.fit(x_train, y_train)
# y_test_pred = pd.DataFrame(model.predict(x_test))

from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(x_train, y_train)
y_test_pred = pd.DataFrame(model.predict(x_test))

# print(y_test_pred.head())

### 모델 평가 
from sklearn.metrics import roc_auc_score, f1_score, r2_score
y_train_pred = model.predict(x_train)
print(r2_score(y_train, y_train_pred))

### 결과 저장
result = pd.concat([x_test_datetime, y_test_pred], axis=1)
result.columns = ['datetime', 'count']
result.to_csv('./bigData/result3.csv', index = False)
print(pd.read_csv('./bigData/result3.csv'))

0.9999077799610809
              datetime  count
0      2011-01-20 0:00    5.0
1      2011-01-20 1:00    5.0
2      2011-01-20 2:00    3.0
3      2011-01-20 3:00    2.0
4      2011-01-20 4:00    2.0
...                ...    ...
6488  2012-12-31 19:00  262.0
6489  2012-12-31 20:00  214.0
6490  2012-12-31 21:00   98.0
6491  2012-12-31 22:00   98.0
6492  2012-12-31 23:00   54.0

[6493 rows x 2 columns]
