In [1]:
import argparse
import pandas as pd

from collections import namedtuple
from itertools import product

from ml_class import TestSplit, Preprocessor#, RobustScaler, StandardScaler

from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score


In [4]:
def main():
    scalers = [RobustScaler, StandardScaler]
    models = [LogisticRegression, DecisionTreeClassifier]
    dataset_type = ['train', 'validation', 'test']

    # scalers_models_combination = list(product(scalers, models))
    index_combination = list(product(scalers, models, dataset_type))

    a = namedtuple('Index', 'scaler model data')
    index_combination_a = [a(index[0].__name__, index[1].__name__, index[2]) for index in index_combination]
    
    # print(scalers_models_combination)
    
    # comb_n = []
    # for index in scalers_models_combination:
        # scaler = index[0]
        # model = index[1]

        # combination_name = f"{scaler.__name__}-{model.__name__}"
        # comb_n.append(combination_name)
    
    # a = list(product(comb_n, dataset_type))

    index = pd.MultiIndex.from_tuples(index_combination_a, names=["scaler", "model", "data"])

    report_df = pd.DataFrame(index=index, columns=["Precision", "Recall", "Accuracy", "F1"])

    prediction_df = pd.DataFrame()


    for index in index_combination_a:
        # scaler = index[0]
        # model = index[1]
        # combination_name = f"{scaler.__name__}-{model.__name__}"

        preprocessor = Preprocessor(eval(index.scaler))
        model = eval(index.model)()#(max_iter=700)

        train_score, validation_score = process_train_data(preprocessor, model)
        test_prediction, test_score = process_test_data(preprocessor, model)
        prediction_df[f"{index.scaler}-{index.model}"] = test_prediction

        dataset_type = {'train': train_score, 'validation': validation_score, 'test': test_score}
        add_report(report_df, index.scaler, index.model, dataset_type)

    prediction_df.to_csv('./result/{}'.format(get_arguments().prediction))
    report_df.to_csv('./result/{}'.format(get_arguments().report))

def process_train_data(preprocessor, model):
    # get dataset
    train_data = get_data('marketing_train.csv')

    # split data
    target = get_arguments().target
    X_train, X_validation, y_train, y_validation = TestSplit().split_test(train_data, target)

    # fit & transform
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_validation_transformed = transform_data(preprocessor, X_validation)

    # fit & predict
    model.fit(X_train_transformed, y_train)
    X_train_prediction = predict_data(model, X_train_transformed)
    X_validatrion_prediction = predict_data(model, X_validation_transformed)

    # score
    train_score = get_scores(y_train, X_train_prediction)
    validation_score = get_scores(y_validation, X_validatrion_prediction)

    return train_score, validation_score

def process_test_data(preprocessor, model):
    """
    return: prediction, score
    """

    # get dataset
    arguments = get_arguments()
    test_data = get_data(arguments.input)
    target = arguments.target

    # transform
    test_transformed = transform_data(preprocessor, test_data.drop(target, axis=1))

    # predict
    test_prediction = predict_data(model, test_transformed)
        
    # score
    test_score = get_scores(test_data[target], test_prediction)

    return test_prediction, test_score

def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', default='marketing_test.csv', help='Write file name you want to predict')
    parser.add_argument('--prediction', default='pred.csv', help='Write file name you want to save prediction')
    parser.add_argument('--report', default='report.csv', help='Write file name you want to save report')
    parser.add_argument('--target', default='insurance_subscribe', help='Write target feature')

    return parser.parse_args()

def get_data(file_name):
    return pd.read_csv('./data/{}'.format(file_name))

def transform_data(preprocessor, dataset_without_target):
    return preprocessor.transform(dataset_without_target)

def predict_data(model, transformed_data):
    return model.predict(transformed_data)

def get_scores(actual_y, predicted_y):
    funcs = [precision_score, recall_score, accuracy_score, f1_score]
    scores = []

    for func in funcs:
        scores.append(func(actual_y, predicted_y))

    print(scores)
    return scores

def add_report(report, scaler, model, dataset_type):
    for data, score in dataset_type.items():
        report.loc[(scaler, model, data)] = score
    
    return report

if __name__ == "__main__":
    main()

# python ml_data.py --input “marketing_test.csv” --prediction “pred.csv” --report “report.csv”

usage: ipykernel_launcher.py [-h] [--input INPUT] [--prediction PREDICTION]
                             [--report REPORT] [--target TARGET]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/soo/.local/share/jupyter/runtime/kernel-23852365-e2c5-4f6a-a3e8-52bee98fc824.json


SystemExit: 2

In [3]:
import argparse
import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler #as SklearnRobustScaler
from sklearn.preprocessing import StandardScaler as SklearnStandardScaler
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
from sklearn.impute import SimpleImputer as SklearnSimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# class StandardScaler(SklearnStandardScaler):
#     def transform(self, X):
#         return pd.DataFrame(super().transform(X), columns=X.columns, index=X.index)

# class RobustScaler(SklearnRobustScaler):
#     def transform(self, X):
#         return pd.DataFrame(super().transform(X), columns=X.columns, index=X.index)

# class SimpleImputer(SklearnSimpleImputer):
#     def transform(self, X):
#         return pd.DataFrame(super().transform(X), columns=X.columns, index=X.index)

# class OneHotEncoder(SklearnOneHotEncoder):
#     def transform(self, X):
#         sparse_matrix = super(OneHotEncoder, self).transform(X)
#         new_columns = self.get_column_names(X=X)
#         return pd.DataFrame(sparse_matrix.toarray(), columns=new_columns, index=X.index)

#     def get_column_names(self, X):
#         new_columns = []
#         for i, column in enumerate(X.columns):
#             j = 0
#             while j < len(self.categories_[i]):
#                 new_columns.append(f'{column}_{self.categories_[i][j]}')
#                 j += 1
#         return new_columns
        
class TestSplit:
    def split_test(self, dataset, target):
        self.features_without_target = dataset.drop(target, axis=1)
        self.target_feature = dataset[target]

        X_train, X_test, y_train, y_test = train_test_split(self.features_without_target, self.target_feature) #기본비율
        return X_train, X_test, y_train, y_test
        
class Preprocessor(TransformerMixin):
    def __init__(self, scaler):
        self.scaler = scaler

    def transform_numeric_features(self):
        return Pipeline(steps=[
            ('imputer', SklearnSimpleImputer(strategy='median')),
            ('scaler', self.scaler())
            ])

    @staticmethod
    def transform_categorical_features():
        return Pipeline(steps=[
            ('imputer', SklearnSimpleImputer(strategy='most_frequent')),
            ('onehot', SklearnOneHotEncoder())
            ])

    def transform_columns(self):
        return ColumnTransformer(transformers = [
            ('nums', self.transform_numeric_features(), self.numeric_features.columns),
            ('cats', self.transform_categorical_features(), self.categorical_features.columns)
            ])

    def get_column_names(self):
        # nums_names = self.column_transformer.named_transformers_['nums']['scaler'].get_feature_names(self.categorical_features.columns) #.fit_transform(self.numeric_features).columns
        cats_names = self.column_transformer.named_transformers_['cats']['onehot'].get_feature_names(self.categorical_features.columns) #.fit_transform(self.categorical_features).columns
        feat_names = np.concatenate([self.numeric_features.columns, cats_names])
        return feat_names

    def fit(self, X, y=None):
        self.dataset = X
        self.numeric_features = X.select_dtypes(np.number)
        self.categorical_features = X.select_dtypes(exclude=np.number)
        self.column_transformer = self.transform_columns()
        self.column_transformer.fit(X)
        return self
    
    def transform(self, X):
        transformed_data = self.column_transformer.transform(X)
        column_names = self.get_column_names()
        return pd.DataFrame(transformed_data, columns=column_names, index=X.index)


# train_data = pd.read_csv('./data/marketing_train.csv')
# test_data = pd.read_csv('./data/marketing_test.csv')

# target = "insurance_subscribe"
# X_train, X_val, y_train, y_val = TestSplit().split_test(train_data, target)

# preprocessor = Preprocessor()
# X_train_transformed = preprocessor.fit_transform(X_train)
# X_test_transformed = preprocessor.transform(X_val)
# test_transformed = preprocessor.transform(test_data.drop(target, axis=1))


# model = LogisticRegression(max_iter=600)
# model.fit(X_train_transformed, y_train)

# X_train_prediction = model.predict(X_train_transformed)
# X_validation_prediction = model.predict(X_test_transformed)
# test_prediction = model.predict(test_transformed)

# X_train_prediction = predict(X_train_transformed)
# X_validation_prediction = predict(X_test_transformed)
# test_prediction = predict(test_transformed)

# def predict(transformed_data):
#     return model.predict(transformed_data)

# def make_pred():
#     prediction_df = pd.DataFrame(test_prediction, columns=["Predicted value"]).rename_axis("ID")
#     prediction_df.to_csv("pred.csv")


def get_scores(actual_y, predicted_y):
    funcs = [precision_score, recall_score, accuracy_score, f1_score]
    # columns = []
    scores = []

    for func in funcs:
        # columns.append(func.__name__)
        scores.append(func(actual_y, predicted_y))

    # report_df = pd.DataFrame(columns=columns)
    # report_df.loc['Robust-Logistic'] = scores

    return scores


def make_report():
    report_df = pd.DataFrame(index=["Logistic"], columns=["Precision", "Recall", "Accuracy", "F1"])

    compare_y = [(y_train, X_train_prediction), (y_val, X_validation_prediction), (test_data[target], test_prediction)]
    for y_data in compare_y:
        actual_y, predicted_y = y_data[0], y_data[1]
        # report_df.loc[('Logistic', 'train')] = get_scores(actual_y, predicted_y)
        
    print(report_df)
    return report_df

# make_report()

# print(get_scores(y_train, X_train_prediction))
# print(get_scores(y_test, X_test_prediction))
# print(get_scores(test_data[target], test_prediction))

# get_scores.to_csv("report.csv")


# def main():
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--input', required=True)
#     parser.add_argument('--prediction', default='pred.csv', help='Write file name you want to save prediction')
#     parser.add_argument('--report', default='report.csv', help='Write file name you want to save report')

#     train, test = read_data()
#     transformed_data = preprocess()
#     model  = model_train()

# def read_data(path='marketing_train.csv'):
#     return pd.read_csv(path)

# def test():
#     read_data(input_path)

# 1. train 하기
# 2. input으로 들어온 데이터 읽기
# 3. 데이터 transform, prediction, score  하기
# 4. 저장하기 

# if __name__ == "__main__":
#     main()


# k-fold, model seletor

In [1]:
import pathlib

In [9]:
path = pathlib.Path('~/solid{}').with_suffix('.abc')

In [16]:
path

PosixPath('~/solid{}.abc')

In [18]:
joblib_dir_path = './joblib'

In [23]:
str(pathlib.Path(joblib_dir_path, '{}').with_suffix('.joblib')).format('preprocessor')

'joblib/preprocessor.joblib'

In [17]:
pathlib.Path('abc', 'def')

PosixPath('abc/def')

In [15]:
str(path).format('_preprocessor')

'~/solid_preprocessor.abc'

In [5]:
import os
os.path.join('~/solid'+'.abc')

'~/solid.abc'

In [31]:
a = os.path.join(joblib_dir_path, '{}' + '.joblib')

In [32]:
a.format('a')

'./joblib/a.joblib'

In [33]:
b = os.path.join(joblib_dir_path, '{}').endswith('.joblib')

In [35]:
b

False

In [36]:
pd

NameError: name 'pd' is not defined

In [37]:
import pandas as pd

In [38]:
pd

<module 'pandas' from '/home/soo/.pyenv/versions/3.8.3/envs/aaa-3.8.3/lib/python3.8/site-packages/pandas/__init__.py'>

In [39]:
a = [[1,2,3], [4,5,6], [7,8,9]]

In [40]:
a

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [41]:
pd.concat(a)

TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

In [42]:
b = pd.DataFrame()

In [43]:
pd.concat(a, b)

TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

In [53]:
f = open('requirements.txt', 'r')

In [54]:
print(f.read())

pandas
scikit-learn
numpy



In [67]:
f.close()

In [50]:
with open('requirements.txt', 'r') as f:
    print(f.read())

pandas
scikit-learn
numpy



In [68]:
f.read()

ValueError: I/O operation on closed file.

In [106]:
c = pd.DataFrame(a, columns=['A', 'B', 'C'])

In [107]:
c

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


In [73]:
b

In [104]:
a

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [95]:
pd.concat([b, d])

Unnamed: 0,0
0,"[1, 2, 3]"
1,"[4, 5, 6]"
2,"[7, 8, 9]"


In [98]:
d = pd.Series(a)

In [99]:
d

0    [1, 2, 3]
1    [4, 5, 6]
2    [7, 8, 9]
dtype: object

In [87]:
list_1 = [1,2,3]
list_2 = [4,5,6]
list_3 = [7,8,9]

In [89]:
pd.DataFrame([list_1, list_2, list_3])

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [103]:
e = pd.Series(a[0], a[1], a[2])

TypeError: Field elements must be 2- or 3-tuples, got '7'

e

In [102]:
e

0    [1, 2, 3]
1    [4, 5, 6]
2    [7, 8, 9]
dtype: object

In [109]:
score_list = [1]

In [110]:
score_list.append(2)

In [111]:
score_list

[1, 2]

In [120]:
score_list.extend([3, 4])

In [121]:
score_list

[1, 2, [3, 4], 3, 4, 3, 4]

In [6]:
from sklearn.preprocessing import StandardScaler
from utils import Preprocessor

In [14]:
p = Preprocessor(StandardScaler)

In [15]:
p

<utils.Preprocessor at 0x7fc5435f24f0>

In [16]:
p._build_pipeline()

ColumnTransformer(transformers=[('numeric',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 None),
                                ('categorical',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehotencoder',
                                                  OneHotEncoder())]),
                                 None)])

In [26]:
p.get_column_names()

AttributeError: 'Preprocessor' object has no attribute 'column_transformer'

In [None]:
imp

In [1]:
import tempfile
tmp = tempfile.NamedTemporaryFile(suffix=".csv")
# tmp.close()

In [2]:
tmp

<tempfile._TemporaryFileWrapper at 0x7f9f248a2490>

In [9]:
f = open(tmp.name, 'r')

In [10]:
f.read()

'abc'

In [11]:
f.close()

In [6]:
f = open(tmp.name, 'w')

In [7]:
f.write('abc')

3

In [8]:
f.close()

In [None]:
with open(tmp.name, 'r'):
    f.read()

In [6]:
import shutil

In [7]:
import os

In [14]:
os.mkdir(tmp.name)

In [12]:
tmp.close()

In [13]:
os.mkditmp.name

'/tmp/tmp5s7njbve.csv'