In [4]:
from itertools import product
import tkinter as tk
import tkinter.messagebox as msg
from tkinter import ttk
import xgboost as xgb
import lightgbm as lgb
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
import pandas as pd
import os.path

class DataAnalysis(tk.Tk):
    """Tk window for loading a CSV file and scoring a quick baseline model.

    The window has two panels: one to read a CSV file and generate a data
    report, and one to pick a model family, a target column and whether the
    task is classification, then cross-validate a baseline estimator.

    Attributes:
        data: DataFrame loaded by ``read_data`` (``None`` until then).
        y: target column selected by ``get_target`` (``None`` until then).
    """

    # Class-level defaults so every method can test "nothing loaded yet"
    # without depending on attributes that only exist after a button click.
    data = None
    y = None
    _target_name = None  # name of the column currently used as the target

    def __init__(self):
        """Create all widgets and lay them out on a grid."""
        super().__init__()
        self.content = ttk.Frame(self)
        self.frame1 = ttk.Frame(self.content, borderwidth=5,
                                relief="sunken", width=200, height=100)
        self.frame3 = ttk.Frame(self.content, borderwidth=5,
                                relief="sunken", width=200, height=100)
        # frame2 is created but not placed on the grid in this class.
        self.frame2 = ttk.Frame(self.content, borderwidth=5,
                                relief='sunken', width=200)

        # Panel 1: file name entry plus "read data" / "data report" buttons.
        self.f1label = ttk.Label(self.frame1, text='文件路径:')
        self.f1entry = ttk.Entry(self.frame1)
        self.f1button1 = ttk.Button(self.frame1, text='读取数据',
                                    command=self.read_data)
        self.f1button2 = ttk.Button(self.frame1, text='数据报告',
                                    command=self.profiling)

        # Panel 3: model name, classification flag, target column, run button.
        self.f3label1 = ttk.Label(self.frame3, text='模型类型:')
        self.f3entry1 = ttk.Entry(self.frame3)
        self.onevar = tk.BooleanVar()
        self.f3check = ttk.Checkbutton(self.frame3, text="分类",
                                       variable=self.onevar, onvalue=True)
        self.f3label2 = ttk.Label(self.frame3, text='类别：')
        self.f3entry2 = ttk.Entry(self.frame3)
        self.f3button = ttk.Button(self.frame3, text='基线模型',
                                   command=self.baseline)

        self.content.grid(column=0, row=0)
        self.frame1.grid(column=0, row=0, columnspan=2, rowspan=1)
        self.f1label.grid(column=0, row=0)
        self.f1entry.grid(column=1, row=0)
        self.f1button1.grid(column=2, row=0)
        self.f1button2.grid(column=3, row=0, columnspan=2)

        self.frame3.grid(column=0, row=2, columnspan=2, rowspan=2)
        self.f3label1.grid(column=0, row=3)
        self.f3entry1.grid(column=1, row=3, columnspan=1)
        self.f3label2.grid(column=0, row=4, columnspan=1)
        self.f3entry2.grid(column=1, row=4, columnspan=1)
        self.f3check.grid(column=2, row=3, columnspan=1)
        self.f3button.grid(column=4, row=3, columnspan=1)

    def _has_data(self):
        """Return True when a dataset is loaded; otherwise warn and return False."""
        if self.data is None:
            msg.showwarning(title='提示', message='请先读取数据')
            return False
        return True

    def read_data(self):
        """Read ``<parent dir>/data/<file name from the entry>`` into ``self.data``.

        The parent directory is taken relative to this source file. Inside a
        notebook kernel ``__file__`` is not defined, so the current working
        directory stands in for the source file's directory.
        """
        filename = self.f1entry.get()
        try:
            source_dir = os.path.dirname(__file__)
        except NameError:
            source_dir = os.getcwd()
        path = os.path.dirname(source_dir)
        filepath = os.path.join(path, 'data/{}'.format(filename))
        self.data = pd.read_csv(filepath)
        # A new dataset invalidates any previously selected target.
        self.y = None
        self._target_name = None

    def profiling(self):
        """Write an HTML data report named after the file entered in the form."""
        if not self._has_data():
            return
        name = self.f1entry.get().split('.')[0]
        # NOTE(review): profile_report is not a built-in DataFrame method; it is
        # presumably registered by a profiling package that must be imported
        # before this runs -- this file does not import one. TODO confirm.
        profile = self.data.profile_report(title='数据报告')
        profile.to_file(output_file="{}.html".format(name))

    def baseline(self):
        """Button callback: cross-validate the chosen baseline and show the score."""
        if not self._has_data():
            return
        # An empty model entry falls back to the same default as _baseline().
        clf = self.f3entry1.get().strip().lower() or 'xgboost'
        method = self.onevar.get()
        try:
            self.get_target()
            scoring, result = self._baseline(clf=clf, method=method, cv=5)
        except KeyError as err:
            # Unknown target column or model name: tell the user instead of
            # only printing a traceback from the Tk callback.
            detail = err.args[0] if err.args else err
            msg.showerror(title='错误', message=str(detail))
            return
        msg.showinfo(title='结果',
                     message='{}: {:.4f}'.format(' '.join(scoring.split('_')), result))

    def get_target(self):
        """Select the target column named in the form and store it in ``self.y``.

        The column is left in ``self.data`` (``split_cat_num`` excludes it), so
        calling this repeatedly -- e.g. pressing the baseline button twice --
        keeps working.

        Raises:
            AttributeError: no data has been loaded yet.
            KeyError: the named column does not exist in the data.
        """
        target = self.f3entry2.get()
        if self.data is None:
            raise AttributeError('no data loaded; call read_data() first')
        if target not in self.data.columns:
            raise KeyError('target column {!r} not found in data'.format(target))
        self.y = self.data[target]
        self._target_name = target

    def split_cat_num(self):
        """Return ``(cat, num)``: object-dtype and non-object feature columns.

        The currently selected target column, if any, is excluded from both.
        """
        features = self.data
        if self._target_name is not None and self._target_name in features.columns:
            features = features.drop(columns=[self._target_name])
        cat = features.select_dtypes(include='object')
        num = features.select_dtypes(exclude='object')
        return cat, num

    @staticmethod
    def _scoring_name(method):
        """Return the cross-validation scorer name for classification/regression."""
        return 'roc_auc' if method else 'neg_mean_squared_error'

    def _baseline(self, clf='xgboost', method=True, cv=5):
        """Cross-validate a baseline estimator on the loaded data.

        Args:
            clf: model family: 'xgboost', 'lightgbm' or 'randomforest'.
            method: truthy for classification, falsy for regression.
            cv: value passed to ``cross_val_score`` as ``cv``.

        Returns:
            ``(scoring, mean_score)`` where ``scoring`` is the scorer name used.

        Raises:
            KeyError: ``clf`` is not one of the supported model families.
        """
        params = {'max_depth': 5, 'max_leaf_nodes': 30,
                  'min_samples_leaf': 10, 'min_samples_split': 30,
                  'n_estimators': 100}
        # Factories rather than instances: only the requested estimator is built.
        factories = {
            ('xgboost', True): xgb.XGBClassifier,
            ('xgboost', False): xgb.XGBRegressor,
            ('lightgbm', True): lgb.LGBMClassifier,
            ('lightgbm', False): lgb.LGBMRegressor,
            ('randomforest', True):
                lambda: ensemble.RandomForestClassifier(**params),
            ('randomforest', False):
                lambda: ensemble.RandomForestRegressor(**params),
        }
        key = (clf, bool(method))
        if key not in factories:
            supported = sorted({name for name, _ in factories})
            raise KeyError('unknown model {!r}; expected one of {}'.format(
                clf, ', '.join(supported)))
        est = factories[key]()
        scoring = self._scoring_name(method)

        cat, num = self.split_cat_num()
        if not num.empty:
            # Fill missing numeric values with each column's median.
            num = num.fillna(num.median())
        df = pd.concat([num, cat], axis=1)
        X = pd.get_dummies(df, dummy_na=True)
        res = cross_val_score(est, X.values, self.y.values,
                              scoring=scoring, cv=cv)
        return scoring, res.mean()
    
    
# Create the window and enter the Tk event loop; mainloop() does not return
# until the window is closed, so this cell blocks while the GUI is open.
dataAna = DataAnalysis()
dataAna.mainloop()

In [44]:
class DataAnalysis(tk.Tk):
    """Layout prototype of the data-analysis window with a preprocessing panel.

    NOTE: this redefines the ``DataAnalysis`` class from the earlier cell in the
    same namespace. Here every button is created without a command (layout
    only), and a third panel (``frame2``) adds preprocessing options:
    single-value thresholds, missing-value handling, outlier thresholds,
    feature selection, and save/build buttons.
    """

    def __init__(self):
        """Create all widgets and lay them out on a grid."""
        super().__init__()
        self.content = ttk.Frame(self)
        self.frame1 = ttk.Frame(self.content, borderwidth=5,
                                relief="sunken", width=200, height=100)
        self.frame3 = ttk.Frame(self.content, borderwidth=5,
                                relief="sunken", width=200, height=100)
        self.frame2 = ttk.Frame(self.content, borderwidth=5,
                                relief='sunken', width=200)

        # Panel 1: file name entry plus two buttons (no command wired yet).
        self.f1label = ttk.Label(self.frame1, text='文件路径:')
        self.f1entry = ttk.Entry(self.frame1)
        self.f1button1 = ttk.Button(self.frame1, text='读取数据',
                                    command=None)
        self.f1button2 = ttk.Button(self.frame1, text='数据报告',
                                    command=None)

        # Panel 3: model name, classification flag, target column, run button.
        self.f3label1 = ttk.Label(self.frame3, text='模型类型:')
        self.f3entry1 = ttk.Entry(self.frame3)
        self.onevar = tk.BooleanVar()
        self.f3check = ttk.Checkbutton(self.frame3, text="分类",
                                       variable=self.onevar, onvalue=True)
        self.f3label2 = ttk.Label(self.frame3, text='类别：')
        self.f3entry2 = ttk.Entry(self.frame3)
        self.f3button = ttk.Button(self.frame3, text='基线模型',
                                   command=None)

        # Single-value handling
        self.f2lb1 = ttk.Label(self.frame2, text='单一值阈值')
        self.f2entry1 = ttk.Entry(self.frame2)
        self.f2lb2 = ttk.Label(self.frame2, text='删除单一值的阈值')
        self.f2entry2 = ttk.Entry(self.frame2)

        # Missing-value handling.
        # exportselection=False on every listbox: with the default, picking an
        # item in one listbox clears the selection in the others, so the user
        # could never hold a choice in each list at the same time.
        self.f2lb3 = ttk.Label(self.frame2, text='缺失值处理方式')
        self.f2lbx1Variate = tk.StringVar(
            value=['删除', '指示变量', 'None', '单独一类'])
        self.f2lbx1 = tk.Listbox(self.frame2,
                                 listvariable=self.f2lbx1Variate, height=3,
                                 exportselection=False)
        self.f2lb7 = ttk.Label(self.frame2, text='填充方式')
        self.f2lbx2Variate = tk.StringVar(
            value=['mean', 'median', 'mode', '模型填充'])
        self.f2lbx2 = tk.Listbox(self.frame2,
                                 listvariable=self.f2lbx2Variate, height=3,
                                 exportselection=False)

        # Outlier handling
        self.f2lb4 = ttk.Label(self.frame2, text='异常值处理的最小阈值')
        self.f2entry3 = ttk.Entry(self.frame2)
        self.f2lb5 = ttk.Label(self.frame2, text='异常值处理的最大阈值')
        self.f2entry4 = ttk.Entry(self.frame2)

        # Feature selection (no options are supplied for this list yet)
        self.f2lb6 = ttk.Label(self.frame2, text='特征选择方法')
        self.f2lbx3 = tk.Listbox(self.frame2,
                                 listvariable=None, height=3,
                                 exportselection=False)

        # Save data
        self.f2btm1 = ttk.Button(self.frame2, text='保存数据')
        # Build model
        self.f2btm2 = ttk.Button(self.frame2, text='建立模型')

        self.content.grid(column=0, row=0)
        self.frame1.grid(column=0, row=0)
        self.f1label.grid(column=0, row=0, padx=1, pady=1)
        self.f1entry.grid(column=1, row=0)
        self.f1button1.grid(column=2, row=0)
        self.f1button2.grid(column=3, row=0, columnspan=2)

        self.frame3.grid(column=0, row=2)
        self.f3label1.grid(column=0, row=3, padx=1, pady=1)
        self.f3entry1.grid(column=1, row=3, columnspan=1)
        self.f3label2.grid(column=0, row=4, columnspan=1)
        self.f3entry2.grid(column=1, row=4, columnspan=1)
        self.f3check.grid(column=2, row=3, columnspan=1)
        self.f3button.grid(column=4, row=3, columnspan=1)

        self.frame2.grid(column=0, row=5)
        self.f2lb1.grid(column=0, row=5)
        self.f2entry1.grid(column=0, row=6, columnspan=1)
        self.f2lb2.grid(column=0, row=7, padx=1, pady=1)
        self.f2entry2.grid(column=0, row=8, columnspan=1)
        self.f2lb3.grid(column=1, row=5, padx=1, pady=1)
        self.f2lbx1.grid(column=1, row=6, columnspan=1, rowspan=3)
        self.f2lb7.grid(column=2, row=5,)
        self.f2lbx2.grid(column=2, row=6, columnspan=1, rowspan=3)
        self.f2lb4.grid(column=3, row=5, padx=1, pady=1)
        self.f2entry3.grid(column=3, row=6, columnspan=1)
        self.f2lb5.grid(column=3, row=7, padx=1, pady=1)
        self.f2entry4.grid(column=3, row=8, columnspan=1)
        self.f2lb6.grid(column=4, row=5, columnspan=1, rowspan=1)
        self.f2lbx3.grid(column=4, row=6, columnspan=1, rowspan=3)
        self.f2btm1.grid(column=5, row=6, columnspan=1, rowspan=2)
        self.f2btm2.grid(column=6, row=6, columnspan=1, rowspan=2)
# Create the prototype window and enter the Tk event loop; mainloop() does not
# return until the window is closed, so this cell blocks while the GUI is open.
data = DataAnalysis()
data.mainloop()