In [10]:
import pandas as pd
import numpy as np

import time
from dateutil.relativedelta import relativedelta
import datetime
import calendar
from dateutil import rrule


class PandasAnalysis(object):
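    # Class docstring (exposed as PandasAnalysis.__doc__)
    """
    Author: 1979令狐冲
    E-mail: klmtldh@163.com

    Helper class for pandas data analysis.
    Class name: PandasAnalysis
    Methods:
        __init__(self, df, df_sort_list=None)
        cleaning(self, df=None, blank=None) -> pd.DataFrame
        sort_list(self, obj, list_sort, column=None) -> pd.DataFrame or pd.Series
        split_record(self, sp_cloc: str, sign: str, df=None) -> pd.DataFrame
        plus_record(self, dfp: pd.DataFrame, pr_cloc: str, sign: str) -> pd.DataFrame
        get_n_day(self, date_time, n=1, m=0) -> tuple of two datetimes
        get_current_week(self, date_time, n=1, w=0) -> tuple of two datetimes
        split_period(self, df, start, end, interval) -> pd.DataFrame
        plus_period(self, dfp: pd.DataFrame, s_date, e_date, pr_cloc: str, sign: str) -> pd.DataFrame
        pandas_text(self, obj, drop_list=None, index_name=None, unit=..., punctuation=...) -> str or list of str
        null_item(self, df, all=1) -> (pd.Series, str)

    Usage:
        1. Instantiate
        import pandas as pd
        import pandasAnalysis as pa
        df=pd.DataFrame(
                        [
                            ['A','B','C',pd.to_datetime('2020-12-31'),pd.to_datetime('2021-1-8'),'完成情况1'],
                            ['D','E','F',pd.to_datetime('2021-2-26'),pd.to_datetime('2021-3-7'),'完成情况2'],
                            ['C','E','F',pd.to_datetime('2021-1-20'),pd.to_datetime('2022-3-8'),'完成情况3'],
                        ],columns=['编号','责任人','关键任务','开始日期','结束日期','完成情况'])
        df_pa=pa.PandasAnalysis(df)
        2. Read the wrapped data
        The DataFrame passed to the constructor is available as the ``df``
        attribute (``df_pa.df``); the class defines no ``get_df`` method.

    """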

    def __init__(self, df, df_sort_list=None):
        self.df = df
        self.df_sort_list = df_sort_list
        self.df_cleaning = None


    # 数据清洗，主要删除重复值，删除缺失值，删除空格
    def cleaning(self, df=None, blank=None):
        if df is None:
            df=self.df
        else:
            pass
        # 删除重复值
        df.drop_duplicates(inplace=True)
        # 删除缺失值
        df.dropna(axis=0, how='all', inplace=True)
        # 删除空格
        if blank == 'both':
            df = df.applymap((lambda x: str.strip(x) if isinstance(x, str) else x))
        elif blank == 'all':
            df = df.applymap((lambda x: "".join(x.split()) if isinstance(x, str) else x))
        else:
            pass
        self.df_cleaning=df
        return df

    # 按照指定列及list排序，可以实现Series和DataFrame,也可以实现排序list少数排序列的值。
    def sort_list(self, obj, list_sort, column=None):
        object_=''
        if isinstance(obj, pd.Series):
            df = pd.DataFrame(obj)
            df = df.reset_index()
            df.columns = ['index', 'values']
            df1 = df[df['index'].isin(list_sort)].copy()
            df2 = df[~df['index'].isin(list_sort)].copy()
            df1['index'] = df1['index'].astype('category')
            df1['index'].cat.set_categories(list_sort, inplace=True)
            df1.sort_values('index', ascending=True, inplace=True)
            df = pd.concat([df1, df2])
            object_ = pd.Series(df['values'].values, index=df['index'])
        elif isinstance(obj, pd.DataFrame):
            obj1 = obj[obj[column].isin(list_sort)].copy()
            obj2 = obj[~obj[column].isin(list_sort)].copy()
            obj1[column] = obj1[column].astype('category')
            obj1[column].cat.set_categories(list_sort, inplace=True)
            object_ = obj1.sort_values(column, ascending=True)
            object_ = pd.concat([object_, obj2])
        return object_

    def split_record(self, sp_cloc: str, sign: str, df=None):
        if df is None:
            df = self.df
        else:
            pass
        df[sp_cloc] = df[sp_cloc].str.split(sign)
        df=df.explode(sp_cloc)
        # 重置index
        df = df.reset_index(drop=True)
        return df

    # 将某列以外都相同的行以','进行合并
    def plus_record(self, dfp: pd.DataFrame, pr_cloc: str, sign: str):
        # 获取dfp的列名
        list_cl = list(dfp)
        # 去除需要合并列的列名
        list_cl.remove(pr_cloc)
        # 按照去除合并列名进行排序
        dfp.sort_values(list_cl, inplace=True)
        # 按照排序后行，重新设置index
        dfp = dfp.reset_index(drop=True)
        # 复制dfp，用以处理
        dfpc = dfp.copy()
        # 去除合并列
        dfpc.drop([pr_cloc], axis=1, inplace=True)
        # 进行查重处理
        list_dp = dfpc.duplicated()
        # 查找重复分界点
        x = list_dp[list_dp.isin([False])].index
        # 因为没有找到index插入的方法，将分界点index转为list
        list_x = []
        for q in range(len(x)):
            list_x.append(x[q])
        # 主要用于加入最后一条记录index
        list_x.append(len(dfp))

        # print(list_x)
        # x.append(int64(len(dfp)))
        yn = []
        # 循环获取重复记录段数据
        for i in range(len(list_x) - 1):
            # 判断是否有需要合并项
            if (list_x[i + 1] - list_x[i]) > 1:
                # 若有序号间隔大于1，则进入循环
                for j in range(list_x[i + 1] - list_x[i]):
                    # 取出需要合并数据，形成list
                    yn.append(dfp.loc[list_x[i] + j, pr_cloc])
                # 将list合并成以sign为分隔字符串。
                y = sign.join(yn)
                # 将字符串赋给dfp第一列
                dfp.loc[list_x[i], pr_cloc] = y
                # 删除多余项目
                for k in range(list_x[i + 1] - list_x[i] - 1):
                    dfp.drop(list_x[i] + 1 + k, axis=0, inplace=True)
                # 清空记录list
                yn = []
        # 重置index
        dfp = dfp.reset_index(drop=True)

        return dfp

    def get_n_day(self, date_time, n=1, m=0):
        # this_month_start = datetime.datetime(self.date_time.year, self.date_time.month, 1)
        this_month_nday = datetime.datetime(date_time.year, date_time.month, n)  # +datetime.timedelta(days=n)
        this_month_end = datetime.datetime(date_time.year, date_time.month,
                                           calendar.monthrange(date_time.year, date_time.month)[1])
        # n_month_start=this_month_start +relativedelta(months=m)
        n_month_end = this_month_end + relativedelta(months=m)
        n_month_nday = this_month_nday + relativedelta(months=m)
        return n_month_nday, n_month_end

    def get_current_week(self, date_time, n=1, w=0):
        monday, sunday = date_time, date_time
        one_day = datetime.timedelta(days=1)
        while monday.weekday() != 0:
            monday -= one_day
        while sunday.weekday() != 6:
            sunday += one_day
        # 返回当前的星期一和星期天的日期
        week_n = monday + datetime.timedelta(days=n)
        n_week_end = sunday + relativedelta(weeks=w)
        n_week_nday = week_n + relativedelta(weeks=w)
        return n_week_nday, n_week_end

    def split_period(self, df, start, end, interval):
        # 重置df.index，保证后面编号不覆盖。
        df = df.reset_index(drop=True)
        # 确定有几行
        df_scope = len(df)
        # 遍历所有行
        for i in range(df_scope):
            # 判断时间行是时间类型
            if isinstance(df[start][i], datetime.datetime) and isinstance(df[end][i], datetime.datetime):
                # 将开始时间行赋值给变量d_start;结束时间赋值给d_end。
                d_start = df[start][i]
                d_end = df[end][i]
                # 判断间隔值是m——月；w——月；d——日
                if interval == 'm':
                    delta = rrule.rrule(rrule.MONTHLY, dtstart=d_start, until=d_end).count()
                    # 解决跨月问题只要月份不同就判定为跨月
                    if self.get_n_day(df[start][i], m=delta - 1)[1] < df[end][i]:
                        delta = delta + 1
                    loop_delta = 0
                    if delta > 1:
                        loop_delta = delta
                        for j in range(loop_delta):
                            df_scope = j + df_scope
                            df.loc[df_scope] = df.loc[i]
                            # this_month_start,this_month_end = get_n_day(df[start][df_scope])
                            if j == 0:
                                df.loc[df_scope, end] = self.get_n_day(df[start][df_scope])[1]
                            elif j == loop_delta - 1:
                                df.loc[df_scope, start] = self.get_n_day(df[start][df_scope], m=j)[0]
                            else:
                                df.loc[df_scope, start], df.loc[df_scope, end] = self.get_n_day(df[start][df_scope],
                                                                                                m=j)
                        df.drop(index=[i], inplace=True)
                    df_scope = df_scope + 1
                # 判断间隔值是m——月；w——周；d——日
                if interval == 'w':
                    delta = rrule.rrule(rrule.WEEKLY, dtstart=d_start, until=d_end).count()
                    # 解决跨周问题,关键点是开始日期所在周推delta个周后的周末是否小于end日期
                    if self.get_current_week(df[start][i], w=delta - 1)[1] < df[end][i]:
                        delta = delta + 1
                    else:
                        pass

                    loop_delta = 0
                    if delta > 1:
                        loop_delta = delta
                        for j in range(loop_delta):
                            df_scope = j + df_scope
                            df.loc[df_scope] = df.loc[i]
                            if j == 0:
                                df.loc[df_scope, end] = self.get_current_week(df[start][df_scope])[1]
                            elif j == loop_delta - 1:
                                df.loc[df_scope, start] = self.get_current_week(df[start][df_scope], w=j)[0]


                            else:
                                df.loc[df_scope, start], df.loc[df_scope, end] = self.get_current_week(
                                    df[start][df_scope], w=j)

                        df.drop(index=[i], inplace=True)
                    df_scope = df_scope + 1
                if interval == 'd':
                    delta = rrule.rrule(rrule.DAILY, dtstart=d_start, until=d_end).count()
                    loop_delta = 0
                    if delta > 1:
                        loop_delta = delta
                        for j in range(loop_delta):
                            df_scope = j + df_scope
                            df.loc[df_scope] = df.loc[i]
                            df.loc[df_scope, start] = d_start + relativedelta(days=+j)
                            df.loc[df_scope, end] = d_start + relativedelta(days=+j)
                        df.drop(index=[i], inplace=True)
                    df_scope = df_scope + 1
        df.reset_index(inplace=True, drop=True)
        return df

    # 将某列以外都相同的行,按照时间','进行合并
    def plus_period(self, dfp: pd.DataFrame, s_date, e_date, pr_cloc: str, sign: str):
        # 获取dfp的列名
        list_cl = list(dfp)
        # 去除需要合并列的列名
        list_cl.remove(s_date)
        list_cl.remove(e_date)
        list_cl.remove(pr_cloc)
        # 按照去除合并列名进行排序
        dfp.sort_values(list_cl, inplace=True)
        # 按照排序后行，重新设置index
        dfp = dfp.reset_index(drop=True)
        # 复制dfp，用以处理
        dfpc = dfp.copy()
        # 去除合并列
        dfpc.drop([s_date, e_date, pr_cloc], axis=1, inplace=True)
        # 进行查重处理
        list_dp = dfpc.duplicated()
        # 查找重复分界点
        x = list_dp[list_dp.isin([False])].index
        # 因为没有找到index插入的方法，将分界点index转为list
        list_x = []
        for q in range(len(x)):
            list_x.append(x[q])
        # 主要用于加入最后一条记录index
        list_x.append(len(dfp))

        # print(list_x)
        # x.append(int64(len(dfp)))
        yn = []
        # 循环获取重复记录段数据
        for i in range(len(list_x) - 1):
            # 判断是否有需要合并项
            if (list_x[i + 1] - list_x[i]) > 1:
                # 若有序号间隔大于1，则进入循环

                for j in range(list_x[i + 1] - list_x[i]):
                    # 取出需要合并数据，形成list
                    yn.append(dfp.loc[list_x[i] + j, pr_cloc])
                # 将list合并成以sign为分隔字符串。
                y = sign.join(yn)
                # 排序
                dfp_d = dfp.loc[list_x[i]:list_x[i + 1] - 1, :].copy()
                dfp_d.sort_values(s_date, inplace=True)
                dfp_d.reset_index(drop=True, inplace=True)
                # 将字符串赋给dfp第一列
                dfp.loc[list_x[i], pr_cloc] = y
                dfp.loc[list_x[i], s_date] = dfp_d.loc[0, s_date]
                dfp.loc[list_x[i], e_date] = dfp_d.loc[list_x[i + 1] - list_x[i] - 1, e_date]

                # 删除多余项目
                for k in range(list_x[i + 1] - list_x[i] - 1):
                    dfp.drop(list_x[i] + 1 + k, axis=0, inplace=True)
                # 清空记录list
                yn = []
        # 重置index
        dfp = dfp.reset_index(drop=True)

        return dfp

    # 将pandas数据装换为文本
    def pandas_text(self, obj, drop_list=None, index_name=None, unit='项', punctuation=[',', '。']):
        tx = ''
        if isinstance(obj, pd.Series):
            text = ''
            for i in range(len(obj)):
                if i != len(obj) - 1:
                    text += str(obj.index[i]) + ' ' + str(obj[i]) + unit + punctuation[0]
                else:
                    text += str(obj.index[i]) + ' ' + str(obj[i]) + unit + punctuation[1]
            text = str(obj.name) + '：' + text
            tx = text

        if isinstance(obj, pd.DataFrame):
            text_list = []
            if index_name is None:

                for column in df.iteritems():
                    if column[0] not in drop_list:
                        text = ''
                        for i in range(len(column[1])):
                            if i != len(column[1]) - 1:
                                text += str(column[1].index[i]) + ' ' + str(column[1][i]) + unit + punctuation[0]
                            else:
                                text += str(column[1].index[i]) + ' ' + str(column[1][i]) + unit + punctuation[1]
                            # print('列名'+column[0],'\n',column[1])
                        text = str(column[1].name) + '：' + text
                        text_list.append(text)
            else:
                df.set_index(index_name, drop=True, inplace=True)
                for column in df.iteritems():
                    if column[0] not in drop_list:
                        text = ''
                        for i in range(len(column[1])):
                            if i != len(column[1]) - 1:
                                text += str(column[1].index[i]) + ' ' + str(column[1][i]) + unit + punctuation[0]
                            else:
                                text += str(column[1].index[i]) + ' ' + str(column[1][i]) + unit + punctuation[1]
                            # print('列名'+column[0],'\n',column[1])
                        text = str(column[1].name) + '：' + text
                        text_list.append(text)

            tx = text_list
        return tx

    # 获取DataFrame中null空缺值的个数，返回列表和文字；df为DataFrame，all默认值为1列出全部项，为0时只列出有null值的项。
    def null_item(self,df, all=1):
        null_ = df.isnull().sum()
        null_.sort_values(ascending=False, inplace=True)
        null_.name = '空缺值'
        if all:
            null_text = self.pandas_text(null_, '项')
        else:
            null_ = null_[null_.values != 0].copy()
            null_text = self.pandas_text(null_, '项', punctuation=[',', ',']) + '其余数据完整。'
        return null_, null_text


if __name__ == '__main__':
    # Small demo: clean a sample table, then split the multi-person
    # '责任人' column into one row per person.
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start = time.perf_counter()
    df = pd.DataFrame(
        [
            ['A', '孔令、刘 媛媛 ', 'C', pd.to_datetime('2020-12-31'), pd.to_datetime('2021-1-8'), '完成情况1'],
            ['D', '李 黎、李进 、昆 明', 'F', pd.to_datetime('2021-2-26'), pd.to_datetime('2021-3-7'), '完成情况2'],
            ['C', '王 玺', np.nan, pd.to_datetime('2021-1-20'), pd.to_datetime('2022-3-8'), '完成情况3'],
        ], columns=['编号', '责任人', '关键任务', '开始日期', '结束日期', '完成情况'])

    pa = PandasAnalysis(df)
    # Clean once and reuse the result instead of cleaning the same frame twice.
    cleaned = pa.cleaning(blank='all')
    print(pa.df)
    print(pa.df_cleaning)

    df5 = pa.split_record('责任人', '、', cleaned)
    print(df5)

    # Further usage of the class, for reference:
    #   pa.sort_list(frame, ['D', 'A', 'B', 'C'], '编号')
    #   pa.plus_record(df5, '责任人', '，')
    #   pa.split_period(frame, '开始日期', '结束日期', 'm')
    #   pa.plus_period(frame, '开始日期', '结束日期', '完成情况', '；')

    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)


  编号          责任人 关键任务       开始日期       结束日期   完成情况
0  A     孔令、刘 媛媛     C 2020-12-31 2021-01-08  完成情况1
1  D  李 黎、李进 、昆 明    F 2021-02-26 2021-03-07  完成情况2
2  C          王 玺  NaN 2021-01-20 2022-03-08  完成情况3
  编号       责任人 关键任务       开始日期       结束日期   完成情况
0  A    孔令、刘媛媛    C 2020-12-31 2021-01-08  完成情况1
1  D  李黎、李进、昆明    F 2021-02-26 2021-03-07  完成情况2
2  C        王玺  NaN 2021-01-20 2022-03-08  完成情况3
  编号  责任人 关键任务       开始日期       结束日期   完成情况
0  A   孔令    C 2020-12-31 2021-01-08  完成情况1
1  A  刘媛媛    C 2020-12-31 2021-01-08  完成情况1
2  D   李黎    F 2021-02-26 2021-03-07  完成情况2
3  D   李进    F 2021-02-26 2021-03-07  完成情况2
4  D   昆明    F 2021-02-26 2021-03-07  完成情况2
5  C   王玺  NaN 2021-01-20 2022-03-08  完成情况3
Time used: 0.024484634399414062


In [16]:
list_1 = [36, 5, -12, 9, -21]

# The magnitudes that the ordering below is based on.
keys = [abs(value) for value in list_1]
# ``str`` has no ``keys`` attribute; ``abs`` is the key function that orders
# the numbers by magnitude.
list_1_sorted = sorted(list_1, key=abs)
list_1_sorted

[5, 9, -12, -21, 36]

In [9]:
import numpy as np
import pandas as pd

df = pd.DataFrame(
    [
        ['A', '孔令、刘 媛媛 ', 'C', pd.to_datetime('2020-12-31'), pd.to_datetime('2021-1-8'), '完成情况1'],
        ['D', '李 黎、李进 、昆 明', 'F', pd.to_datetime('2021-2-26'), pd.to_datetime('2021-3-7'), '完成情况2'],
        ['C', '王 玺', np.nan, pd.to_datetime('2021-1-20'), pd.to_datetime('2022-3-8'), '完成情况3'],
    ], columns=['编号', '责任人', '关键任务', '开始日期', '结束日期', '完成情况'])
# pandas has no ``toDF`` function; the DataFrame constructor is the supported
# way to obtain a DataFrame from existing tabular data.
df1 = pd.DataFrame(df)
print(df)
print(df1)