https://mp.weixin.qq.com/s?__biz=MjM5ODkzMzMwMQ==&mid=2650414738&idx=4&sn=291b3cd0a1abcbcbf5661fe9b5c4caed&chksm=becd98c889ba11deb392729961fbf844ed70bde1c203570f93b969e1f49c03f9a2672c795e76&scene=126&sessionid=1593691947&key=d28e5ef1d5d516753b90b3da896b87cc42d273354f534163bbbb745c2df4507e929838e9d27a6f41654879cd9ce6e89af580f2c02553edd65162ca0d33b184334300d9dec75d022ce6b4f3a890d96153&ascene=1&uin=MjA1MjAyODkxNg%3D%3D&devicetype=Windows+10+x64&version=6209051a&lang=zh_CN&exportkey=AY613ediuml%2BOiLzM4tOow8%3D&pass_ticket=TTbfflTude6XHy0P68%2Fx35vP%2Fnt7hPuBsvpukmQN97jygfZCqOotQywB77daKGb8

In [None]:
# 1.学会写抽象类
import os
from abc import ABCMeta,abstractmethod
class DataProcessor(metaclass=ABCMeta):
    """Abstract contract for the data-preparation stage.

    Concrete subclasses must implement ``read``, ``process`` and ``save``.
    """

    def __init__(self, input_directory, output_directory):
        # Both locations are stored exactly as given; nothing is created
        # or validated here.
        self.input_directory = input_directory
        self.output_directory = output_directory

    @abstractmethod
    def read(self):
        """Load the raw data."""

    @abstractmethod
    def process(self):
        """Build the raw dataframe containing every required feature.

        Statistical or text cleaning does not belong in this step.
        """

    @abstractmethod
    def save(self):
        """Persist the processed data."""
class Trainer(metaclass=ABCMeta):
    """Abstract contract for the model-training stage.

    Subclasses must implement every step declared below.
    """

    def __init__(self, directory):
        self.directory = directory
        # Models live in a 'models' sub-directory of the working directory.
        self.model_directory = os.path.join(self.directory, 'models')

    @abstractmethod
    def preprocess(self):
        """Take the preprocessed data and return clean data.

        This is the place for statistical or text cleaning.
        """

    @abstractmethod
    def set_model(self):
        """Define the model."""

    @abstractmethod
    def fit_model(self):
        """Take the vectorised data and return a trained model."""

    @abstractmethod
    def generate_metrics(self):
        """Produce metrics from the trained model and the test data."""

    @abstractmethod
    def save_model(self, model_name):
        """Save the model in the required format."""
class Predict(metaclass=ABCMeta):
    """Abstract contract shared by every model predictor."""

    def __init__(self, directory):
        self.directory = directory
        # Models live in a 'models' sub-directory of the working directory.
        self.model_directory = os.path.join(self.directory, 'models')

    @abstractmethod
    def load_model(self):
        """Load the model."""

    @abstractmethod
    def preprocess(self):
        """Take the raw data and return clean data ready for prediction."""

    @abstractmethod
    def predict(self):
        """Run the prediction."""
class BaseDB(metaclass=ABCMeta):
    """Base database class to be used for all DB connectors.

    Both methods are abstract: a connector must implement them before it
    can be instantiated, so a half-written connector fails at construction
    time instead of silently returning ``None`` from the base stubs.
    """

    @abstractmethod
    def get_connection(self):
        """Create a new DB connection."""

    @abstractmethod
    def close_connection(self):
        """Close the DB connection."""

In [None]:
# 2.在最前面设置你的随机数种子
def set_seed(args):
    """Seed the Python, NumPy and PyTorch random number generators.

    ``args`` must expose ``seed`` and ``n_gpu``. The CUDA generators are
    seeded only when ``args.n_gpu`` is greater than zero.
    """
    seed = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

In [None]:
# 3.从几行数据开始
# Load only the first 1000 rows so each iteration on the pipeline stays fast.
df_train = pd.read_csv("train.csv", nrows=1000)

In [None]:
# 4.预见失败（成熟开发人员的标志）
# Row count before cleaning, so the effect of dropping NaNs is visible.
print(len(df))
# Number of missing values per column.
print(df.isna().sum())
# dropna() returns a new frame rather than modifying df in place; rebind it
# so the rows containing NaNs are actually removed.
df = df.dropna()
# Row count after cleaning.
print(len(df))

In [None]:
# 5.显示处理进度
# 选项1-tqdm
from tqdm import tqdm
import time

# Register the progress_* methods (such as progress_apply) on pandas objects.
tqdm.pandas()

# Square every entry of 'col' while a progress bar is displayed.
df['col'] = df['col'].progress_apply(lambda value: value ** 2)

# Wrapping an iterable in tqdm() adds a progress bar to an ordinary loop.
text = ""
for char in tqdm(["a", "b", "c", "d"]):
    time.sleep(0.25)
    text += char
# 选项2-fastprogress
from fastprogress.fastprogress import master_bar, progress_bar
from time import sleep
# The master bar tracks the 10 outer passes; each pass drives a nested
# 100-step child bar attached to it through ``parent=mb``.
mb = master_bar(range(10))
for i in mb:
    for j in progress_bar(range(100), parent=mb):
        sleep(0.01)
        # Status text displayed next to the inner bar.
        mb.child.comment = 'second bar stat'
    # Status text displayed next to the outer bar.
    mb.first_bar.comment = 'first bar stat'
    mb.write(f'Finished loop {i}.')

In [None]:
# 6.Pandas很慢
import modin.pandas as pd

In [None]:
# 7.统计函数的时间
import time
def timing(f):
    """Decorator for timing functions.

    Usage:
    @timing
    def function(a):
        pass

    Each call to the decorated function prints its name and the elapsed
    wall-clock time in seconds, then returns (or re-raises) whatever the
    wrapped function produced.
    """
    # Imported locally so the decorator does not depend on a file-level import.
    from functools import wraps

    @wraps(f)  # keep the wrapped function's __name__ / __doc__
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        try:
            return f(*args, **kwargs)
        finally:
            # Report the duration even when the wrapped call raises.
            elapsed = time.perf_counter() - start
            name = getattr(f, '__name__', repr(f))
            print('function:%r took: %2.2f sec' % (name, elapsed))

    return wrapper

In [None]:
# 8.不要在云上烧钱
import os
def run_command(cmd):
    """Run ``cmd`` through ``os.system`` and return the value it reports."""
    exit_status = os.system(cmd)
    return exit_status
def shutdown(seconds=0, os='linux'):
    """Shutdown system after seconds given. Useful for shutting EC2 to save costs.

    Args:
        seconds: Delay before the shutdown, in seconds. Converted with
            ``int()``, so only a whole, non-negative number reaches the shell.
        os: Target platform, ``'linux'`` or ``'windows'``. Note that inside
            this function the name shadows the ``os`` module.

    Returns:
        Whatever ``run_command`` returns for the shutdown command.

    Raises:
        ValueError: If ``seconds`` is negative or ``os`` is not supported.
    """
    delay = int(seconds)
    if delay < 0:
        raise ValueError('seconds must be non-negative, got %r' % (seconds,))
    if os == 'linux':
        # shutdown(8) takes its delay as "+minutes" (or "now"), not seconds,
        # so round the delay up to whole minutes.
        minutes = -(-delay // 60)
        when = 'now' if minutes == 0 else '+%d' % minutes
        command = 'sudo shutdown -h %s' % when
    elif os == 'windows':
        command = 'shutdown -s -t %d' % delay
    else:
        # Failing loudly beats silently leaving a paid instance running.
        raise ValueError('unsupported os: %r' % (os,))
    return run_command(command)

In [None]:
# 9.创建和保存报告
import json
import os
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, fbeta_score


def get_metrics(y, y_pred, beta=2, average_method='macro', y_encoder=None):
    """Compute a dictionary of classification metrics.

    Args:
        y: True labels.
        y_pred: Predicted labels.
        beta: ``beta`` value forwarded to ``fbeta_score``.
        average_method: Averaging strategy forwarded to ``f1_score`` and
            ``fbeta_score``. The result keys keep their ``_macro`` suffix
            whatever value is passed.
        y_encoder: Optional encoder; when given, its ``inverse_transform``
            is applied to both ``y`` and ``y_pred`` before scoring.

    Returns:
        dict with ``accuracy``, ``f1_score_macro`` and ``fbeta_score_macro``
        (each rounded to 4 decimals), ``report`` (the classification report
        as a dict) and ``report_csv`` (the text report with ``\\n`` replaced
        by ``\\r\\n``).
    """
    if y_encoder:
        y = y_encoder.inverse_transform(y)
        y_pred = y_encoder.inverse_transform(y_pred)
    return {
        'accuracy': round(accuracy_score(y, y_pred), 4),
        'f1_score_macro': round(f1_score(y, y_pred, average=average_method), 4),
        # beta is keyword-only in current scikit-learn; passing it
        # positionally raises a TypeError there.
        'fbeta_score_macro': round(fbeta_score(y, y_pred, beta=beta, average=average_method), 4),
        'report': classification_report(y, y_pred, output_dict=True),
        'report_csv': classification_report(y, y_pred, output_dict=False).replace('\n', '\r\n')
    }


def save_metrics(metrics: dict, model_directory, file_name):
    """Write the text report and the JSON metrics for a model.

    Args:
        metrics: Metrics dictionary; must contain a ``'report_csv'`` entry.
            It is left unmodified, so the same dict can be saved again.
        model_directory: Directory that receives both files.
        file_name: Prefix of the output files; ``_report.txt`` and
            ``_metrics.json`` are appended to it.

    NOTE(review): ``classification_report_to_csv`` is not defined in the
    visible part of this file; it is assumed to be provided elsewhere.
    """
    report_path = os.path.join(model_directory, file_name + '_report.txt')
    classification_report_to_csv(metrics['report_csv'], report_path)
    # The text report goes to its own file; every other entry goes to JSON.
    json_metrics = {key: value for key, value in metrics.items() if key != 'report_csv'}
    metrics_path = os.path.join(model_directory, file_name + '_metrics.json')
    # The context manager guarantees the file is flushed and closed.
    with open(metrics_path, 'w') as handle:
        json.dump(json_metrics, handle, indent=4)

In [None]:
# 10.写好APIs
下面是在不太高的负载下(比如1000/min)部署经典的ML和DL的好方法。
fastapi + uvicorn
Fastest — 使用fastapi编写API，因为它很快。
Documentation — 用fastapi写API让我们不用操心文档。
Workers — 使用uvicorn部署API