In [1]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * ``object`` columns are converted to ``category``.
    * Signed-integer, unsigned-integer and float NumPy columns are cast to the
      narrowest dtype of the same kind whose range contains the column's
      min and max (bounds are inclusive).  A column is never widened.
    * Every other dtype (bool, datetime, timedelta, category, pandas
      extension dtypes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise.  It is modified in place.
    allow_float16 : bool, default True
        When True, float columns whose range fits are stored as float16.
        float16 keeps only about 3 significant decimal digits, so pass False
        when the values must stay precise; float32 is then the smallest
        float type used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Prints the memory usage before and after, and the percentage saved.
    """
    # Imported locally so the function also works when this cell is executed
    # before the notebook's import cell (fresh kernel, Run All).
    import numpy as np

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes, narrowest first.
    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int / uint / float columns are downcast.  Casting
        # bool or datetime columns to a float type would corrupt them, and
        # min()/max() is not defined for unordered categoricals.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits_of = float_types, np.finfo
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            limits_of = np.iinfo

        # For an empty or all-NaN column min()/max() are NaN, every comparison
        # below is False and the column keeps its dtype.
        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Stop as soon as the candidate is not narrower than the current
            # dtype: the column must never be widened.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # A frame with no data can report 0 bytes; avoid dividing by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and reduce its memory usage.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after being passed through ``reduce_mem_usage``.
    """
    # Imported locally so the function also works when this cell is executed
    # before the notebook's import cell (fresh kernel, Run All).
    import pandas as pd

    # parse_dates=True only asks pandas to try parsing the index as dates.
    # The former keep_date_col=True was dropped: it only matters when
    # parse_dates combines several columns, and it is deprecated in pandas 2.x.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [2]:
# Load the training CSV through import_data, which also downcasts the columns
# and prints the memory usage before and after.
df = import_data("train.csv")

Memory usage of dataframe is 34.56 MB
Memory usage after optimization is: 9.26 MB
Decreased by 73.2%


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Feature engineering: one row per visit, one column per department, holding
# that department's share of the visit's total absolute ScanCount.

# Sum ScanCount per (visit, department).  observed=True keeps only the pairs
# that actually occur: DepartmentDescription is categorical after
# reduce_mem_usage, and without it groupby emits every visit x department pair.
df_d_sum = (
    df.groupby(['VisitNumber', 'DepartmentDescription'], as_index=False, observed=True)['ScanCount']
    .agg('sum')
    .sort_values(['VisitNumber', 'ScanCount', 'DepartmentDescription'],
                 ascending=[True, False, True])
)

# Total absolute ScanCount of each visit, repeated on every row of that visit.
df_d_sum['abs_sum'] = (
    df_d_sum['ScanCount'].abs().groupby(df_d_sum['VisitNumber']).transform('sum')
)

# Create ratio.  Where abs_sum is 0 every ScanCount of the visit is 0 as well,
# so the ratio is defined as 0.  Masking the zero denominators replaces the
# earlier "set to 999, divide, set back to 0" trick, which also zeroed visits
# whose real abs_sum was 999.
nonzero_abs_sum = df_d_sum['abs_sum'].where(df_d_sum['abs_sum'] != 0)
df_d_sum['ratio'] = (df_d_sum['ScanCount'] / nonzero_abs_sum).fillna(0)

# Visits whose DepartmentDescription was always NaN were dropped by the
# groupby; add them back as a single 'Na' department with zero counts.
diff = set(df.VisitNumber) - set(df_d_sum.VisitNumber)
if diff:
    na_rows = pd.DataFrame({'VisitNumber': sorted(diff),
                            'DepartmentDescription': 'Na',
                            'ScanCount': 0,
                            'ratio': 0.0})
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # The department column is cast to object first so that the new 'Na'
    # label does not clash with the existing categories.
    df_d_sum = pd.concat(
        [df_d_sum.astype({'DepartmentDescription': object}), na_rows],
        sort=False,
    )

# Create pivot table: visits as rows, departments as columns, ratio as values.
df_pivot = df_d_sum.pivot(index='VisitNumber', columns='DepartmentDescription', values='ratio').fillna(0)
df_pivot = pd.merge(df_pivot, df[['VisitNumber', 'TripType']].drop_duplicates(), on='VisitNumber')

# Make training data.  Select the target by name instead of by position so the
# split does not depend on column order.
# NOTE(review): if the merge above leaves VisitNumber as a regular column it is
# part of X and the model can use the visit id as a feature -- confirm intent.
X = df_pivot.drop(columns='TripType')
y = df_pivot['TripType']

X_t, X_te, y_t, y_te = train_test_split(X, y, test_size=0.3, random_state=99)

# Training.  random_state makes the forest reproducible; with bootstrap=False
# the trees still differ through the random feature subsets tried per split.
mod2 = RandomForestClassifier(n_estimators=200, bootstrap=False, min_samples_leaf=1,
                              min_samples_split=3, criterion='gini',
                              random_state=99).fit(X_t, y_t)

print(classification_report(y_te, mod2.predict(X_te)))



              precision    recall  f1-score   support

           3       0.79      0.82      0.81      1079
           4       0.11      0.11      0.11        99
           5       0.73      0.77      0.75      1344
           6       0.67      0.68      0.68       386
           7       0.63      0.63      0.63      1774
           8       0.67      0.67      0.67      3638
           9       0.56      0.59      0.57      2784
          12       0.09      0.04      0.05        80
          15       0.40      0.33      0.36       295
          18       0.29      0.27      0.28       162
          19       0.27      0.18      0.22       116
          20       0.51      0.46      0.48       204
          21       0.57      0.50      0.53       211
          22       0.33      0.27      0.30       250
          23       0.37      0.39      0.38        36
          24       0.52      0.54      0.53       781
          25       0.56      0.63      0.59      1147
          26       0.40    

In [8]:
import pickle

filename = 'rf_model.sav'
# Serialize the fitted model.  The context manager closes the file handle and
# flushes it to disk even if pickling fails part-way through.
with open(filename, 'wb') as model_file:
    pickle.dump(mod2, model_file)