In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import category_encoders as ce
from sklearn.model_selection import KFold

import os
from glob import glob

In [2]:
# Locations of the competition files and of anything this notebook writes.
input_dir = "../input/"
output_dir = "../output/"

# LightGBM settings shared by every model trained in this notebook.
lgm_params = {
    # Task definition.
    "objective": "regression",
    "metric": "rmse",
    # Tree shape.
    "num_leaves": 42,
    "max_depth": 7,
    # Column / row subsampling.
    "feature_fraction": 0.8,
    "subsample_freq": 1,
    "bagging_fraction": 0.95,
    "min_data_in_leaf": 2,
    # Boosting schedule.
    "learning_rate": 0.1,
    "boosting": "gbdt",
    # Regularisation.
    "lambda_l1": 0.1,
    "lambda_l2": 10,
    # Logging and reproducibility.
    "verbosity": -1,
    "random_state": 71,
    # Large round budget; early stopping decides the actual number of trees.
    "num_boost_round": 50000,
    "early_stopping_rounds": 100,
}

In [3]:
# Main tables and the sample submission.
train_df = pd.read_csv(input_dir + "train.csv")
test_df = pd.read_csv(input_dir + "test.csv")
submission = pd.read_csv(input_dir + "atmacup10__sample_submission.csv")

# Auxiliary tables. Each file is read exactly once (production_place.csv
# used to be loaded twice, which only cost time).
color_df = pd.read_csv(input_dir + "color.csv")
# NOTE: `histrical_df` is misspelled, but the name is kept because other
# cells may refer to it.
histrical_df = pd.read_csv(input_dir + "historical_person.csv")
maker_df = pd.read_csv(input_dir + "maker.csv")
collection_df = pd.read_csv(input_dir + "object_collection.csv")
material_df = pd.read_csv(input_dir + "material.csv")
palette_df = pd.read_csv(input_dir + "palette.csv")
production_place_df = pd.read_csv(input_dir + "production_place.csv")
occupation_df = pd.read_csv(input_dir + "principal_maker_occupation.csv")
principal_maker_df = pd.read_csv(input_dir + "principal_maker.csv")
technique_df = pd.read_csv(input_dir + "technique.csv")

In [4]:
# Count how many rows each object_id has in principal_maker_df, to see
# whether an object can be linked to more than one principal maker.
principal_maker_df["object_id"].value_counts()

d6df55197a92cb4d0a46    2
348e5b70bdbc90e64dbe    2
bc703e76f140391ab814    2
85f17b7261636cae5f22    2
87f28a6ec9a93b27b32a    2
                       ..
c6ea4751202e3538a6ef    1
76fdb78473c08f38e87b    1
f7ae3061c0c3416a79d7    1
bcd99878f90214747035    1
2afcf2fd52b9008ae6b0    1
Name: object_id, Length: 24034, dtype: int64

In [7]:
def get_wide_df(input_df, prefix=None):
    """Spread the per-object ``name`` values of a long table across columns.

    ``input_df`` has one row per (object_id, name) pair. The result has one
    row per distinct ``object_id``, in order of first appearance, and one
    column per occurrence rank: ``{prefix}_0`` holds the first ``name`` seen
    for the object, ``{prefix}_1`` the second, and so on. Objects with fewer
    names than the widest object get NaN in the remaining columns.

    Parameters
    ----------
    input_df : pd.DataFrame
        Long-format table with the columns ``object_id`` and ``name``.
        It is not modified.
    prefix : str, optional
        Prefix of the generated columns. When omitted, the name of the first
        module-level variable bound to ``input_df`` is used, so
        ``get_wide_df(material_df)`` yields ``material_df_0``, ... .
        Pass it explicitly for frames that are not stored in a global
        variable (slices, copies, locals) or that are reachable under
        several global names.

    Returns
    -------
    pd.DataFrame
        ``object_id`` plus one ``{prefix}_{rank}`` column per rank; only the
        ``object_id`` column when ``input_df`` has no rows.

    Raises
    ------
    ValueError
        If ``prefix`` is omitted and no global variable refers to
        ``input_df``.
    """
    if prefix is None:
        # Look up the variable name of the DataFrame by object identity.
        # Snapshot the items so the scan cannot be disturbed by namespace
        # changes while it runs.
        prefix = next(
            (var for var, value in list(globals().items()) if value is input_df),
            None,
        )
        if prefix is None:
            raise ValueError(
                "input_df is not bound to a global variable; "
                "pass `prefix` explicitly"
            )

    # 0-based position of every row within its object_id group.
    occurrence = input_df.groupby('object_id').cumcount()

    # Widest group decides the number of columns; an empty table has none
    # (max() of an empty Series is NaN and cannot be fed to range()).
    group_sizes = input_df.groupby('object_id').size()
    n_columns = int(group_sizes.max()) if len(group_sizes) else 0

    output_df = pd.DataFrame({'object_id': input_df['object_id'].unique()})

    for rank in range(n_columns):
        # Rows holding the rank-th name of each object; at most one per
        # object_id, so the left merge never duplicates output rows.
        is_rank = (occurrence == rank).to_numpy()
        nth_names = input_df.loc[is_rank, ['object_id', 'name']].rename(
            columns={'name': f'{prefix}_{rank}'}
        )
        output_df = output_df.merge(nth_names, on='object_id', how='left')

    return output_df

# Try the reshaping on the material table and display the result.
get_wide_df(material_df)

Unnamed: 0,object_id,material_df_0,material_df_1,material_df_2,material_df_3,material_df_4,material_df_5,material_df_6,material_df_7
0,000405d9a5e3f49fc49d,photographic paper,cardboard,,,,,,
1,001020bd00b149970f78,oil paint (paint),panel,,,,,,
2,0011d6be41612ec9eae3,oil paint (paint),canvas,,,,,,
3,0012765f7a97ccc3e9e9,photographic paper,,,,,,,
4,00133be3ff222c9b74b0,paper,,,,,,,
...,...,...,...,...,...,...,...,...,...
23581,fff1d87d79953ddab2c6,oil paint (paint),panel,,,,,,
23582,fff4bbb55fd7702d294e,photographic paper,,,,,,,
23583,fffbe07b997bec00e203,photographic paper,cardboard,,,,,,
23584,fffd43b134ba7197d890,photographic paper,,,,,,,


In [10]:
# Wide version of the palette table.
# NOTE(review): the recorded .info() output for this frame shows the same
# row count and per-column non-null counts as the material frame — confirm
# that palette_df has a `name` column and re-run on a fresh kernel.
_palette_df = get_wide_df(palette_df)

In [15]:
# Check how many columns were produced and how sparse each one is.
_palette_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23586 entries, 0 to 23585
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   object_id     23586 non-null  object
 1   palette_df_0  23586 non-null  object
 2   palette_df_1  9887 non-null   object
 3   palette_df_2  1692 non-null   object
 4   palette_df_3  176 non-null    object
 5   palette_df_4  35 non-null     object
 6   palette_df_5  9 non-null      object
 7   palette_df_6  5 non-null      object
 8   palette_df_7  4 non-null      object
dtypes: object(9)
memory usage: 1.8+ MB


In [16]:
# Wide version of the material table: one column per material rank.
_material_df = get_wide_df(material_df)

In [17]:
# Check how many columns were produced and how sparse each one is.
_material_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23586 entries, 0 to 23585
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   object_id      23586 non-null  object
 1   material_df_0  23586 non-null  object
 2   material_df_1  9887 non-null   object
 3   material_df_2  1692 non-null   object
 4   material_df_3  176 non-null    object
 5   material_df_4  35 non-null     object
 6   material_df_5  9 non-null      object
 7   material_df_6  5 non-null      object
 8   material_df_7  4 non-null      object
dtypes: object(9)
memory usage: 1.8+ MB


In [18]:
# Wide version of the technique table: one column per technique rank.
_technique_df = get_wide_df(technique_df)

In [19]:
# Check how many columns were produced and how sparse each one is.
_technique_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17329 entries, 0 to 17328
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   object_id       17329 non-null  object
 1   technique_df_0  17329 non-null  object
 2   technique_df_1  2914 non-null   object
 3   technique_df_2  237 non-null    object
 4   technique_df_3  1 non-null      object
dtypes: object(5)
memory usage: 812.3+ KB


In [22]:
# Display the widened technique table.
_technique_df

Unnamed: 0,object_id,technique_df_0,technique_df_1,technique_df_2,technique_df_3
0,000405d9a5e3f49fc49d,albumen print,,,
1,0012765f7a97ccc3e9e9,salted paper print,albumen print,,
2,00133be3ff222c9b74b0,etching,,,
3,0017be8caa87206532cb,albumen print,,,
4,001f4c71b4d53497b531,engraving,,,
...,...,...,...,...,...
17324,fff4bbb55fd7702d294e,albumen print,,,
17325,fffbe07b997bec00e203,albumen print,,,
17326,fffd1675758205748d7f,albumen print,,,
17327,fffd43b134ba7197d890,albumen print,,,
