In [1]:
import sys
import os 

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

# Show every row and every column when a frame is displayed (no truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Set the default colormap through rcParams instead of plt.set_cmap():
# set_cmap() also looks up the current image via the current figure, which
# creates and displays an empty figure as a side effect of this cell.
plt.rcParams['image.cmap'] = 'tab10'

sns.set_palette(palette='tab10')

<Figure size 640x480 with 0 Axes>

In [2]:
# Train and test survey tables; both CSV files are decoded as GBK.
df_train = pd.read_csv('../input/happiness/happiness_train_complete.csv', encoding='gbk')
df_test = pd.read_csv('../input/happiness/happiness_test_complete.csv', encoding='gbk')

# Variable index workbook (get_feature_info is later given this same path and reads it again).
df_index = pd.read_excel('../input/happiness/happiness_index.xlsx')

In [3]:
def get_feature_info(path):
    df = pd.read_excel(path)
    def feature_info(col: str|list):
        if isinstance(col, str):
            return df.loc[df['变量名']==col, :]
        return df.loc[df['变量名'].isin(col), :]
    return feature_info

In [4]:
# Lookup closure: feature_info(name) or feature_info([names]) returns the matching rows of the index workbook.
feature_info = get_feature_info('../input/happiness/happiness_index.xlsx')

In [5]:
# Per-column non-null counts and dtypes for the training frame (140 columns, fewer than max_cols=500).
df_train.info(max_cols=500)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 140 columns):
 #    Column                Non-Null Count  Dtype  
---   ------                --------------  -----  
 0    id                    8000 non-null   int64  
 1    happiness             8000 non-null   int64  
 2    survey_type           8000 non-null   int64  
 3    province              8000 non-null   int64  
 4    city                  8000 non-null   int64  
 5    county                8000 non-null   int64  
 6    survey_time           8000 non-null   object 
 7    gender                8000 non-null   int64  
 8    birth                 8000 non-null   int64  
 9    nationality           8000 non-null   int64  
 10   religion              8000 non-null   int64  
 11   religion_freq         8000 non-null   int64  
 12   edu                   8000 non-null   int64  
 13   edu_other             3 non-null      object 
 14   edu_status            6880 non-null   float64
 15   ed

In [6]:
# Per-column non-null counts and dtypes for the test frame (139 columns; it has no `happiness` column).
df_test.info(max_cols=500)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2968 entries, 0 to 2967
Data columns (total 139 columns):
 #    Column                Non-Null Count  Dtype  
---   ------                --------------  -----  
 0    id                    2968 non-null   int64  
 1    survey_type           2968 non-null   int64  
 2    province              2968 non-null   int64  
 3    city                  2968 non-null   int64  
 4    county                2968 non-null   int64  
 5    survey_time           2968 non-null   object 
 6    gender                2968 non-null   int64  
 7    birth                 2968 non-null   int64  
 8    nationality           2968 non-null   int64  
 9    religion              2968 non-null   int64  
 10   religion_freq         2968 non-null   int64  
 11   edu                   2968 non-null   int64  
 12   edu_other             3 non-null      object 
 13   edu_status            2519 non-null   float64
 14   edu_yr                2184 non-null   float64
 15   in

In [7]:
# Label distribution; besides the values 1-5 there are 12 rows coded -8 (the value drop_cannot_anwser filters out).
df_train.happiness.value_counts()

 4    4818
 5    1410
 3    1159
 2     497
 1     104
-8      12
Name: happiness, dtype: int64

In [8]:
def need_drop_cols(data:pd.DataFrame, low_thresh:float, is_train: bool = True)->list:
    """List the columns to remove from ``data`` before modelling.

    Args:
        data: Frame whose columns are inspected.
        low_thresh: Missing-value ratio (0-1). Columns whose share of NaN
            values is strictly greater than this are listed for removal.
        is_train: When True the list starts with ``'id'`` and ``'happiness'``;
            otherwise it starts with ``'id'`` only.

    Returns:
        The fixed column names followed by every column whose missing ratio
        exceeds ``low_thresh``, in the column order of ``data``.
    """
    drop_cols = ['id', 'happiness'] if is_train else ['id']
    missing_ratio = data.isna().mean()
    # Honour the caller's threshold; it was previously ignored in favour of a hard-coded 0.5.
    sparse_cols = missing_ratio.loc[missing_ratio > low_thresh].index.to_list()
    return drop_cols + sparse_cols


In [9]:
# Derive the drop list from the training frame only and reuse it for the test
# frame, so both feature matrices keep the same columns. Computing the
# missing-value ratio separately per frame can drop a column from one side only.
train_drop_cols = need_drop_cols(df_train, 0.5)
x_train = df_train.drop(train_drop_cols, axis=1)
# The test frame has no `happiness` column, so only drop the names it actually has.
x_test = df_test.drop([col for col in train_drop_cols if col in df_test.columns], axis=1)

In [10]:
def get_survey_year(data:pd.DataFrame):
    """Return a copy of ``data`` with an added integer ``survey_year`` column.

    The year is taken from the first four characters of each ``survey_time``
    value and converted with ``int``. ``data`` itself is not modified.
    """
    def leading_year(stamp):
        # The first four characters of the timestamp text are parsed as the year.
        return int(stamp[:4])

    years = data['survey_time'].apply(leading_year)
    return data.assign(survey_year=years)

In [None]:
def drop_cannot_anwser(data:pd.DataFrame)->pd.DataFrame:
    """Return the rows of ``data`` whose ``happiness`` value is not -8.

    NOTE(review): -8 is presumably the survey's "cannot answer" code, going by
    the function name - confirm against the variable index.
    """
    answered = data['happiness'].ne(-8)
    return data.loc[answered, :]

In [14]:
def get_age(data:pd.DataFrame)->pd.DataFrame:
    """Return a copy of ``data`` with an added ``age`` column.

    ``age`` is ``survey_year - birth``, except where ``birth`` is one of the
    codes -1, -3 or -8; those rows keep the code itself as their ``age``.

    The computation is column-wise rather than a row-wise ``apply(axis=1)``.
    Row-wise apply builds one Series per row, which is slow and upcasts
    integer values to float when the frame also holds float columns, so the
    resulting ages could silently become floats.

    Args:
        data: Frame with ``birth`` and ``survey_year`` columns.

    Returns:
        A new frame with the extra ``age`` column; ``data`` is not modified.
    """
    birth = data['birth']
    is_code = birth.isin((-1, -3, -8))
    # Keep the computed difference for ordinary rows, pass the code through otherwise.
    age = (data['survey_year'] - birth).where(~is_code, birth)
    return data.assign(age=age)

In [None]:
# get_age reads `survey_year`, which x_train does not carry yet, so derive it
# first. Preview only the head: display.max_rows is None, so a bare frame would
# render every row.
get_age(get_survey_year(x_train)).head()