In [19]:
import sys
import os 

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

# Render DataFrames in full: no row or column truncation in cell output.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Make 'tab10' the default image colormap via rcParams. plt.set_cmap() would
# set the same rc value but also looks up the "current image", which creates
# an empty figure that the notebook then renders as a stray
# "<Figure ... with 0 Axes>" output.
plt.rcParams['image.cmap'] = 'tab10'

# Use the same 'tab10' colours for seaborn's default palette.
sns.set_palette(palette='tab10')

<Figure size 640x480 with 0 Axes>

In [20]:
# Load the train and test survey tables; both CSV files are read as GBK text.
df_train, df_test = (
    pd.read_csv(csv_path, encoding='gbk')
    for csv_path in (
        '../input/happiness/happiness_train_complete.csv',
        '../input/happiness/happiness_test_complete.csv',
    )
)

# Load the accompanying index workbook.
df_index = pd.read_excel('../input/happiness/happiness_index.xlsx')

In [21]:
def get_feature_info(path):
    df = pd.read_excel(path)
    def feature_info(col: str|list):
        if isinstance(col, str):
            return df.loc[df['变量名']==col, :]
        return df.loc[df['变量名'].isin(col), :]
    return feature_info

In [22]:
# Lookup helper over the index workbook: call feature_info(name) or
# feature_info([name, ...]) to display the rows for those names.
feature_info = get_feature_info('../input/happiness/happiness_index.xlsx')

In [23]:
# Per-column non-null counts and dtypes for the training set; max_cols is
# raised to 500 so the per-column listing is not collapsed into a summary.
df_train.info(max_cols=500)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 140 columns):
 #    Column                Non-Null Count  Dtype  
---   ------                --------------  -----  
 0    id                    8000 non-null   int64  
 1    happiness             8000 non-null   int64  
 2    survey_type           8000 non-null   int64  
 3    province              8000 non-null   int64  
 4    city                  8000 non-null   int64  
 5    county                8000 non-null   int64  
 6    survey_time           8000 non-null   object 
 7    gender                8000 non-null   int64  
 8    birth                 8000 non-null   int64  
 9    nationality           8000 non-null   int64  
 10   religion              8000 non-null   int64  
 11   religion_freq         8000 non-null   int64  
 12   edu                   8000 non-null   int64  
 13   edu_other             3 non-null      object 
 14   edu_status            6880 non-null   float64
 15   ed

In [24]:
# Same per-column overview for the test set (max_cols raised to 500 so the
# per-column listing is not collapsed into a summary).
df_test.info(max_cols=500)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2968 entries, 0 to 2967
Data columns (total 139 columns):
 #    Column                Non-Null Count  Dtype  
---   ------                --------------  -----  
 0    id                    2968 non-null   int64  
 1    survey_type           2968 non-null   int64  
 2    province              2968 non-null   int64  
 3    city                  2968 non-null   int64  
 4    county                2968 non-null   int64  
 5    survey_time           2968 non-null   object 
 6    gender                2968 non-null   int64  
 7    birth                 2968 non-null   int64  
 8    nationality           2968 non-null   int64  
 9    religion              2968 non-null   int64  
 10   religion_freq         2968 non-null   int64  
 11   edu                   2968 non-null   int64  
 12   edu_other             3 non-null      object 
 13   edu_status            2519 non-null   float64
 14   edu_yr                2184 non-null   float64
 15   in

In [25]:
# Frequency of each `happiness` value in the training set.
df_train['happiness'].value_counts()

 4    4818
 5    1410
 3    1159
 2     497
 1     104
-8      12
Name: happiness, dtype: int64

In [26]:
def need_drop_cols(data:pd.DataFrame, low_thresh:float, is_train: bool = True)->list:
    """List the columns that should be removed before modelling.

    Parameters
    ----------
    data
        Frame to inspect.
    low_thresh
        Missing-value cutoff. A column is listed when its fraction of NaN
        values is strictly greater than this number.
    is_train
        When True, 'happiness' is listed in addition to 'id'.

    Returns
    -------
    list
        'id' (plus 'happiness' when ``is_train``), followed by the sparse
        columns in their original column order. No name appears twice.
    """
    drop_cols = ['id', 'happiness'] if is_train else ['id']
    missing_ratio = data.isna().mean()
    # Honour the caller's threshold; it was previously ignored in favour of
    # a hard-coded 0.5.
    sparse_cols = missing_ratio.loc[missing_ratio > low_thresh].index
    drop_cols.extend(col for col in sparse_cols if col not in drop_cols)
    return drop_cols


In [27]:
# Feature frames: strip every column that need_drop_cols lists for the train
# and test sets respectively.
x_train = df_train.drop(columns=need_drop_cols(df_train, 0.5))
x_test = df_test.drop(columns=need_drop_cols(df_test, 0.5, is_train=False))

In [28]:
def get_survey_year(data:pd.DataFrame):
    """Return a copy of ``data`` with an integer ``survey_year`` column.

    The year is taken from the first four characters of each
    ``survey_time`` value. The input frame is not modified.
    """
    def _leading_year(stamp):
        # The first four characters of the value are parsed as the year.
        return int(stamp[:4])

    years = data['survey_time'].apply(_leading_year)
    return data.assign(survey_year=years)

In [29]:
def drop_cannot_anwser(data:pd.DataFrame)->pd.DataFrame:
    """Keep only the rows whose ``happiness`` value is not -8.

    -8 is presumably the survey's "cannot answer" code (going by this
    function's name). The original index labels are preserved.
    """
    answered = data['happiness'].ne(-8)
    return data.loc[answered, :]

In [30]:
def get_age(data:pd.DataFrame, unknown_codes=(-1, -3, -8))->pd.DataFrame:
    """Return a copy of ``data`` with an ``age`` column.

    ``age`` is ``survey_year - birth``. Rows whose ``birth`` is one of
    ``unknown_codes`` keep that code as their ``age`` instead of a
    meaningless difference.

    Parameters
    ----------
    data
        Frame containing ``birth`` and ``survey_year`` columns.
    unknown_codes
        ``birth`` values to pass through unchanged. The defaults
        (-1, -3, -8) are presumably the survey's codes for unknown or
        refused answers -- confirm against the index workbook.

    Returns
    -------
    pd.DataFrame
        New frame with the extra ``age`` column; ``data`` is not modified.

    Notes
    -----
    Computed column-wise rather than with a row-wise ``apply``, which is
    slow on a wide frame and fails on an empty one.
    """
    birth = data['birth']
    is_unknown = birth.isin(list(unknown_codes))
    # Keep the difference where birth is a real year, else fall back to the code.
    age = (data['survey_year'] - birth).where(~is_unknown, birth)
    return data.assign(age=age)

Unnamed: 0,survey_type,province,city,county,survey_time,gender,birth,nationality,religion,religion_freq,edu,edu_status,edu_yr,income,political,floor_area,property_0,property_1,property_2,property_3,property_4,property_5,property_6,property_7,property_8,height_cm,weight_jin,health,health_problem,depression,hukou,hukou_loc,media_1,media_2,media_3,media_4,media_5,media_6,leisure_1,leisure_2,leisure_3,leisure_4,leisure_5,leisure_6,leisure_7,leisure_8,leisure_9,leisure_10,leisure_11,leisure_12,socialize,relax,learn,social_neighbor,social_friend,socia_outing,equity,class,class_10_before,class_10_after,class_14,work_exper,insur_1,insur_2,insur_3,insur_4,family_income,family_m,family_status,house,car,invest_0,invest_1,invest_2,invest_3,invest_4,invest_5,invest_6,invest_7,invest_8,son,daughter,minor_child,marital,marital_1st,s_birth,marital_now,s_edu,s_political,s_hukou,s_income,s_work_exper,f_birth,f_edu,f_political,f_work_14,m_birth,m_edu,m_political,m_work_14,status_peer,status_3_before,view,inc_ability,inc_exp,trust_1,trust_2,trust_3,trust_4,trust_5,trust_6,trust_7,trust_8,trust_9,trust_10,trust_11,trust_12,trust_13,neighbor_familiarity,public_service_1,public_service_2,public_service_3,public_service_4,public_service_5,public_service_6,public_service_7,public_service_8,public_service_9,survey_year,age
0,1,12,32,59,2015/8/4 14:18,1,1959,1,1,1,11,4.0,-2.0,20000,1,45.0,0,1,0,0,0,0,0,0,0,176,155,3,2,5,5,2.0,4,2,5,5,4,3,1,4,3,1,2,3,4,1,4,5,4,1,2,4,3,3.0,3.0,2,3,3,3,3,1,1,1,1,1,2,60000.0,2,2,1,2,0,1,0,0,0,0,0,0,0,1,0,0.0,3,1984.0,1958.0,1984.0,6.0,1.0,5.0,40000.0,5.0,-2,4,4,1,-2,4,1,1,3,2,4,3,50000.0,4,2,-8,-8,5,3,2,3,4,3,-8,4,1,4,50,60,50,50,30.0,30,50,50,50,2015,56
1,2,18,52,85,2015/7/21 15:04,1,1992,1,1,1,12,4.0,2013.0,20000,1,110.0,0,0,0,0,1,0,0,0,0,170,110,5,4,3,1,1.0,2,2,1,3,5,1,2,3,4,3,5,4,3,2,3,4,5,1,2,4,3,6.0,2.0,1,3,6,4,8,5,1,1,1,1,1,40000.0,3,4,1,2,0,1,0,0,0,0,0,0,0,0,0,,1,,,,,,,,,1972,3,1,2,1973,3,1,2,1,1,4,2,50000.0,5,4,4,3,5,3,3,3,2,3,3,3,2,3,90,70,70,80,85.0,70,90,60,60,2015,23
2,2,29,83,126,2015/7/21 13:24,2,1967,1,0,3,4,4.0,-2.0,2000,1,120.0,0,1,1,0,0,0,0,0,0,160,122,4,4,5,1,1.0,2,2,2,5,1,3,1,4,4,3,5,4,4,2,3,5,5,5,3,4,2,2.0,5.0,2,4,5,4,6,3,2,1,1,2,2,8000.0,3,3,1,2,0,1,0,0,0,0,0,0,0,0,2,1.0,3,1990.0,1968.0,1990.0,3.0,1.0,1.0,6000.0,3.0,-2,1,1,2,-2,1,1,2,2,1,4,2,80000.0,3,3,3,3,4,3,3,3,3,3,-8,3,1,4,90,80,75,79,80.0,90,90,90,75,2015,48
3,2,10,28,51,2015/7/25 17:33,2,1943,1,1,1,3,4.0,1959.0,6420,1,78.0,0,0,0,1,0,0,0,0,0,163,170,4,4,4,1,2.0,2,1,1,5,1,1,1,5,2,4,5,4,5,1,1,5,5,5,2,4,4,1.0,6.0,1,4,5,5,7,2,4,2,2,2,2,12000.0,3,3,1,1,0,1,0,0,0,0,0,0,0,1,4,0.0,7,1960.0,,,,,,,,-2,14,1,2,-2,1,1,2,2,1,3,2,10000.0,3,3,4,3,5,3,3,5,4,3,3,3,2,3,100,90,70,80,80.0,90,90,80,80,2015,72
4,1,7,18,36,2015/8/10 9:50,2,1994,1,1,1,12,1.0,2014.0,-1,2,70.0,0,0,0,0,1,0,0,0,0,165,110,5,5,3,2,3.0,1,3,4,2,5,5,3,3,3,2,4,4,3,5,2,5,5,1,4,3,4,7.0,5.0,3,2,1,1,1,4,6,1,2,2,2,-2.0,4,3,1,1,0,1,0,0,0,0,0,0,0,0,0,,1,,,,,,,,,1970,6,1,10,1972,4,1,15,3,2,3,-8,200000.0,4,3,3,3,5,5,3,4,3,3,3,3,2,2,50,50,50,50,50.0,50,50,50,50,2015,21


Unnamed: 0,survey_type,province,city,county,survey_time,gender,birth,nationality,religion,religion_freq,edu,edu_status,edu_yr,income,political,floor_area,property_0,property_1,property_2,property_3,property_4,property_5,property_6,property_7,property_8,height_cm,weight_jin,health,health_problem,depression,hukou,hukou_loc,media_1,media_2,media_3,media_4,media_5,media_6,leisure_1,leisure_2,leisure_3,leisure_4,leisure_5,leisure_6,leisure_7,leisure_8,leisure_9,leisure_10,leisure_11,leisure_12,socialize,relax,learn,social_neighbor,social_friend,socia_outing,equity,class,class_10_before,class_10_after,class_14,work_exper,insur_1,insur_2,insur_3,insur_4,family_income,family_m,family_status,house,car,invest_0,invest_1,invest_2,invest_3,invest_4,invest_5,invest_6,invest_7,invest_8,son,daughter,minor_child,marital,marital_1st,s_birth,marital_now,s_edu,s_political,s_hukou,s_income,s_work_exper,f_birth,f_edu,f_political,f_work_14,m_birth,m_edu,m_political,m_work_14,status_peer,status_3_before,view,inc_ability,inc_exp,trust_1,trust_2,trust_3,trust_4,trust_5,trust_6,trust_7,trust_8,trust_9,trust_10,trust_11,trust_12,trust_13,neighbor_familiarity,public_service_1,public_service_2,public_service_3,public_service_4,public_service_5,public_service_6,public_service_7,public_service_8,public_service_9,survey_year,age
0,1,12,32,59,2015/8/4 14:18,1,1959,1,1,1,11,4.0,-2.0,20000,1,45.0,0,1,0,0,0,0,0,0,0,176,155,3,2,5,5,2.0,4,2,5,5,4,3,1,4,3,1,2,3,4,1,4,5,4,1,2,4,3,3.0,3.0,2,3,3,3,3,1,1,1,1,1,2,60000.0,2,2,1,2,0,1,0,0,0,0,0,0,0,1,0,0.0,3,1984.0,1958.0,1984.0,6.0,1.0,5.0,40000.0,5.0,-2,4,4,1,-2,4,1,1,3,2,4,3,50000.0,4,2,-8,-8,5,3,2,3,4,3,-8,4,1,4,50,60,50,50,30.0,30,50,50,50,2015,56
1,2,18,52,85,2015/7/21 15:04,1,1992,1,1,1,12,4.0,2013.0,20000,1,110.0,0,0,0,0,1,0,0,0,0,170,110,5,4,3,1,1.0,2,2,1,3,5,1,2,3,4,3,5,4,3,2,3,4,5,1,2,4,3,6.0,2.0,1,3,6,4,8,5,1,1,1,1,1,40000.0,3,4,1,2,0,1,0,0,0,0,0,0,0,0,0,,1,,,,,,,,,1972,3,1,2,1973,3,1,2,1,1,4,2,50000.0,5,4,4,3,5,3,3,3,2,3,3,3,2,3,90,70,70,80,85.0,70,90,60,60,2015,23
2,2,29,83,126,2015/7/21 13:24,2,1967,1,0,3,4,4.0,-2.0,2000,1,120.0,0,1,1,0,0,0,0,0,0,160,122,4,4,5,1,1.0,2,2,2,5,1,3,1,4,4,3,5,4,4,2,3,5,5,5,3,4,2,2.0,5.0,2,4,5,4,6,3,2,1,1,2,2,8000.0,3,3,1,2,0,1,0,0,0,0,0,0,0,0,2,1.0,3,1990.0,1968.0,1990.0,3.0,1.0,1.0,6000.0,3.0,-2,1,1,2,-2,1,1,2,2,1,4,2,80000.0,3,3,3,3,4,3,3,3,3,3,-8,3,1,4,90,80,75,79,80.0,90,90,90,75,2015,48
3,2,10,28,51,2015/7/25 17:33,2,1943,1,1,1,3,4.0,1959.0,6420,1,78.0,0,0,0,1,0,0,0,0,0,163,170,4,4,4,1,2.0,2,1,1,5,1,1,1,5,2,4,5,4,5,1,1,5,5,5,2,4,4,1.0,6.0,1,4,5,5,7,2,4,2,2,2,2,12000.0,3,3,1,1,0,1,0,0,0,0,0,0,0,1,4,0.0,7,1960.0,,,,,,,,-2,14,1,2,-2,1,1,2,2,1,3,2,10000.0,3,3,4,3,5,3,3,5,4,3,3,3,2,3,100,90,70,80,80.0,90,90,80,80,2015,72
4,1,7,18,36,2015/8/10 9:50,2,1994,1,1,1,12,1.0,2014.0,-1,2,70.0,0,0,0,0,1,0,0,0,0,165,110,5,5,3,2,3.0,1,3,4,2,5,5,3,3,3,2,4,4,3,5,2,5,5,1,4,3,4,7.0,5.0,3,2,1,1,1,4,6,1,2,2,2,-2.0,4,3,1,1,0,1,0,0,0,0,0,0,0,0,0,,1,,,,,,,,,1970,6,1,10,1972,4,1,15,3,2,3,-8,200000.0,4,3,3,3,5,5,3,4,3,3,3,3,2,2,50,50,50,50,50.0,50,50,50,50,2015,21
