In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
os.chdir("D:/Programming/HKU/MSDA7005")

###############################################################################
#1. Process the data of depression
#Read the file
cognition_raw_data = pd.read_stata("2018全国追踪调查/CHARLS2018r/Cognition.dta")
#We need the Depression Scale, which includes questions DC009 - DC018
id_columns = ['ID','householdID','communityID']
question_columns = ['dc009','dc010','dc011','dc012','dc013','dc014','dc015','dc016','dc017','dc018']
cog_columns = id_columns + question_columns
#Take an explicit copy so that the assignments below modify an independent frame
#instead of a slice of the raw data (no SettingWithCopyWarning, no ambiguous view/copy writes).
cog_selected = cognition_raw_data[cog_columns].copy()
#We first transfer all the answers into numeric codes. The code is the leading character
#of each answer; anything that does not start with a digit (including missing answers) becomes NaN.
#Replacing the whole column (instead of writing through .loc) gives a plain numeric dtype.
for column in question_columns:
    leading_char = cog_selected[column].astype(str).str[0]
    cog_selected[column] = pd.to_numeric(leading_char, errors='coerce')
#For the answer values of 1-4, a higher number indicates a deeper degree. Among the 10 questions,
#two (dc013 and dc016) are positive and the other 8 are negative, so each question is re-scored:
#  - positive question: 1-4 are assigned 3-0 respectively (decreasing level of depression);
#  - negative question: 1-4 are assigned 0-3 respectively (increasing level of depression).
#Codes 8 and 9 are treated as missing, and so is any other code that is not in the map.
positive_question = ['dc013','dc016']
negative_question = ['dc009','dc010','dc011','dc012','dc014','dc015','dc017','dc018']
positive_score_map = {1: 3, 2: 2, 3: 1, 4: 0, 8: np.nan, 9: np.nan}
negative_score_map = {1: 0, 2: 1, 3: 2, 4: 3, 8: np.nan, 9: np.nan}
for column in question_columns:
    score_map = positive_score_map if column in positive_question else negative_score_map
    #Series.map leaves NaN for every value that has no key in the map.
    cog_selected[column] = cog_selected[column].map(score_map)
#We then calculate the mean of depression score over the answered questions.
#The higher the score is, the more depression the interviewee has.
depression_df = cog_selected[id_columns].copy()
depression_df['Depression'] = cog_selected[question_columns].mean(axis=1, skipna=True)
#Interviewees without a single valid answer have no score and are dropped.
depression_df = depression_df.dropna(subset = ['Depression'])

#Visualization
#Make sure the output folder exists, otherwise savefig raises FileNotFoundError.
os.makedirs('Plots', exist_ok=True)
#Histogram
plt.figure(figsize=(6, 4))
sns.histplot(depression_df['Depression'], bins=10, kde=True)
plt.title('Histogram of Depression Distribution')
plt.xlabel('Depression')
plt.ylabel('Frequency')
plt.savefig('Plots/Depression_hist.png')
plt.show()
#Boxplot
plt.figure(figsize=(6, 4))
sns.boxplot(x = depression_df['Depression'])
plt.title('Box Plot of Depression')
plt.xlabel('Depression')
plt.savefig('Plots/Depression_box.png')
plt.show()
#Table
dep_table = depression_df['Depression'].describe()
print(dep_table)


###############################################################################
#2. Process the data of health
#Read the file
health_raw_data = pd.read_stata("2018全国追踪调查/CHARLS2018r/Health_Status_and_Functioning.dta")
#We need the health self-evaluation data, which is question DA002
health_columns = ['ID','householdID','communityID','da002']
#Take an explicit copy so the assignment below does not write into a slice of the raw data.
health_selected = health_raw_data[health_columns].copy()
#The numeric code is the leading character of each answer; anything that does not start
#with a digit (including missing answers) becomes NaN. Converting here, once, guarantees that
#both the histogram and the box plot receive a numeric column.
leading_char = health_selected['da002'].astype(str).str[0]
health_selected['da002'] = pd.to_numeric(leading_char, errors='coerce')
#Keep only the interviewees with a valid answer.
health_df = health_selected.dropna(subset = ['da002']).rename(columns={'da002': 'Health'})
#Visualization
#Make sure the output folder exists, otherwise savefig raises FileNotFoundError.
os.makedirs('Plots', exist_ok=True)
#Histogram
plt.figure(figsize=(6, 4))
sns.histplot(health_df['Health'], bins=10, kde=True)
plt.title('Histogram of Health Distribution')
plt.xlabel('Health Score')
plt.ylabel('Frequency')
plt.savefig('Plots/Health_hist.png')
plt.show()
#Boxplot
plt.figure(figsize=(6, 4))
sns.boxplot(x = health_df['Health'])
plt.title('Box Plot of Health')
plt.xlabel('Health Score')
plt.savefig('Plots/Health_box.png')
plt.show()
#Table
health_table = health_df['Health'].describe()
print(health_table)


###############################################################################
#3. Process the data of work
#Read the file
work_raw_data = pd.read_stata("2018全国追踪调查/CHARLS2018r/Work_Retirement.dta")
#We need to know whether the interviewee has work to do currently, which includes question FC008, FC001, FA002_W4.
#If the answer to any of the 3 questions is "1 Yes", then we can say that the interviewee has work to do now.
work_columns = ['ID','householdID','communityID','fc008','fc001','fa002_w4']
work_question_columns = ['fc008', 'fc001', 'fa002_w4']
#Take an explicit copy so that adding the 'Work' column does not write into a slice of the raw data.
work_selected = work_raw_data[work_columns].copy()
#Work = 1 when at least one of the three answers is exactly "1 Yes", otherwise 0 (missing answers count as no).
work_selected['Work'] = work_selected[work_question_columns].isin(["1 Yes"]).any(axis=1).astype('int64')
work_df = work_selected[['ID','householdID','communityID','Work']]
#Count each group once, in the fixed order 0 (no work), 1 (has work).
#value_counts() alone sorts by frequency, so positional labels would be swapped
#whenever the "has work" group is the larger one.
work_counts = work_df['Work'].value_counts().reindex([0, 1], fill_value=0)
#Visualization
#Make sure the output folder exists, otherwise savefig raises FileNotFoundError.
os.makedirs('Plots', exist_ok=True)
#Bar Plot
plt.figure(figsize=(6, 4))
sns.barplot(x = work_counts.index, y=work_counts.values)
plt.title('Bar Plot of Work Situation')
plt.xlabel('Work or Not')
plt.ylabel('Frequency')
plt.savefig('Plots/Work_bar.png')
plt.show()
#Pie Chart
plt.figure(figsize=(6, 6))
#The labels follow the order of work_counts: index 0 first, index 1 second.
plt.pie(work_counts, labels=["Doesn't have work", "Have work"], autopct='%1.1f%%', startangle=90)
plt.title('Pie Chart of Work Situation')
plt.savefig('Plots/Work_pie.png')
plt.show()
#Table
work_table = work_df['Work'].describe()
print(work_table)
