In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from plotly.offline import plot, iplot, init_notebook_mode
# Initialise plotly's offline notebook mode. Per the plotly documentation,
# connected=True loads plotly.js from an online CDN instead of embedding it.
init_notebook_mode(connected=True)

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from warnings import filterwarnings
from plotly.subplots import make_subplots
# Suppress every Python warning for the rest of the session.
filterwarnings('ignore')

# Hex colour palettes shared by the charts in the cells below.
colors = ['#B1EDED','#B1B2ED','#1DE7ED','#1DA5ED','#1D50ED','#16548E']
gen_colors = ['#4169E1','#B2182B','#81007F','#D1B2FF','#EFE4E2']
JP_colors = ['#D90B0B','#F24444','#EFE4E2','#FCCE88','#64807F']
CN_colors = ['#E0201B','#FFCE3F','#A63F03','#04BF33','#F2E6D8']


# Load the Kaggle survey response tables (2021 and 2019) from the input directory.
df21 = pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv')
df19 = pd.read_csv('../input/kaggle-survey-2019/multiple_choice_responses.csv')

# Preview the first rows of the 2019 table.
df19.head()

In [None]:
# 2019 gender labels and the labels they are rewritten to.
# NOTE(review): presumably this aligns 2019 with the 2021 survey wording -- confirm.
GENDER_RENAMES = {'Male': 'Man', 'Female': 'Woman'}


def normalize_gender(values):
    """Return `values` as a list with 'Male' -> 'Man' and 'Female' -> 'Woman'.

    Every other entry (including missing values) is passed through unchanged.
    """
    return [GENDER_RENAMES.get(value, value) for value in values]


# Q2 answers of the 2019 respondents whose Q3 is Japan / China.
JP_ndarray = df19[df19['Q3'] == 'Japan']['Q2'].values
CN_ndarray = df19[df19['Q3'] == 'China']['Q2'].values

# NOTE: despite the "age" in their names, these hold the Q2 (gender) answers.
# The names are kept because a later cell reads JP_age_series / CN_age_series.
JP_age_list = normalize_gender(JP_ndarray)
CN_age_list = normalize_gender(CN_ndarray)

JP_age_series = pd.Series(JP_age_list)
CN_age_series = pd.Series(CN_age_list)

In [None]:
def group(data, country, question_num):
    """Count the answers to `question_num` among rows whose 'Q3' equals `country`.

    Args:
        data: survey DataFrame containing a 'Q3' column and `question_num`.
        country: value of 'Q3' to keep.
        question_num: name of the column whose answers are counted.

    Returns:
        The `value_counts()` of the selected column for the matching rows.
    """
    from_country = data['Q3'] == country
    answers = data.loc[from_country, question_num]
    return answers.value_counts()

def go_Bar(name_input, xaxis, yaxis, color, OSgroup=None):
    """Build a single-colour plotly bar trace.

    Args:
        name_input: trace name (shown in the legend).
        xaxis: x values of the bars.
        yaxis: y values (bar heights).
        color: marker colour applied to every bar of the trace.
        OSgroup: value passed to plotly's `offsetgroup`. Optional, so that
            callers which do not need an offset group can omit it; None
            leaves the attribute unset.

    Returns:
        A `go.Bar` trace.
    """
    return go.Bar(
        name=name_input,
        x=xaxis,
        y=yaxis,
        marker_color=color,
        offsetgroup=OSgroup,
    )

def go_Pie(country, label_value):
    """Build a donut-style plotly pie trace from a counts Series.

    Args:
        country: text used as the pie's title.
        label_value: Series whose index supplies the slice labels and whose
            values supply the slice sizes.

    Returns:
        A `go.Pie` trace showing label and percentage on each slice.
    """
    pie_options = dict(
        title=country,
        labels=label_value.index,
        values=label_value.values,
        textinfo='label+percent',
        rotation=315,
        hole=.3,
    )
    return go.Pie(**pie_options)


In [None]:
# Q1 answer counts per country and survey year, ordered by answer label.
JP_age_19, CN_age_19 = (group(df19, nation, 'Q1').sort_index() for nation in ('Japan', 'China'))
JP_age_21, CN_age_21 = (group(df21, nation, 'Q1').sort_index() for nation in ('Japan', 'China'))

# Layout options shared by both yearly figures.
age_layout = dict(barmode='group', xaxis_title='Age', yaxis_title='Counts')

# 2019: Japan and China side by side.
traces_19 = [
    go_Bar('Japan', JP_age_19.index, JP_age_19.values, JP_colors[0], 0),
    go_Bar('China', CN_age_19.index, CN_age_19.values, CN_colors[1], 1),
]
fig_age_19 = go.Figure(data=traces_19)

# 2021: same construction with the 2021 counts.
traces_21 = [
    go_Bar('Japan', JP_age_21.index, JP_age_21.values, JP_colors[0], 0),
    go_Bar('China', CN_age_21.index, CN_age_21.values, CN_colors[1], 1),
]
fig_age_21 = go.Figure(data=traces_21)

fig_age_19.update_layout(title_text='2019 China and Japan age distribution', **age_layout)
fig_age_21.update_layout(title_text='2021 China and Japan age distribution', **age_layout)

fig_age_19.show()
fig_age_21.show()

In [None]:
# 2x2 grid of pie ('domain') subplots: top row 2019, bottom row 2021.
fig = make_subplots(rows=2, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}],
                                           [{'type':'domain'}, {'type':'domain'}]])

# (title, answer counts, row, col) for each pie.
# JP_age_series / CN_age_series hold the 2019 Q2 answers prepared in an
# earlier cell; the 2021 counts come straight from the Q2 column.
gender_pies = [
    ('2019_Japan', JP_age_series.value_counts(), 1, 1),
    ('2019_China', CN_age_series.value_counts(), 1, 2),
    ('2021_Japan', group(df21, 'Japan', 'Q2'), 2, 1),
    ('2021_China', group(df21, 'China', 'Q2'), 2, 2),
]
for pie_title, answer_counts, row, col in gender_pies:
    fig.add_trace(go_Pie(pie_title, answer_counts), row, col)

# Use the same slice palette for all four pies.
fig.update_traces(marker=dict(colors=gen_colors))
fig.update_layout(title_text='Gender Distribution',
                  paper_bgcolor='ivory',
                  showlegend=True,
                  autosize=True,
                  height=700)
fig.show()

In [None]:
years = ['2019', '2021']

# Number of respondents whose Q3 is Japan / China in each survey year.
JP_country_count_19 = df19.loc[df19['Q3'] == 'Japan', 'Q3'].count()
CN_country_count_19 = df19.loc[df19['Q3'] == 'China', 'Q3'].count()
JP_country_count_21 = df21.loc[df21['Q3'] == 'Japan', 'Q3'].count()
CN_country_count_21 = df21.loc[df21['Q3'] == 'China', 'Q3'].count()

# One entry per element of `years`, in the same order.
JP_country_count_19_21 = [JP_country_count_19, JP_country_count_21]
CN_country_count_19_21 = [CN_country_count_19, CN_country_count_21]

country_traces = [
    go_Bar('Japan', years, JP_country_count_19_21, JP_colors[0], 0),
    go_Bar('China', years, CN_country_count_19_21, CN_colors[1], 1),
]
fig_country = go.Figure(data=country_traces)

country_layout = dict(
    barmode='group',
    title_text='2019 & 2021, the number of Kaggler living in Japan and China',
    xaxis_title='years',
    yaxis_title='Counts',
)
fig_country.update_layout(**country_layout)
fig_country.show()

In [None]:
tool_list = ['Matplotlib', 'Seaborn', 'Plotly / Plotly Express', 'Ggplot / ggplot2', 'Shiny', 'D3 js', 'Altair', 'Bokeh', 'Geoplotlib', 'Leaflet / Folium', 'None', 'Other']

# Respondents whose Q3 is Japan or China, per survey year.
# ('Japan' was misspelled 'Japna' for 2019, which silently left only China.)
df19_JPCN = df19[df19.Q3.isin(['Japan','China'])]
df21_JPCN = df21[df21.Q3.isin(['Japan','China'])]

# Answer columns are picked by position.
# NOTE(review): 97:109 / 59:71 are assumed to be the twelve visualization-tool
# columns of the 2019 (Q20) and 2021 (Q14) surveys -- confirm against the CSVs.
# No `[1:]` slice here: after the country filter every row is a respondent,
# so skipping the first row would drop real data.
top_answers_19 = [df19_JPCN[col].value_counts() for col in df19_JPCN.columns[97:109]]
top_answers_21 = [df21_JPCN[col].value_counts() for col in df21_JPCN.columns[59:71]]

# One row per answer column: its most frequent answer and that answer's count.
df19_JPCN_Q14 = pd.DataFrame({
    'Q20': [vc.index[0] for vc in top_answers_19],
    'counts': [vc.values[0] for vc in top_answers_19],
})
df21_JPCN_Q14 = pd.DataFrame({
    'Q14': [vc.index[0] for vc in top_answers_21],
    'counts': [vc.values[0] for vc in top_answers_21],
})

df19_JPCN_Q14['Q20'].sort_values(ascending=True)

In [None]:
# Display the 2021 table built in an earlier cell (columns 'Q14' and 'counts').
df21_JPCN_Q14

In [None]:
# Display the 2019 table built in an earlier cell (columns 'Q20' and 'counts').
df19_JPCN_Q14

In [None]:
# Compare the hand-written tool list with the sorted answer labels of both years.
sorted_labels_19 = df19_JPCN_Q14['Q20'].sort_values(ascending=True).values
sorted_labels_21 = df21_JPCN_Q14['Q14'].sort_values(ascending=True).values

print(tool_list)
print(sorted_labels_19)
print(sorted_labels_21)
print(type(sorted_labels_21))

In [None]:
series1 = df19_JPCN_Q14['Q20']
# Preview only: replace() returns a new Series and leaves series1 untouched.
# It matches whole values equal to 'D3.js', not substrings.
series1.replace({'D3.js': 'D3 js'})

In [None]:
tool_list = ['Matplotlib', 'Seaborn', 'Plotly / Plotly Express', 'Ggplot / ggplot2', 'Shiny', 'D3 js', 'Altair', 'Bokeh', 'Geoplotlib', 'Leaflet / Folium', 'None', 'Other']


def _tool_usage(frame, columns, label):
    """Summarise answer columns of `frame`, one output row per column.

    For every column in `columns`, the most frequent answer and the number of
    times it occurs are collected.

    Args:
        frame: survey rows to summarise.
        columns: names of the answer columns.
        label: name of the output column that receives the answers.

    Returns:
        DataFrame with the columns `label` and 'counts'.

    Raises:
        IndexError: if one of the columns holds no answer at all in `frame`.
    """
    top_answers = [frame[col].value_counts() for col in columns]
    return pd.DataFrame({
        label: [vc.index[0] for vc in top_answers],
        'counts': [vc.values[0] for vc in top_answers],
    })


df19_JP = df19[df19.Q3.isin(['Japan'])]
df19_CN = df19[df19.Q3.isin(['China'])]
df21_JP = df21[df21.Q3.isin(['Japan'])]
df21_CN = df21[df21.Q3.isin(['China'])]

# Answer columns are picked by position.
# NOTE(review): 97:109 / 59:71 are assumed to be the twelve visualization-tool
# columns of the 2019 (Q20) and 2021 (Q14) surveys -- confirm against the CSVs.
# The country-filtered frames are summarised in full: every remaining row is
# a respondent, so no leading row is skipped.
tool_columns_19 = df19.columns[97:109]
tool_columns_21 = df21.columns[59:71]

df19_JP_Q14 = _tool_usage(df19_JP, tool_columns_19, 'Q20')
df19_CN_Q14 = _tool_usage(df19_CN, tool_columns_19, 'Q20')
df21_JP_Q14 = _tool_usage(df21_JP, tool_columns_21, 'Q14')
df21_CN_Q14 = _tool_usage(df21_CN, tool_columns_21, 'Q14')

# Re-label the 2019 rows with their position in tool_list and sort, so that
# both years list the tools in the same order.
tool_order_19 = [3,0,6,4,5,2,7,1,8,9,10,11]
df19_JP_Q14.index = tool_order_19
df19_CN_Q14.index = tool_order_19
df19_JP_Q14 = df19_JP_Q14.sort_index()
df19_CN_Q14 = df19_CN_Q14.sort_index()

# Harmonise the 2019 spelling with tool_list. The substring regex is kept;
# the dot is escaped so that only a literal 'D3.js' matches.
df19_JP_Q14['Q20'] = df19_JP_Q14['Q20'].replace(regex=r'D3\.js', value='D3 js')
df19_CN_Q14['Q20'] = df19_CN_Q14['Q20'].replace(regex=r'D3\.js', value='D3 js')

# One figure per survey year, each stacking Japan and China.
fig_tool1 = go.Figure(data=[
    go_Bar('Japan', df19_JP_Q14['Q20'].values, df19_JP_Q14['counts'].values, JP_colors[0], 0),
    go_Bar('China', df19_CN_Q14['Q20'].values, df19_CN_Q14['counts'].values, CN_colors[1], 0),
])
fig_tool2 = go.Figure(data=[
    go_Bar('Japan', df21_JP_Q14['Q14'].values, df21_JP_Q14['counts'].values, JP_colors[1], 1),
    go_Bar('China', df21_CN_Q14['Q14'].values, df21_CN_Q14['counts'].values, CN_colors[2], 1),
])

fig_tool1.update_layout(
                    barmode = 'stack',
                    title_text = years[0] + ' visualization tools used in Japan and China',
                    xaxis_title = 'Visualization libraries or tool',
                    yaxis_title = 'Counts')

fig_tool2.update_layout(
                    barmode = 'stack',
                    title_text = years[1] + ' visualization tools used in Japan and China',
                    xaxis_title = 'Visualization libraries or tool',
                    yaxis_title = 'Counts')

fig_tool1.show()
fig_tool2.show()

In [None]:
tool_list = ['Matplotlib', 'Seaborn', 'Plotly / Plotly Express', 'Ggplot / ggplot2', 'Shiny', 'D3 js', 'Altair', 'Bokeh', 'Geoplotlib', 'Leaflet / Folium', 'None', 'Other']


def _tool_usage(frame, columns, label):
    """Summarise answer columns of `frame`, one output row per column.

    For every column in `columns`, the most frequent answer and the number of
    times it occurs are collected.

    Args:
        frame: survey rows to summarise.
        columns: names of the answer columns.
        label: name of the output column that receives the answers.

    Returns:
        DataFrame with the columns `label` and 'counts'.

    Raises:
        IndexError: if one of the columns holds no answer at all in `frame`.
    """
    top_answers = [frame[col].value_counts() for col in columns]
    return pd.DataFrame({
        label: [vc.index[0] for vc in top_answers],
        'counts': [vc.values[0] for vc in top_answers],
    })


df19_JP = df19[df19.Q3.isin(['Japan'])]
df19_CN = df19[df19.Q3.isin(['China'])]
df21_JP = df21[df21.Q3.isin(['Japan'])]
df21_CN = df21[df21.Q3.isin(['China'])]

# Answer columns are picked by position.
# NOTE(review): 97:109 / 59:71 are assumed to be the twelve visualization-tool
# columns of the 2019 (Q20) and 2021 (Q14) surveys -- confirm against the CSVs.
# The country-filtered frames are summarised in full: every remaining row is
# a respondent, so no leading row is skipped.
tool_columns_19 = df19.columns[97:109]
tool_columns_21 = df21.columns[59:71]

df19_JP_Q14 = _tool_usage(df19_JP, tool_columns_19, 'Q20')
df19_CN_Q14 = _tool_usage(df19_CN, tool_columns_19, 'Q20')
df21_JP_Q14 = _tool_usage(df21_JP, tool_columns_21, 'Q14')
df21_CN_Q14 = _tool_usage(df21_CN, tool_columns_21, 'Q14')

# Re-label the 2019 rows with their position in tool_list and sort, so that
# both years list the tools in the same order.
tool_order_19 = [3,0,6,4,5,2,7,1,8,9,10,11]
df19_JP_Q14.index = tool_order_19
df19_CN_Q14.index = tool_order_19
df19_JP_Q14 = df19_JP_Q14.sort_index()
df19_CN_Q14 = df19_CN_Q14.sort_index()

# Harmonise the 2019 spelling with tool_list. The substring regex is kept;
# the dot is escaped so that only a literal 'D3.js' matches.
df19_JP_Q14['Q20'] = df19_JP_Q14['Q20'].replace(regex=r'D3\.js', value='D3 js')
df19_CN_Q14['Q20'] = df19_CN_Q14['Q20'].replace(regex=r'D3\.js', value='D3 js')

# One figure per survey year, each stacking Japan and China.
fig_tool1 = go.Figure(data=[
    go_Bar('Japan', df19_JP_Q14['Q20'].values, df19_JP_Q14['counts'].values, JP_colors[0], 0),
    go_Bar('China', df19_CN_Q14['Q20'].values, df19_CN_Q14['counts'].values, CN_colors[1], 0),
])
fig_tool2 = go.Figure(data=[
    go_Bar('Japan', df21_JP_Q14['Q14'].values, df21_JP_Q14['counts'].values, JP_colors[1], 1),
    go_Bar('China', df21_CN_Q14['Q14'].values, df21_CN_Q14['counts'].values, CN_colors[2], 1),
])

fig_tool1.update_layout(
                    barmode = 'stack',
                    title_text = years[0] + ' visualization tools used in Japan and China',
                    xaxis_title = 'Visualization libraries or tool',
                    yaxis_title = 'Counts')

fig_tool2.update_layout(
                    barmode = 'stack',
                    title_text = years[1] + ' visualization tools used in Japan and China',
                    xaxis_title = 'Visualization libraries or tool',
                    yaxis_title = 'Counts')

fig_tool1.show()
fig_tool2.show()

In [None]:
tool_list = ['Matplotlib', 'Seaborn', 'Plotly / Plotly Express', 'Ggplot / ggplot2', 'Shiny', 'D3 js', 'Altair', 'Bokeh', 'Geoplotlib', 'Leaflet / Folium', 'None', 'Other']

# Respondents whose Q3 is Japan or China, per survey year.
# ('Japan' was misspelled 'Japna' for 2019, which silently left only China.)
df19_JPCN = df19[df19.Q3.isin(['Japan','China'])]
df21_JPCN = df21[df21.Q3.isin(['Japan','China'])]

# Answer columns are picked by position.
# NOTE(review): 97:109 / 59:71 are assumed to be the twelve visualization-tool
# columns of the 2019 (Q20) and 2021 (Q14) surveys -- confirm against the CSVs.
# No `[1:]` slice here: after the country filter every row is a respondent,
# so skipping the first row would drop real data.
top_answers_19 = [df19_JPCN[col].value_counts() for col in df19_JPCN.columns[97:109]]
top_answers_21 = [df21_JPCN[col].value_counts() for col in df21_JPCN.columns[59:71]]

# One row per answer column: its most frequent answer and that answer's count.
df19_JPCN_Q14 = pd.DataFrame({
    'Q20': [vc.index[0] for vc in top_answers_19],
    'counts': [vc.values[0] for vc in top_answers_19],
})
df21_JPCN_Q14 = pd.DataFrame({
    'Q14': [vc.index[0] for vc in top_answers_21],
    'counts': [vc.values[0] for vc in top_answers_21],
})

# Re-label the 2019 rows with their position in tool_list and sort, so that
# the 2019 table lists the tools in tool_list order.
df19_JPCN_Q14.index = [3,0,6,4,5,2,7,1,8,9,10,11]
df19_JPCN_Q14 = df19_JPCN_Q14.sort_index()

# A single trace carries all twelve bars, each tool paired with its own count.
fig_tool = go.Figure(data=[
    go_Bar(years[0], df19_JPCN_Q14['Q20'].values, df19_JPCN_Q14['counts'].values, CN_colors[1], 0)
])
fig_tool.show()

In [None]:
# Display the 2021 table (columns 'Q14' and 'counts').
df21_JPCN_Q14

In [None]:
years = ['2019', '2021']

# Number of respondents whose Q3 is Japan / China in each survey year.
JP_country_count_19 = (df19[df19['Q3'] == 'Japan']['Q3']).count()
CN_country_count_19 = (df19[df19['Q3'] == 'China']['Q3']).count()
JP_country_count_21 = (df21[df21['Q3'] == 'Japan']['Q3']).count()
CN_country_count_21 = (df21[df21['Q3'] == 'China']['Q3']).count()

# Plot both years, as the title announces. x and y are sequences with one
# entry per year, and each trace gets its own offset group so the two
# countries sit side by side.
fig_country = go.Figure(data=[
                        go_Bar('Japan', years, [JP_country_count_19, JP_country_count_21], JP_colors[0], 0),
                        go_Bar('China', years, [CN_country_count_19, CN_country_count_21], CN_colors[1], 1)
])

fig_country.update_layout(
                    barmode='group',
                    title_text='2019 & 2021, the number of Kaggler living in Japan and China',
                    xaxis_title='years',
                    yaxis_title='Counts')
fig_country.show()

In [None]:
# Number of 2019 respondents whose Q3 is 'Japan'.
df19.loc[df19['Q3'] == 'Japan', 'Q3'].count()

In [None]:
# `ndarray` is not defined in any visible cell and raised NameError.
# NOTE(review): JP_ndarray (the Japanese 2019 Q2 answers from an earlier cell)
# is assumed to be the intended array -- confirm.
print(len(JP_ndarray))

In [None]:
# Q2 answers of the 2019 respondents whose Q3 is 'Japan'.
df19.loc[df19['Q3'] == 'Japan', 'Q2']

In [None]:
# Check which type the country-filtered Q2 selection has.
japan_q2_21 = df21.loc[df21['Q3'] == 'Japan', 'Q2']
print(type(japan_q2_21))

In [None]:
# Q2 answer counts of the 2021 respondents whose Q3 is 'Japan'.
group(df21, 'Japan', 'Q2')

In [None]:
# Q1 answer counts of the 2019 respondents whose Q3 is 'Japan'.
group(df19, 'Japan', 'Q1')

In [None]:
# Same counts, ordered by answer label instead of by frequency.
group(df19, 'Japan', 'Q1').sort_index()

In [None]:
# Q1 answer counts per country and survey year, ordered by answer label.
JP_age_19 =  group(df19,'Japan','Q1').sort_index()
JP_age_21 =  group(df21,'Japan','Q1').sort_index()
CN_age_19 =  group(df19,'China','Q1').sort_index()
CN_age_21 =  group(df21,'China','Q1').sort_index()

# Bar traces need cartesian ('xy') subplots, which is the make_subplots
# default; 'domain' cells only accept traces such as pies.
fig = make_subplots(rows=1, cols=2, subplot_titles=('2019', '2021'))

# add_trace takes one trace per call; row / col select the target subplot.
fig.add_trace(go_Bar('Japan', JP_age_19.index, JP_age_19.values, JP_colors[0], 0), row=1, col=1)
fig.add_trace(go_Bar('China', CN_age_19.index, CN_age_19.values, CN_colors[1], 1), row=1, col=1)
fig.add_trace(go_Bar('Japan', JP_age_21.index, JP_age_21.values, JP_colors[0], 0), row=1, col=2)
fig.add_trace(go_Bar('China', CN_age_21.index, CN_age_21.values, CN_colors[1], 1), row=1, col=2)

# The right-hand traces repeat the left-hand names and colours, so they are
# hidden from the legend to avoid duplicate entries.
fig.update_traces(showlegend=False, row=1, col=2)

fig.update_layout(
                    barmode='group',
                    title_text='Age distribution of China & Japan')
# Axis titles are set on every subplot, not only on the first one.
fig.update_xaxes(title_text='Age')
fig.update_yaxes(title_text='Counts')
fig.show()

In [None]:
# Q1 answer counts per country and survey year, ordered by answer label.
JP_age_19 =  group(df19,'Japan','Q1').sort_index()
JP_age_21 =  group(df21,'Japan','Q1').sort_index()
CN_age_19 =  group(df19,'China','Q1').sort_index()
CN_age_21 =  group(df21,'China','Q1').sort_index()

fig = make_subplots(rows=1, cols=2, subplot_titles=('2019', '2021'))

# One subplot per survey year, plotting the counts computed above instead of
# placeholder values.
age_panels = [(JP_age_19, CN_age_19), (JP_age_21, CN_age_21)]
for panel_col, (jp_counts, cn_counts) in enumerate(age_panels, start=1):
    # Only the first panel contributes legend entries; the second repeats
    # the same names and colours.
    in_legend = panel_col == 1
    fig.add_bar(x=jp_counts.index, y=jp_counts.values,
                marker=dict(color=JP_colors[0]),
                name='Japan', showlegend=in_legend, row=1, col=panel_col)
    fig.add_bar(x=cn_counts.index, y=cn_counts.values,
                marker=dict(color=CN_colors[1]),
                name='China', showlegend=in_legend, row=1, col=panel_col)

fig.update_layout(
                    barmode='group',
                    title_text='Age distribution of China & Japan')
# Axis titles are set on every subplot, not only on the first one.
fig.update_xaxes(title_text='Age')
fig.update_yaxes(title_text='Counts')
fig.show()