In [1]:
import numpy as np
import pandas as pd 
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import graph_objects
import plotly.figure_factory as ff
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

### <strong>Hello kaggler!</strong><br> we are job seeker in Korea who have been learning Python for a month. <br>We are very interested in the field of IT and big data analysis. In the meantime, there was a good competition in kaggle, so we participated. <br>Although it was not enough, we worked hard to make results. we hope you enjoy watching it.

# Comparative analysis of KOR, CH, JAP, and the USA
## - Comparison of the development of the IT industry through the kaggle user survey
- Kaggle is a world-renowned predictive model and analysis competition platform. When numerous companies and organizations register data and challenges, data scientists develop and compete for models to solve them.<p>
- As such, anyone who has entered the field of data analysis will participate in Kaggle.<p>
- The Kaggle user survey focuses on the data of people engaged in the job or studying for employment.<p>
- By comparing and analyzing this, we simply compared the degree of development of the IT industry in the United States and three other countries, which are the leaders of the IT industry.

In [2]:
df_2021 = pd.read_csv('../input/my-data/kaggle_survey_2021_responses.csv')
df_2018 = pd.read_csv('../input/my-data/2018_kaggle_ds_and_ml_survey_responses_only.csv')


Columns (0,195,201,285,286,287,288,289,290,291,292) have mixed types.Specify dtype option on import or set low_memory=False.


Columns (0,2,8,10,21,23,24,25,26,27,28,44,56,64,83,85,87,107,109,123,125,150,157,172,174,194,210,218,219,223,246,249,262,264,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,304,306,325,326,329,341,368,371,384,385,389,390,391,393,394) have mixed types.Specify dtype option on import or set low_memory=False.



In [3]:
df_2021_data= df_2021[df_2021['Q3'].isin(['United States of America', 'Japan', 'China', 'South Korea'])].reset_index(drop=True)
df_2021_asia = df_2021[df_2021['Q3'].isin(['Japan', 'China', 'South Korea'])].reset_index(drop=True)
df_2021_usa = df_2021[df_2021['Q3'].isin(['United States of America'])].reset_index(drop=True)
df_2018_data= df_2018[df_2018['Q3'].isin(['United States of America', 'Japan', 'China', 'South Korea'])]

In [4]:
df_2021_data['Q3'].replace(['United States of America', 'South Korea', 'Japan', 'China'],['USA', 'KOR', 'JAP', 'CH'], inplace=True)
df_2021_asia['Q3'].replace(['South Korea', 'Japan', 'China'],['KOR', 'JAP', 'CH'], inplace=True)
df_2021_usa['Q3'].replace('United States of America', 'USA', inplace=True)
df_2018_data['Q3'].replace(['United States of America', 'South Korea', 'Japan', 'China'],['USA', 'KOR', 'JAP', 'CH'], inplace=True)
df_2021_data['Q4'].replace(['Some college/university study without earning a bachelor’s degree', 'No formal education past high school'],['Vocational college', 'A high school graduate'], inplace=True)

# 1. 2018 vs 2021. Korea, China, Japan, and the U.S. Kagle response comparison
- The total number of respondents in the Kaggle survey was compared in 2018, similar to 2021.

In [5]:
q3_df_2018= df_2018_data.groupby(['Q3']).size().reset_index().rename(columns = {0:"Count"})
q3_df_2021= df_2021_data.groupby(['Q3']).size().reset_index().rename(columns = {0:"Count"})

fig = go.Figure()
fig.add_trace(go.Bar(
             x= q3_df_2021['Q3'],
             y=q3_df_2021['Count'],
             text = q3_df_2021['Count'],
             name='2021 of Kaggle User'
            ))
fig.add_trace(go.Bar(
             x= q3_df_2018['Q3'],
             y=q3_df_2018['Count'],
             text = q3_df_2018['Count'],
             name='2018 of Kaggle User'
            ))
 
fig.update_layout(barmode='group', xaxis_tickangle=-30,showlegend=True,
                 template = "plotly_white", title='2021 of Kaggle users')     
fig.show()

## Analysis result
- In the case of the United States and China, the number of respondents in 2021 decreased by nearly half compared to 2018.<p>
- In the case of Korea and Japan, on the contrary, it nearly doubled.<p>
- Through this, it is expected that the number of kaggle users is gradually decreasing in China and the United States, while on the contrary, it will be increasing in Korea and Japan.<p>

# 2. Comparison of age groups between Korea, China, Japan and the United States
- The United States is leading the IT and big data analysis-based industries.<p>
- Korea, China, and Japan are investing a lot in the field as latecomers.<p>
- By comparing the age distribution of kaggle users by country, you will be able to see how much attention is being paid to the current field in the three countries.<p>
- In addition, it is expected that the comparison data for age groups can serve as an indicator of the programming period, degree level, major occupational groups, and salary to be analyzed later.
    

In [6]:
q3_q1 = df_2021_data.groupby(['Q3', 'Q1']).size().reset_index().rename(columns = {0:"Count"})

def get_pnt(data, country):
    data_country = data[data['Q3'] == country].reset_index(drop = True)
    data_country['percentage'] = data_country["Count"] / data_country["Count"].sum()
    data_country['%'] = np.round(data_country['percentage'] * 100, 1)
    
    return data_country

usa_df = get_pnt(q3_q1, "USA")
china_df = get_pnt(q3_q1, "CH")
japan_df = get_pnt(q3_q1, "JAP")
korea_df = get_pnt(q3_q1, "KOR")

In [7]:
fig = make_subplots(rows=2, cols=2, subplot_titles=("USA with Q1", "China with Q1", "Japan with Q1", "Korea with Q1"), column_widths = [2, 2],
                    specs=[[{'type':'domain'}, {'type':'domain'}],
                          [{'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(labels = usa_df['Q1'], 
                     values = usa_df['%'], 
                     ),  row = 1, col = 1)

fig.add_trace(go.Pie(labels = china_df['Q1'], 
                     values = china_df['%'], 
                      ), row = 1, col = 2)

fig.add_trace(go.Pie(labels = japan_df['Q1'], 
                     values = japan_df['%'], 
                    ),  row = 2, col = 1)

fig.add_trace(go.Pie(labels = korea_df['Q1'], 
                     values = korea_df['%'], 
                     ), row = 2, col = 2)
fig.update_layout(height = 1000,
                  showlegend=True,
                 template = "plotly_white")

fig.show()

## Analysis result
- In the United States, it can be seen that developments in the field have been made for a long time. In particular, 71.3% of respondents said they were 22 to 49. It can be expected that a large number of people in the United States are participating in data analysis and IT industries, and at the same time, development is quite high.<p>
- In the case of Japan, the distribution was quite similar to that of the United States. 74.4% of respondents said they were 22 to 49 years old, 3.1% higher than in the United States. However, the difference between the two countries was shown in the detailed average age distribution. In the case of the United States, 69.1% of those aged 30 or older and 63.5% of those aged 30 or older are ahead of the United States by 5.6%.<p>
- In the case of China, the proportion of younger generations was relatively large. The percentage of people aged 18 to 39 reached 96.8%. This shows a great correlation with the results of the data to be analyzed later.<p>
-In the case of Korea, the distribution map of the middle age group stands out. The age group under the age of 21 and over 55 is very small compared to other countries. This can be understood by considering the Korean education system and the culture in which companies over 55 encourage retirement.
In addition, 29.5% of the respondents aged 25 to 29 years old compared to 12.8% of the 22 to 24 years old distribution is understandable considering that most of the respondents in the kaggle survey were male and that the average military life of Korean men is understandable. It can be expected that most of the respondents are studying in the current field from the age of 25 to 34 after being discharged from the military and hoping for the job.

# 3. Comparing the distribution of Korea, China, Japan, and US programming activities
- If the ages of the four countries were compared, this time it is a full-fledged comparison of the duration of programming activities by country.<p>
  1. In the case of the United States, it is expected to show a distribution similar to that of the age group.<p>
  2. In the case of Japan, it is similar to the United States, but it is expected that there will be more distributions of shorter careers during the activity period.<p>
  3. In the case of China, as there are many younger generations, there will be a large distribution of short periods.<p>
  4. In the case of Korea, it will be similar to or more than China.

In [8]:
q3_q6 = df_2021_data.groupby(['Q3', 'Q6']).size().reset_index().rename(columns = {0:"Count"})

def get_pnt(data, country):
    data_country = data[data['Q3'] == country].reset_index(drop = True)
    data_country['percentage'] = data_country["Count"] / data_country["Count"].sum()
    data_country['%'] = np.round(data_country['percentage'] * 100, 1)
    
    return data_country

usa_df = get_pnt(q3_q6, "USA")
china_df = get_pnt(q3_q6, "CH")
japan_df = get_pnt(q3_q6, "JAP")
korea_df = get_pnt(q3_q6, "KOR")

In [9]:
fig = make_subplots(rows=2, cols=2, subplot_titles=("USA with Q6", "China with Q6", "Japan with Q6", "Korea with Q6"), column_widths = [2, 2],
                    specs=[[{'type':'domain'}, {'type':'domain'}],
                          [{'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(labels = usa_df['Q6'], 
                     values = usa_df['%'], 
                     ),  row = 1, col = 1)

fig.add_trace(go.Pie(labels = china_df['Q6'], 
                     values = china_df['%'], 
                      ), row = 1, col = 2)

fig.add_trace(go.Pie(labels = japan_df['Q6'], 
                     values = japan_df['%'], 
                    ),  row = 2, col = 1)

fig.add_trace(go.Pie(labels = korea_df['Q6'], 
                     values = korea_df['%'], 
                     ), row = 2, col = 2)
fig.update_layout(height = 1000,
                  showlegend=True,
                 template = "plotly_white")

fig.show()

## Analysis result
- In the case of the United States, as expected, the rate was somewhat related to the age group. Most of the activity periods were evenly distributed, and 12.9% of the respondents said they were less than one year. In particular, 30.5% of respondents answered that it was more than 10-20+ compared to the other three countries.<p>
- In the case of Japan, distribution was evenly distributed like in the United States, but there was a difference in detailed distribution. The most noticeable is the rate from less than 1 year to less than 3 years. The United States accounted for 32.9%, while Japan accounted for 46.2%. This seems to be a bigger difference from the distribution of age group responses compared above. One thing to predict is that the data analysis field began to develop later than the United States, and people who majored in other jobs and majors began programming as the industry developed.<p>
- In the case of China, 84.7% of the respondents said it was less than 5 years old. In addition, only 6.4% of respondents said they had more than 10 years of experience. This can be guessed that a lot of interest and investment in the IT and big data fields are currently being made in China.<p>
- In the case of Korea, 68.8% of the respondents said it was less than 5 years, which was relatively small compared to China. In particular, the percentage of respondents said that the difference was less than a year of experience. This is expected to be because there are alternative media such as awareness of kaggle in Korea, various big data analysis competitions held in Korea, and competitions held by universities, companies, and public institutions.

# 4. Comparing the distribution of degrees between Korea, China, Japan and the US kaggle users
- This is a comparison of the degree level distribution of respondents in the data field in Korea, China, Japan, and the United States.<p>
- By analyzing the level of degrees in the data field by four countries, the association with major occupations and salary levels is compared.<p>

In [10]:
q3_q4 = df_2021_data.groupby(['Q3', 'Q4']).size().reset_index().rename(columns = {0:"Count"})

def get_pnt(data, country):
    data_country = data[data['Q3'] == country].reset_index(drop = True)
    data_country['percentage'] = data_country["Count"] / data_country["Count"].sum()
    data_country['%'] = np.round(data_country['percentage'] * 100, 1)
    
    return data_country

usa_df = get_pnt(q3_q4, "USA")
china_df = get_pnt(q3_q4, "CH")
japan_df = get_pnt(q3_q4, "JAP")
korea_df = get_pnt(q3_q4, "KOR")


In [11]:
fig = make_subplots(rows=2, cols=2, subplot_titles=("USA with Q4", "China with Q4", "Japan with Q4", "Korea with Q4"), column_widths = [2, 2],
                    specs=[[{'type':'domain'}, {'type':'domain'}],
                          [{'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(labels = usa_df['Q4'], 
                     values = usa_df['%'], 
                     ),  row = 1, col = 1)

fig.add_trace(go.Pie(labels = china_df['Q4'], 
                     values = china_df['%'], 
                      ), row = 1, col = 2)

fig.add_trace(go.Pie(labels = japan_df['Q4'], 
                     values = japan_df['%'], 
                    ),  row = 2, col = 1)

fig.add_trace(go.Pie(labels = korea_df['Q4'], 
                     values = korea_df['%'], 
                     ), row = 2, col = 2)
fig.update_layout(height = 1000,
                  showlegend=True,
                 template = "plotly_white")

fig.show()

## Analysis result
- Excluding Korea, more than 40% of master's degrees were distributed in the United States, China, and Japan. On the other hand, in Korea, 35.4% of those who obtained a bachelor's degree had the highest percentage.<p>
- In the case of the proportion of Ph.D. recipients, China was the lowest at 8.6%, and more than 10% were distributed in the rest of the countries.<p>
- In the case of the United States, the rate of no degree was the lowest, and in Japan, the rate of no degree was the highest.<p>
- It is expected that the level of education will affect the salary level and occupation in the United States, where the overall education level is high, and Korea, which has the highest bachelor's ratio.

# 5. Comparison of job distribution related to data analysis between Korea, China and Japan vs. the United States
- The big data field is currently being used in various fields.<p>
- In this regard, occupations are gradually subdivided, and they are affecting the primary, secondary, and tertiary industries beyond just the IT sector. Let's compare the distribution of these occupations by country.

In [12]:
q3_q5 = df_2021_data.groupby(['Q3', 'Q5']).size().reset_index().rename(columns = {0:"Count"})

def get_pnt(data, country):
    data_country = data[data['Q3'] == country].reset_index(drop = True)
    data_country['percentage'] = data_country["Count"] / data_country["Count"].sum()
    data_country['%'] = np.round(data_country['percentage'] * 100, 1)
    
    return data_country

usa_df = get_pnt(q3_q5, "USA")
china_df = get_pnt(q3_q5, "CH")
japan_df = get_pnt(q3_q5, "JAP")
korea_df = get_pnt(q3_q5, "KOR")

In [13]:
fig = make_subplots(rows=2, cols=2, subplot_titles=("USA with Q5", "China with Q5", "Japan with Q5", "Korea with Q5"), column_widths = [2, 2],
                    specs=[[{'type':'domain'}, {'type':'domain'}],
                          [{'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(labels = usa_df['Q5'], 
                     values = usa_df['%'], 
                     ),  row = 1, col = 1)

fig.add_trace(go.Pie(labels = china_df['Q5'], 
                     values = china_df['%'], 
                      ), row = 1, col = 2)

fig.add_trace(go.Pie(labels = japan_df['Q5'], 
                     values = japan_df['%'], 
                    ),  row = 2, col = 1)

fig.add_trace(go.Pie(labels = korea_df['Q5'], 
                     values = korea_df['%'], 
                     ), row = 2, col = 2)
fig.update_layout(height = 1000,
                  showlegend=True,
                 template = "plotly_white")

fig.show()

## Analysis result
- In the case of the United States, the ratio of data scientists and data analysts was relatively high. What's unusual is that users choose the "other" option that is not in the option the most. As mentioned earlier, data analysis is used in various industries, so it can be expected that the big data field is being used even if it is not a directly related industry, and accordingly, people from non-choice occupations participated.<p>
- In the case of Japan, there was a difference in contrast to the United States. It is the percentage of software engineers' responses. Most software engineers are generally classified as developers. This shows that Japan does not have many jobs specializing in data analysis, and is leading the field in the classical developer field.<p>
- In the case of China, the proportion of students was overwhelmingly large. In addition, the top three occupations, excluding students, were found to require professional knowledge in the big data field. Through this, once again, it can be seen that China is currently investing and interested in the big data field.<p>
- In the case of Korea, it was not much different from China. In detail, it is showing interest in big data in more diverse fields than in China.

# 6. Korea, China, Japan vs. US user's salary level comparison
- The big data field is currently known as the most popular job in Korea and at the same time, a job that can receive a large salary.<p>
- Let's check if these types of benefits are similar in other countries.<p>
- Each country has a different general national level, but I think it will be possible to confirm significant differences depending on the degree, career, and occupation above.

In [14]:
df_2021_data.loc[df_2021_data['Q25'] == '$0-999' , 'Q25'] = '0-4,999'
df_2021_data.loc[df_2021_data['Q25'] == '1,000-1,999' , 'Q25'] = '0-4,999'
df_2021_data.loc[df_2021_data['Q25'] == '2,000-2,999' , 'Q25'] = '0-4,999'
df_2021_data.loc[df_2021_data['Q25'] == '3,000-3,999' , 'Q25'] = '0-4,999'
df_2021_data.loc[df_2021_data['Q25'] == '4,000-4,999' , 'Q25'] = '0-4,999'
df_2021_data.loc[df_2021_data['Q25'] == '5,000-7,499' , 'Q25'] = '5,000-9,999'
df_2021_data.loc[df_2021_data['Q25'] == '7,500-9,999' , 'Q25'] = '5,000-9,999'
df_2021_data.loc[df_2021_data['Q25'] == '10,000-14,999'  , 'Q25'] = '10,000-19,999'
df_2021_data.loc[df_2021_data['Q25'] == '15,000-19,999' , 'Q25'] = '10,000-19,999'
df_2021_data.loc[df_2021_data['Q25'] == '20,000-24,999'  , 'Q25'] = '20,000-29,999'
df_2021_data.loc[df_2021_data['Q25'] == '25,000-29,999' , 'Q25'] = '20,000-29,999'
df_2021_data.loc[df_2021_data['Q25'] == '150,000-199,999' , 'Q25'] = '150,000-249,999'
df_2021_data.loc[df_2021_data['Q25'] == '200,000-249,999' , 'Q25'] = '150,000-249,999'
df_2021_data.loc[df_2021_data['Q25'] == '250,000-299,999' , 'Q25'] = '250,000-499,999'
df_2021_data.loc[df_2021_data['Q25'] == '300,000-499,999' , 'Q25'] = '250,000-499,999'
df_2021_data.loc[df_2021_data['Q25'] == '$500,000-999,999' , 'Q25'] = '> 500000'
df_2021_data.loc[df_2021_data['Q25'] == '>$1,000,000' , 'Q25'] = '> 500000'

In [15]:
df_2021_data_2=df_2021_data.dropna(subset=['Q25'])

In [16]:
q3_q25 = df_2021_data_2.groupby(['Q3', 'Q25']).size().reset_index().rename(columns = {0:"Count"})

def get_pnt(data, country):
    data_country = data[data['Q3'] == country].reset_index(drop = True)
    data_country['percentage'] = data_country["Count"] / data_country["Count"].sum()
    data_country['%'] = np.round(data_country['percentage'] * 100, 1)
    
    return data_country

usa_df = get_pnt(q3_q25, "USA")
china_df = get_pnt(q3_q25, "CH")
japan_df = get_pnt(q3_q25, "JAP")
korea_df = get_pnt(q3_q25, "KOR")

In [17]:

fig = make_subplots(rows = 2, cols = 2, 
                    shared_xaxes=True, 
                    shared_yaxes=True, 
                    vertical_spacing = 0.05,
                    subplot_titles=("USA with Q25", "China with Q25", "Japan with Q25", "Korea with Q25"),
                    column_widths = [0.5, 0.5]) 

fig.add_trace(go.Bar(x = usa_df['Q25'], 
                     y = usa_df['%'], 
                     text = usa_df['%'].astype(str) + "%", 
                     textposition='outside'), 
              row = 1, col = 1)

fig.add_trace(go.Bar(x = china_df['Q25'], 
                     y = china_df['%'], 
                     text = china_df['%'].astype(str) + "%", 
                     textposition='outside'), 
              row = 1, col = 2)

fig.add_trace(go.Bar(x = japan_df['Q25'], 
                     y = japan_df['%'], 
                     text = japan_df['%'].astype(str) + "%", 
                     textposition='outside'), 
              row = 2, col = 1)

fig.add_trace(go.Bar(x = korea_df['Q25'], 
                     y = korea_df['%'], 
                     text = korea_df['%'].astype(str) + "%", 
                     textposition='outside'), 
              row = 2, col = 2)
fig.update_xaxes(categoryorder='array', categoryarray= [ '0-4999','5,000-9,999', '10,000-19,999','20,000-29,999','30,000-39,999','40,000-49,999','50,000-59,999'
,'60,000-69,999','70,000-79,999','80,000-89,999','90,000-99,999','100,000-124,999','125,000-149,999','150,000-249,999','250,000-499,999','> 500000'])


fig.update_layout(height = 1000,
                  showlegend=False,
                 template = "plotly_white")

fig.show()

## Analysis result
- In the United States, 44.8% of the top three items in the salary level distribution ranged from $100,000 to $250,000. Another important part is that it is $250,000 to $500,000, which is overwhelmingly high at 7.7%. This seems to be somewhat related to the distribution of programming periods in the United States, with 47.1% of users responding with a programming period of 5 to 20 years or more, which is close to the number of users responding with $100,000 to $250,000. Of course, since it is a period of programming rather than a period in office, Kaggle users in the United States with the longest programming period seem to receive a high level of salary, even considering some errors.<p>
- In the case of Japan and Korea, the graph form is very similar. The distribution of 0-4,999 dollars is the largest, followed by the top five distributions. This seems to have been largely attributed to the similar salary levels between Japan and Korea.<p>
- The highest ratio of 0 to 4,999 in Korea, China, and Japan is close to the proportion of students in the job distribution of the three countries. With 47.4% of China, 16.1% of Japan, and 27.3% of Korea, the percentage of students' responses seems to have played a major role in the response to the salary level.

# Analysis review
- South Korea, the United States, China, and Japan compared four countries. There were some similarities to the figures expected before the comparative analysis, but interesting parts were also found.<p>
- In the case of Korea, which had a large number of respondents for less than five years, the majority of users answered that it was more than $40,000, unlike China.<p>
- On top of that, considering that the top three items except for students are data analysis, it can be seen that the field is currently a fairly high-paid job in Korea.<p>
- In addition, if you look at the situation in Korea, where most of them have less than five years of experience and have the highest bachelor's rate, you can expect that career, degree, job diversification, and salary level can also grow rapidly over time.<p>
- However, in terms of the highest salary, Korea, China, and Japan still seem to be lacking. On the other hand, the United States pays the highest salary. However, the number of users entering the data field in the United States is smaller than in other countries. In other words, it seems that students from three countries, Korea, China, and Japan, have a good view of studying and experiencing a lot and entering the United States.<p>
- In the case of China, it was showing the form of the most latecomer in the development of the data sector. Although the proportion of students will be high in the future, the distribution of job groups focused only on data analysis, and the wage of 0-4,999 dollars will exceed half of all respondents, it is expected that many users will grow rapidly as they study to enter the field.