### 출처
- 대회 : https://www.kaggle.com/competitions/kaggle-survey-2022/data
- 필사 스터디 : https://www.kaggle.com/code/eraikako/data-science-and-mlops-landscape-in-industry

#### Libraries

In [1]:
import pandas as pd
import numpy as np
import json
from collections import Counter

import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
from plotly.colors import n_colors

from IPython.core.display import display, HTML, Javascript 
import IPython.display
from IPython.display import display, clear_output

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

import matplotlib as mpl
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



#### Load Data

In [2]:
df = pd.read_csv('./Data/kaggle_survey_2022_responses.csv')

In [7]:
# 서베이 질문 따로 빼기
questions_titles = df[0:1] # 0번 행만 가져오기 
questions_titles

Unnamed: 0,Duration (in seconds),Q2,Q3,Q4,Q5,Q6_1,Q6_2,Q6_3,Q6_4,Q6_5,...,Q44_3,Q44_4,Q44_5,Q44_6,Q44_7,Q44_8,Q44_9,Q44_10,Q44_11,Q44_12
0,Duration (in seconds),What is your age (# years)?,What is your gender? - Selected Choice,In which country do you currently reside?,"Are you currently a student? (high school, uni...",On which platforms have you begun or completed...,On which platforms have you begun or completed...,On which platforms have you begun or completed...,On which platforms have you begun or completed...,On which platforms have you begun or completed...,...,Who/what are your favorite media sources that ...,Who/what are your favorite media sources that ...,Who/what are your favorite media sources that ...,Who/what are your favorite media sources that ...,Who/what are your favorite media sources that ...,Who/what are your favorite media sources that ...,Who/what are your favorite media sources that ...,Who/what are your favorite media sources that ...,Who/what are your favorite media sources that ...,Who/what are your favorite media sources that ...


In [9]:
# 서베이 답변으로만 데이터 구성
df = df[1:]
df.head()

Unnamed: 0,Duration (in seconds),Q2,Q3,Q4,Q5,Q6_1,Q6_2,Q6_3,Q6_4,Q6_5,...,Q44_3,Q44_4,Q44_5,Q44_6,Q44_7,Q44_8,Q44_9,Q44_10,Q44_11,Q44_12
1,121,30-34,Man,India,No,,,,,,...,,,,,,,,,,
2,462,30-34,Man,Algeria,No,,,,,,...,,,,,,,,,,
3,293,18-21,Man,Egypt,Yes,Coursera,edX,,DataCamp,,...,,"Kaggle (notebooks, forums, etc)",,"YouTube (Kaggle YouTube, Cloud AI Adventures, ...","Podcasts (Chai Time Data Science, O’Reilly Dat...",,,,,
4,851,55-59,Man,France,No,Coursera,,Kaggle Learn Courses,,,...,,"Kaggle (notebooks, forums, etc)","Course Forums (forums.fast.ai, Coursera forums...",,,"Blogs (Towards Data Science, Analytics Vidhya,...",,,,
5,232,45-49,Man,India,Yes,,,,,,...,,,,,,"Blogs (Towards Data Science, Analytics Vidhya,...",,,,


#### Visualization 함수

In [19]:
def wrap_df_text(df):
    return display(HTML(df.style.background_gradient(axis=0,  cmap='YlOrBr', subset=["Average number of selected choices"]).to_html().replace("\\n","<br>")))

### 종사자 구분

- Q5 질문(Are you currently a student?)에 'No'라고 응답
- Q23 질문(Select the title most similar to your current role)에 'Currently not employed'을 제외한 나머지 선택지에서 응답
- Q24 질문(In what industry is your current employer/contract?)에 응답

In [15]:
scope_df = df[
    (df["Q5"] == "No") &
    (df["Q23"] != "Currently not employed") &
    (df["Q5"].notnull())
]

## 이상치 / 평균 응답 수

- 이 데이터셋에서 '이상치'의 정의 = multiple choice 모든 선택지에 체크 표시한 사람

1. multiple choice 질문 & 선택지 항목 수

In [12]:
multiple_choice_questions = {} #multiple_choice 질문과 선택지 개수를 저장할 딕셔너리
seen_columns = [] #이미 처리한 질문을 기록하는 리스트

for col in df.columns: #각 컬럼 순회 
    question = col.split("_")[0] #예시: 'Q6_1' -> 'Q6' 부분을 질문으로 간주
    
    # 이미 처리된 질문이라면
    if question in seen_columns:
        
        ## 처음 추가하는 질문이라면
        if question not in multiple_choice_questions: 
            multiple_choice_questions[question] = 2 #선택지 개수 2로 설정 (기본적으로 2개의 선택지 존재)
        
        ## 이미 있는 질문이라면
        else: 
            multiple_choice_questions[question] = multiple_choice_questions[question] + 1 #선택지 개수 1 증가
    else:
        seen_columns.append(question)

In [13]:
multiple_choice_questions

{'Q6': 12,
 'Q7': 7,
 'Q10': 3,
 'Q12': 15,
 'Q13': 14,
 'Q14': 16,
 'Q15': 15,
 'Q17': 15,
 'Q18': 14,
 'Q19': 8,
 'Q20': 6,
 'Q21': 10,
 'Q28': 8,
 'Q31': 12,
 'Q33': 5,
 'Q34': 8,
 'Q35': 16,
 'Q36': 15,
 'Q37': 13,
 'Q38': 8,
 'Q39': 12,
 'Q40': 15,
 'Q41': 9,
 'Q42': 9,
 'Q44': 12}

2. 응답 수 계산 함수

In [16]:
def extract_the_number_of_responses(question_title : str, #질문명 (Q1, Q2 등)
                                    row : pd.Series, # 위 질문에 대한 응답자 데이터 확인 (행)
                                    columns_list : list) #질문명에 대한 모든 컬럼 확인 (Q1_A, Q1_B 등)
                                    -> str: #왜 str로 반환??
    
    num_responses = 0  #선택지 개수 저장 변수 초기화
    for col in columns_list:  #모든 컬럼을 하나씩 순회
        if col.startswith(question_title):  #현재 컬럼이 주어진 질문에 해당하는지
            if not pd.isnull(row[col]):  #결측치가 아닌 경우만 응답으로 간주
                num_responses = num_responses +1  #응답 개수를 1씩 증가
    return num_responses #응답 개수 반환

3. 응답 수 df에 추가

In [17]:
# 각 질문(col)에 대해 반복 실행
for col in list(multiple_choice_questions.keys()): #keys = 질문명 (e.g. Q1, Q2) 
    
    # 각 질문에 대한 새로운 열 생성 (e.g. Q1_number_of_responses)
    scope_df[f"{col}_number_of_responses"] = scope_df.apply(
        
        # col : "Q1" 일때
        # df.columns : 이에 해당하는 모든 컬럼을 확인하고(Q1_A, Q1_B 등)
        # x : 응답자가 선택한 항목(결측치가 아닌)의 개수를 반환 
        # axis = 1 : 함수가 행 단위로 적용되게 함 
        lambda x : extract_the_number_of_responses(col,x,df.columns),
                                                   axis = 1)

4. 이상치 / 평균 응답 수 확인

In [21]:
# 빈리스트 생성
outlier_analysis = []

for col in list(multiple_choice_questions.keys()):
    mean_responses = round(scope_df[f"{col}_number_of_responses"].mean()) #응답 평균 수 계산
    #리스트에 추가 
    outlier_analysis.append([
        col,  #질문
        multiple_choice_questions[col],  #선택지 개수
        mean_responses,   #평균 응답 수
    ])

In [22]:
# 데이터 프레임 생성 
average_responses = pd.DataFrame(outlier_analysis,
                                columns = ["Question",
                                           "Nbr of available Choices",
                                           "Average number of selected choices"])

In [26]:
# 컬럼 추가 - 질문 제목
average_responses["Question Title"] = questions_titles[
    [f"{col}_1" for col in list(multiple_choice_questions.keys())]
    ].loc[0].to_list()
## 질문에 "Select 어쩌구" 있을 때 "Select 제거"
average_responses["Question Title"] = average_responses[
    "Question Title"].apply(lambda x : x.split("(Select")[0].strip())


# scope_df에서 {col}_number_of_respones 컬럼 삭제인데 왜 삭제해야 되는지는 모르겠음
# average_responses 데이터 프레임으로 새로 만들었는데 (?)
scope_df.drop([f"{col}_number_of_responses" for col in list(multiple_choice_questions.keys())],
             axis=1, inplace=True)

# 질문 제목의 텍스트를 '80자' 기준으로 줄 바꿈 처리 
average_responses["Question Title"] = average_responses['Question Title'].str.wrap(80)

# 데이터 프레임 열 순서 재정렬
average_responses = average_responses[["Question",
                                      "Question Title",
                                      "Nbr of available Choices",
                                      "Average number of selected choices"]]
wrap_df_text(average_responses)

Unnamed: 0,Question,Question Title,Nbr of available Choices,Average number of selected choices
0,Q6,On which platforms have you begun or completed data science courses?,12,2
1,Q7,What products or platforms did you find to be most helpful when you first started studying data science?,7,2
2,Q10,"Did your research make use of machine learning? - Yes, the research made advances related to some novel machine learning method (theoretical research)",3,0
3,Q12,What programming languages do you use on a regular basis?,15,2
4,Q13,Which of the following integrated development environments (IDE's) do you use on a regular basis?,14,2
5,Q14,Do you use any of the following hosted notebook products?,16,1
6,Q15,Do you use any of the following data visualization libraries on a regular basis?,15,2
7,Q17,Which of the following machine learning frameworks do you use on a regular basis?,15,2
8,Q18,Which of the following ML algorithms do you use on a regular basis?,14,2
9,Q19,Which categories of computer vision methods do you use on a regular basis?,8,1
