In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#전공 계열별 활동 빈도 높은 feature 추출 코드

In [2]:
import pandas as pd

# 데이터 불러오기
df1 = pd.read_csv('/content/YP2021_w03_v4 (1).csv')

# 사용할 feature 및 grouping 변수 지정
"""
features =['y03a415','y03c292','y03c293','y03c297','y03c300','y03c303','y03c304',]
features = list(df.columns)
group_col = 'y03a413'
"""
features =['y03a415','y03c292','y03c293','y03c297','y03c300','y03c303','y03c304']
group_col = 'y03a413'   #전공 계열 인코딩 해서 준다고 함

# 결과 저장용
group_top_features = {}

# 계열(또는 학과)별 반복
for group, sub in df1.groupby(group_col):
    # 각 활동별 빈도(합계) 계산 (1/0 변수 기준)
    freq = sub[features].mean().sort_values(ascending=False)
    top_n = freq.head(3)   # top 3개
    group_top_features[group] = top_n

# 보기 쉽게 DataFrame으로 변환
top_df = pd.DataFrame(group_top_features)
print(top_df)

# (옵션) 계열별 Top-3 항목과 빈도 출력
for group, top3 in group_top_features.items():
    print(f"\n[{group}] 상위 3개 활동:")
    print(top3)

         1.0        2.0        3.0        4.0        5.0        6.0        \
y03a415   4.751445   4.859873   4.902941   4.957055   4.979487   4.847826   
y03c292   2.919355        NaN   2.953125   3.110390   2.721311   3.083333   
y03c297        NaN   2.979381        NaN        NaN        NaN        NaN   
y03c300   3.016129   3.170103   3.015625   3.298701   2.754098   3.250000   

         7.0        97.0       9090908.0  
y03a415   4.954545        5.0        5.0  
y03c292        NaN        NaN        5.0  
y03c297   2.744898        2.5        NaN  
y03c300   3.061224        2.5        5.0  

[1.0] 상위 3개 활동:
y03a415    4.751445
y03c300    3.016129
y03c292    2.919355
dtype: float64

[2.0] 상위 3개 활동:
y03a415    4.859873
y03c300    3.170103
y03c297    2.979381
dtype: float64

[3.0] 상위 3개 활동:
y03a415    4.902941
y03c300    3.015625
y03c292    2.953125
dtype: float64

[4.0] 상위 3개 활동:
y03a415    4.957055
y03c300    3.298701
y03c292    3.110390
dtype: float64

[5.0] 상위 3개 활동:
y03a415    4.9

#로지스틱 회귀 & 랜덤포레스트

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

class FeatureImportanceAnalyzer:
    def __init__(self, X, y, feature_names):
        self.X = X
        self.y = y
        self.feature_names = feature_names
        self.lr_model = None
        self.rf_model = None

    # === 중요도 출력 함수 ===
    def print_importance(self, importance, model_name, top_n=5):
        series = pd.Series(importance, index=self.feature_names)
        series = series.abs().sort_values(ascending=False)
        print(f"\n[{model_name}] 중요도 TOP {top_n}")
        print(series.head(top_n))
        return series.head(top_n)

    # === 로지스틱 회귀 ===
    def fit_logistic_regression(self, **kwargs):
        self.lr_model = LogisticRegression(max_iter=200, **kwargs)
        self.lr_model.fit(self.X, self.y)
        importance = self.lr_model.coef_[0]
        return self.print_importance(importance, "Logistic Regression")

    # === 랜덤포레스트 ===
    def fit_random_forest(self, **kwargs):
        self.rf_model = RandomForestClassifier(n_estimators=100, random_state=42, **kwargs)
        self.rf_model.fit(self.X, self.y)
        importance = self.rf_model.feature_importances_
        return self.print_importance(importance, "Random Forest")

    # === 로지스틱 회귀 하이퍼파라미터 튜닝 ===
    def tune_logistic_regression(self, param_grid=None, cv=5, scoring='accuracy'):
        if param_grid is None:
            param_grid = {
                'C': [0.01, 0.1, 1, 10],
                'penalty': ['l2'],
                'solver': ['lbfgs', 'liblinear'],
                'max_iter': [100, 200]
            }
        grid = GridSearchCV(LogisticRegression(), param_grid, cv=cv, scoring=scoring, n_jobs=-1)
        grid.fit(self.X, self.y)
        print("\n[Logistic Regression] Best Params:", grid.best_params_)
        print("[Logistic Regression] Best CV Score:", grid.best_score_)
        self.lr_model = grid.best_estimator_
        importance = self.lr_model.coef_[0]
        return self.print_importance(importance, "Logistic Regression (Tuned)")

    # === 랜덤포레스트 하이퍼파라미터 튜닝 ===
    def tune_random_forest(self, param_grid=None, cv=5, scoring='accuracy'):
        if param_grid is None:
            param_grid = {
                'n_estimators': [100, 200],
                'max_depth': [None, 5, 10],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2],
                'max_features': ['auto', 'sqrt']
            }
        grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=cv, scoring=scoring, n_jobs=-1)
        grid.fit(self.X, self.y)
        print("\n[Random Forest] Best Params:", grid.best_params_)
        print("[Random Forest] Best CV Score:", grid.best_score_)
        self.rf_model = grid.best_estimator_
        importance = self.rf_model.feature_importances_
        return self.print_importance(importance, "Random Forest (Tuned)")



In [5]:
# ---------------- 사용 예시 ----------------
df1['취업여부'] = df1['y01c101a'].notnull().astype(int)
target = df1['취업여부']
X = df1[features]
y = target

#결측치 처리 (예비)
X = X.fillna(X.median())

# 인스턴스 생성
analyzer = FeatureImportanceAnalyzer(X, y, features)

# 모델 학습 및 중요도

analyzer.fit_logistic_regression()
analyzer.fit_random_forest()

""" *시간 오래걸림*
# 하이퍼파라미터 튜닝
analyzer.tune_logistic_regression()
analyzer.tune_random_forest()
"""





[Logistic Regression] 중요도 TOP 5
y03c300    0.353341
y03c297    0.327131
y03c304    0.245633
y03c303    0.189937
y03a415    0.176283
dtype: float64

[Random Forest] 중요도 TOP 5
y03c300    0.217122
y03c297    0.196390
y03c293    0.160610
y03c304    0.144600
y03c292    0.142649
dtype: float64


' *시간 오래걸림*\n# 하이퍼파라미터 튜닝\nanalyzer.tune_logistic_regression()\nanalyzer.tune_random_forest()\n'


#같은 전공(계열) & 전공 내 취업자에서 특이치


**  전공 내 취업자만 다룬다.

In [8]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from statsmodels.stats.proportion import proportions_ztest

class HighEarnerByMajorGroupAnalyzer:
    def __init__(self, df, salary_col, major_group_col, activity_cols, major_match_score_col, high_percentile=0.6):
        self.df = df
        self.salary_col = salary_col
        self.major_group_col = major_group_col
        self.activity_cols = activity_cols
        self.major_match_score_col = major_match_score_col  # 일치도 점수 컬럼
        self.high_percentile = high_percentile
        self.results = []

    def _define_high_and_normal(self, sub):
        """고연봉자/일반자 집단 생성"""
        salary_z = (sub[self.salary_col] - sub[self.salary_col].mean()) / sub[self.salary_col].std()
        cutoff = salary_z.quantile(self.high_percentile)
        sub = sub.copy()
        sub['high_salary'] = (salary_z > cutoff).astype(int)
        high = sub[sub['high_salary'] == 1]
        normal = sub[sub['high_salary'] == 0]
        return high, normal

    def _compare_activities(self, high, normal):
        """각 활동별 고연봉자 vs 일반자 차이와 유의성 검정"""
        stats = {}
        for feat in self.activity_cols:
            try:
                mean_high = high[feat].mean()
                mean_normal = normal[feat].mean()
                diff = mean_high - mean_normal
                # 이진형(0/1)
                if set(high[feat].dropna().unique()) <= {0,1} and set(normal[feat].dropna().unique()) <= {0,1}:
                    count = np.array([high[feat].sum(), normal[feat].sum()])
                    nobs = np.array([len(high), len(normal)])
                    _, p = proportions_ztest(count, nobs)
                # 수치형
                else:
                    _, p = ttest_ind(high[feat], normal[feat], nan_policy='omit')
            except Exception:
                mean_high, mean_normal, diff, p = np.nan, np.nan, np.nan, np.nan
            stats[feat] = {
                '고연봉평균': mean_high,
                '일반평균': mean_normal,
                '차이': diff,
                'p_value': p
            }
        return stats

    def analyze(self, min_group=10, print_example_n=2, pvalue_threshold=0.4, diff_threshold=0.3):
        self.results = []
        # 1. 자기전공-일치도 점수 3 이상만 필터
        df_case = self.df[(self.df[self.major_match_score_col] >= 3) & (self.df[self.major_match_score_col] <= 5)]
        print(f"\n========== [자기전공-일치도 3 이상 5이하 그룹 분석] ==========")
        # 2. 전공계열별 반복
        for major, sub in df_case.groupby(self.major_group_col):
            if len(sub) < min_group:
                continue
            # 3. 고연봉자/일반자 집단 생성
            high, normal = self._define_high_and_normal(sub)
            if len(high) < 2 or len(normal) < 2:
                continue
            # 4. 활동별 차이/유의성 검정
            stats = self._compare_activities(high, normal)
            # 5. 고연봉자에서만 특이하게 큰 활동 마킹
            special_feats = {feat: v for feat, v in stats.items() if
                             v['p_value'] is not None and v['p_value'] < pvalue_threshold and abs(v['차이']) > diff_threshold}
            result = {
                '전공계열': major,
                '고연봉자수': len(high),
                '일반자수': len(normal),
                '전체차이통계': stats,
                '고연봉자특이활동': special_feats
            }
            self.results.append(result)
        # 6. 대표결과 출력
        for res in self.results[:print_example_n]:
            print(f"\n전공계열: {res['전공계열']} | 고연봉자:{res['고연봉자수']} / 일반자:{res['일반자수']}")
            print("▶ 고연봉자에서만 유의하게 특이한 활동 (차이+유의성):")
            for feat, v in res['고연봉자특이활동'].items():
                print(f"{feat}: 고연봉자평균={v['고연봉평균']:.2f}, 일반평균={v['일반평균']:.2f}, 차이={v['차이']:.2f}, p={v['p_value']:.3f}")
        return self.results




In [15]:
# --------- 사용 예시 ---------

salary_col = 'y01d302_left' #[첫] 첫직장 입사시 연봉 (만원)
major_group_col = 'y03a413' #전공 계열
activity_cols = ['y03a415','y03c292','y03c293','y03c297','y03c300','y03c303','y03c304'] #features
major_match_score_col = 'y03c245' #자기전공-일치도
analyzer = HighEarnerByMajorGroupAnalyzer(df1, salary_col, major_group_col, activity_cols, major_match_score_col)
results = analyzer.analyze(min_group=10, print_example_n=2, pvalue_threshold=0.2, diff_threshold=0.15)




  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)



전공계열: 1.0 | 고연봉자:4 / 일반자:30
▶ 고연봉자에서만 유의하게 특이한 활동 (차이+유의성):
y03c303: 고연봉자평균=1.75, 일반평균=2.73, 차이=-0.98, p=0.085
y03c304: 고연봉자평균=1.75, 일반평균=2.73, 차이=-0.98, p=0.085

전공계열: 2.0 | 고연봉자:14 / 일반자:95
▶ 고연봉자에서만 유의하게 특이한 활동 (차이+유의성):


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


#같은 직무에서 특이치


**  전공 취업자와 비전공취업자 따로 분석

In [6]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from statsmodels.stats.proportion import proportions_ztest

class HighEarnerByMajorMatchAnalyzer:
    def __init__(self, df, salary_col, job_col, activity_cols, major_match_col, high_percentile=0.9):
        self.df = df
        self.salary_col = salary_col
        self.job_col = job_col
        self.activity_cols = activity_cols
        self.major_match_col = major_match_col
        self.high_percentile = high_percentile
        self.results = []

    def _define_high_and_normal(self, sub):
        salary_z = (sub[self.salary_col] - sub[self.salary_col].mean()) / sub[self.salary_col].std()
        cutoff = salary_z.quantile(self.high_percentile)
        sub = sub.copy()
        sub['high_salary'] = (salary_z > cutoff).astype(int)
        high = sub[sub['high_salary'] == 1]
        normal = sub[sub['high_salary'] == 0]
        return high, normal

    def _compare_activities(self, high, normal):
        stats = {}
        for feat in self.activity_cols:
            try:
                mean_high = high[feat].mean()
                mean_normal = normal[feat].mean()
                diff = mean_high - mean_normal
                # 이진형(0/1)
                if set(high[feat].dropna().unique()) <= {0,1} and set(normal[feat].dropna().unique()) <= {0,1}:
                    count = np.array([high[feat].sum(), normal[feat].sum()])
                    nobs = np.array([len(high), len(normal)])
                    _, p = proportions_ztest(count, nobs)
                # 수치형
                else:
                    _, p = ttest_ind(high[feat], normal[feat], nan_policy='omit')
            except Exception:
                mean_high, mean_normal, diff, p = np.nan, np.nan, np.nan, np.nan
            stats[feat] = {
                '고연봉평균': mean_high,
                '일반평균': mean_normal,
                '차이': diff,
                'p_value': p
            }
        return stats

    def analyze(self, min_group=10, print_example_n=2, pvalue_threshold=0.05, diff_threshold=0.1):
        self.results = []
        # 정확하게 1,2=불일치, 3,4,5=일치
        group_cases = [
            {"label": "전공일치", "condition": lambda df: df[self.major_match_col].isin([3, 4, 5])},
            {"label": "전공불일치", "condition": lambda df: df[self.major_match_col].isin([1, 2])},
        ]
        for group in group_cases:
            group_name = group["label"]
            df_case = group["condition"](self.df)
            df_group = self.df[df_case]
            print(f"\n========== [{group_name} 그룹 분석] ==========")
            # 2. 직무코드별 반복
            for job, sub in df_group.groupby(self.job_col):
                if len(sub) < min_group:
                    continue
                high, normal = self._define_high_and_normal(sub)
                if len(high) < 2 or len(normal) < 2:
                    continue
                stats = self._compare_activities(high, normal)
                special_feats = {feat: v for feat, v in stats.items() if
                                 v['p_value'] is not None and v['p_value'] < pvalue_threshold and abs(v['차이']) > diff_threshold}
                result = {
                    '전공일치여부': group_name,
                    '직무코드': job,
                    '고연봉자수': len(high),
                    '일반자수': len(normal),
                    '전체차이통계': stats,
                    '고연봉자특이활동': special_feats
                }
                self.results.append(result)
            # 3. 그룹별 대표결과 출력
            group_results = [r for r in self.results if r['전공일치여부'] == group_name]
            for res in group_results[:print_example_n]:
                print(f"\n직무코드: {res['직무코드']} | 고연봉자:{res['고연봉자수']} / 일반자:{res['일반자수']}")
                print("▶ 고연봉자에서만 유의하게 특이한 활동 (차이+유의성):")
                for feat, v in res['고연봉자특이활동'].items():
                    print(f"{feat}: 고연봉자평균={v['고연봉평균']:.2f}, 일반평균={v['일반평균']:.2f}, 차이={v['차이']:.2f}, p={v['p_value']:.3f}")
        return self.results



In [7]:
# --------- 사용 예시 ---------

salary_col = 'y01d302_left'#[첫] 첫직장 입사시 연봉 (만원)
job_col = '직업코드'
activity_cols = ['y03a415','y03c292','y03c293','y03c297','y03c300','y03c303','y0c304'] #features
major_match_score_col = 'y03c245' #자기전공-일치도
analyzer = HighEarnerByMajorMatchAnalyzer(df1, salary_col, job_col, activity_cols, major_match_score_col)
results = analyzer.analyze(min_group=10, print_example_n=2, pvalue_threshold=0.05, diff_threshold=0.1)

NameError: name 'df' is not defined