In [5]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# Read the enrollee CSV into a DataFrame and preview its first two rows.
df = pd.read_csv(filepath_or_buffer = "02_Test_Data_Set/edu_enrollees.csv")
df.head(n = 2)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,Xgrp
0,8949.0,city_103,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,,,1,36.0,1.0,train
1,29725.0,city_40,0.776,Male,No relevant experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47.0,0.0,train


In [8]:
# Remove the city, company_size and company_type columns.
df = df.drop(columns = ["city", "company_size", "company_type"])

In [9]:
# Number of missing values in each column.
df.isnull().sum()

enrollee_id                  0
city_development_index       0
gender                    4508
relevant_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
last_new_job               423
training_hours               0
target                       0
Xgrp                         0
dtype: int64

In [10]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis = 0, how = "any")

In [11]:
# Re-check the per-column missing counts after dropping incomplete rows.
df.isnull().sum()

enrollee_id               0
city_development_index    0
gender                    0
relevant_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
last_new_job              0
training_hours            0
target                    0
Xgrp                      0
dtype: int64

In [12]:
# Distinct values stored in the experience column.
df.loc[:, "experience"].unique()

array(['>20', '15', '13', '7', '5', '16', '4', '11', '<1', '18', '19',
       '12', '10', '9', '2', '6', '14', '3', '8', '20', '17', '1'],
      dtype=object)

In [16]:
# Preview (first five rows) of the flag marking the ">20" / "<1" experience values.
df["experience"].isin([">20", "<1"]).iloc[:5]

0     True
1    False
4     True
7    False
8    False
Name: experience, dtype: bool

In [9]:
# Drop the rows whose experience or last_new_job is one of the
# non-numeric categories, so both columns can be converted to integers.
df = (
    df
    .loc[lambda frame: ~frame["experience"].isin([">20", "<1"])]
    .loc[lambda frame: ~frame["last_new_job"].isin([">4", "never"])]
)

In [11]:
# Convert both columns to integers in a single astype call.
# astype returns a new DataFrame, so nothing is assigned onto the frame that
# came out of the boolean .loc filtering; this avoids pandas'
# SettingWithCopyWarning while producing the same dtypes as before.
df = df.astype({"experience": "int", "last_new_job": "int"})

In [10]:
# Number of rows left after filtering.
df.shape[0]

7522

In [12]:
# Base frame for all questions: same rows, index renumbered from 0
# (the old index is discarded rather than kept as a column).
base = df.reset_index(drop = True, inplace = False)

### Q1.

In [13]:
# Q1 works with the experience flag and the target label only.
df_q1 = base.loc[:, ["relevant_experience", "target"]]
df_q1.head(n = 2)

Unnamed: 0,relevant_experience,target
0,Has relevant experience,1.0
1,Has relevant experience,0.0


In [14]:
# Distinct labels of the relevant_experience column.
df_q1.loc[:, "relevant_experience"].unique()

array(['Has relevant experience', 'No relevant experience'], dtype=object)

In [15]:
# Split Q1 data into enrollees with relevant experience (sub1)
# and everyone else (sub2).
df_q1_sub1 = df_q1[df_q1["relevant_experience"].eq("Has relevant experience")]
df_q1_sub2 = df_q1[df_q1["relevant_experience"].ne("Has relevant experience")]

In [16]:
# Preview the "has relevant experience" subset.
df_q1_sub1.head(n = 2)

Unnamed: 0,relevant_experience,target
0,Has relevant experience,1.0
1,Has relevant experience,0.0


In [21]:
# Share of target == 1 among enrollees with relevant experience
# (explicit label lookup on the normalized value counts).
val_rel1 = df_q1_sub1["target"].value_counts(normalize = True).loc[1]
val_rel1

0.21591095105581928

In [20]:
# Share of target == 1 among enrollees without relevant experience
# (explicit label lookup on the normalized value counts).
val_rel2 = df_q1_sub2["target"].value_counts(normalize = True).loc[1]
val_rel2

0.38287331917905165

In [22]:
# Ratio of the two target rates (no experience / has experience), two decimals.
round(number = val_rel2 / val_rel1, ndigits = 2)

1.77

In [24]:
# Cross-check: mean of target in the "has relevant experience" subset.
df_q1_sub1.loc[:, "target"].mean()

0.21591095105581928

In [25]:
# Mean of target for each relevant_experience group in one step.
df_q1.groupby(by = "relevant_experience")["target"].agg("mean")

relevant_experience
Has relevant experience    0.215911
No relevant experience     0.382873
Name: target, dtype: float64

### Q2.

In [27]:
# Categorical predictors for Q2, to be one-hot encoded.
df_q2_c = base.loc[:, ["gender", "relevant_experience", "enrolled_university",
                       "education_level", "major_discipline"]]
df_q2_c.head(n = 2)

Unnamed: 0,gender,relevant_experience,enrolled_university,education_level,major_discipline
0,Male,Has relevant experience,no_enrollment,Graduate,STEM
1,Male,Has relevant experience,no_enrollment,Graduate,STEM


In [28]:
# One-hot encode every categorical predictor.
df_q2_c_dum = pd.get_dummies(data = df_q2_c)
df_q2_c_dum.head(n = 2)

Unnamed: 0,gender_Female,gender_Male,gender_Other,relevant_experience_Has relevant experience,relevant_experience_No relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,enrolled_university_no_enrollment,education_level_Graduate,education_level_Masters,education_level_Phd,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other,major_discipline_STEM
0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1
1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1


In [30]:
# Remove one dummy column per categorical variable; the removed level
# becomes that variable's reference category.
df_q2_c_dum = df_q2_c_dum.drop(
    columns = [
        "gender_Other",
        "relevant_experience_No relevant experience",
        "enrolled_university_no_enrollment",
        "education_level_Phd",
        "major_discipline_STEM",
    ]
)

In [31]:
# Preview the remaining dummy columns.
df_q2_c_dum.head(n = 2)

Unnamed: 0,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,0,1,1,0,0,1,0,0,0,0,0,0
1,0,1,1,0,0,1,0,0,0,0,0,0


In [33]:
# Target, group marker and the numeric predictors for Q2.
df_q2_n = base.loc[:, ["target", "Xgrp", "city_development_index",
                       "experience", "last_new_job", "training_hours"]]

In [34]:
# Preview the numeric part of the Q2 data.
df_q2_n.head(n = 2)

Unnamed: 0,target,Xgrp,city_development_index,experience,last_new_job,training_hours
0,1.0,train,0.92,7,1,46.0
1,0.0,train,0.92,5,1,108.0


In [36]:
# Place the numeric columns and the dummy columns side by side.
df_q2 = pd.concat(objs = [df_q2_n, df_q2_c_dum], axis = "columns")
df_q2.head(n = 2)

Unnamed: 0,target,Xgrp,city_development_index,experience,last_new_job,training_hours,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,1.0,train,0.92,7,1,46.0,0,1,1,0,0,1,0,0,0,0,0,0
1,0.0,train,0.92,5,1,108.0,0,1,1,0,0,1,0,0,0,0,0,0


In [37]:
# Independent copy of the modelling frame, used for the Q3 train/test split.
df_job2 = df_q2.copy(deep = True)

In [39]:
# Logistic regression on every row of df_q2 (no filtering on Xgrp);
# all columns except target and Xgrp are used as predictors.
model = LogisticRegression(solver = "liblinear",
                           C = 100000,
                           max_iter = 1000,
                           random_state = 123)
model.fit(X = df_q2.drop(columns = ["target", "Xgrp"]),
          y = df_q2.loc[:, "target"])

LogisticRegression(C=100000, max_iter=1000, random_state=123,
                   solver='liblinear')

In [None]:
model.coef_ # regression coefficients

오즈비(OR, Odds Ratio)는 각 회귀계수 $\beta_i$에 `np.exp()` 함수를 적용하여 산출할 수 있다.
<h1>
$$e^{\beta_i}$$

</h1>

In [45]:
# Largest odds ratio: exponentiate every coefficient, then take the maximum.
np.max(np.exp(model.coef_))

1.6721493496611732

In [46]:
# First four characters of the largest odds ratio's string form
# (truncation by slicing, not rounding).
str(np.max(np.exp(model.coef_)))[:4]

'1.67'

In [52]:
# Predictor names, in the same order as the fitted coefficients.
df_q2.columns.drop(["target", "Xgrp"])

Index(['city_development_index', 'experience', 'last_new_job',
       'training_hours', 'gender_Female', 'gender_Male',
       'relevant_experience_Has relevant experience',
       'enrolled_university_Full time course',
       'enrolled_university_Part time course', 'education_level_Graduate',
       'education_level_Masters', 'major_discipline_Arts',
       'major_discipline_Business Degree', 'major_discipline_Humanities',
       'major_discipline_No Major', 'major_discipline_Other'],
      dtype='object')

In [51]:
# Fitted coefficients as a 2-D array with one row (see the output below).
model.coef_

array([[-6.11732384e+00, -2.85015530e-02,  9.56531984e-02,
        -9.26206002e-04, -1.62975865e-01, -1.36723955e-01,
        -7.60567385e-01,  5.14109835e-01, -2.81496522e-01,
         3.23116412e-01,  1.67418791e-02,  2.61665001e-01,
         1.07607195e-01,  2.43479152e-01,  3.97900227e-01,
        -4.39387752e-01]])

In [56]:
# Pair every predictor name with its fitted coefficient.
df_coef = pd.DataFrame({"colname": df_q2.columns.drop(["target", "Xgrp"]),
                        "coef": model.coef_[0]})
df_coef

Unnamed: 0,colname,coef
0,city_development_index,-6.117324
1,experience,-0.028502
2,last_new_job,0.095653
3,training_hours,-0.000926
4,gender_Female,-0.162976
5,gender_Male,-0.136724
6,relevant_experience_Has relevant experience,-0.760567
7,enrolled_university_Full time course,0.51411
8,enrolled_university_Part time course,-0.281497
9,education_level_Graduate,0.323116


### Q3.

In [47]:
# Distinct group labels in Xgrp.
df_job2.loc[:, "Xgrp"].unique()

array(['train', 'test'], dtype=object)

In [48]:
# Split the modelling frame by its Xgrp label.
df_train = df_job2[df_job2["Xgrp"].eq("train")]
df_test  = df_job2[df_job2["Xgrp"].eq("test")]

In [49]:
# Row counts of the train and test partitions.
(df_train.shape[0], df_test.shape[0])

(4706, 2816)

In [57]:
# Preview the training partition.
df_train.head(n = 2)

Unnamed: 0,target,Xgrp,city_development_index,experience,last_new_job,training_hours,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,1.0,train,0.92,7,1,46.0,0,1,1,0,0,1,0,0,0,0,0,0
1,0.0,train,0.92,5,1,108.0,0,1,1,0,0,1,0,0,0,0,0,0


In [59]:
# 5-nearest-neighbours classifier: fit on the train rows, predict the test rows.
# NOTE(review): the predictors are passed in unscaled - confirm this is intended
# for a distance-based model.
model_knn = KNeighborsClassifier(n_neighbors = 5)
model_knn.fit(X = df_train.drop(columns = ["target", "Xgrp"]),
              y = df_train.loc[:, "target"])
pred = model_knn.predict(X = df_test.drop(columns = ["target", "Xgrp"]))

In [60]:
# First five predicted labels.
pred[0:5]

array([0., 0., 0., 0., 0.])

In [61]:
from sklearn.metrics import accuracy_score

In [62]:
# Test-set accuracy of the KNN predictions, rounded to two decimals.
round(accuracy_score(df_test["target"], pred), ndigits = 2)

0.71

In [63]:
import sklearn

In [64]:
# Show the installed scikit-learn version.
sklearn.__version__

'1.0.2'

In [65]:
import sklearn.metrics as sm

In [66]:
# List every name exposed by the sklearn.metrics module (imported above as sm).
dir(sm)

['ConfusionMatrixDisplay',
 'DetCurveDisplay',
 'DistanceMetric',
 'PrecisionRecallDisplay',
 'RocCurveDisplay',
 'SCORERS',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_base',
 '_classification',
 '_dist_metrics',
 '_pairwise_fast',
 '_plot',
 '_ranking',
 '_regression',
 '_scorer',
 'accuracy_score',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'auc',
 'average_precision_score',
 'balanced_accuracy_score',
 'brier_score_loss',
 'calinski_harabasz_score',
 'check_scoring',
 'classification_report',
 'cluster',
 'cohen_kappa_score',
 'completeness_score',
 'confusion_matrix',
 'consensus_score',
 'coverage_error',
 'd2_tweedie_score',
 'davies_bouldin_score',
 'dcg_score',
 'det_curve',
 'euclidean_distances',
 'explained_variance_score',
 'f1_score',
 'fbeta_score',
 'fowlkes_mallows_score',
 'get_scorer',
 'hamming_loss',
 'hinge_loss',
 'homogeneity_completeness_v_measure',
 '

In [68]:
# Small demo frame: three integer columns, rows labelled A, B and C.
df_s = pd.DataFrame({"col1": [1, 2, 3],
                     "col2": [2, 3, 4],
                     "col3": [3, 4, 5]},
                    index = list("ABC"))
df_s

Unnamed: 0,col1,col2,col3
A,1,2,3
B,2,3,4
C,3,4,5


In [69]:
# With no axis given, drop removes by row label: row "A" is dropped.
# The result is a new frame; df_s itself is not reassigned.
df_s.drop("A")

Unnamed: 0,col1,col2,col3
B,2,3,4
C,3,4,5


In [71]:
# axis = 1 makes drop look the label up among the columns: "col1" is dropped.
df_s.drop("col1", axis = 1)

Unnamed: 0,col2,col3
A,2,3
B,3,4
C,4,5


In [73]:
# Several columns at once: the labels are passed together as one list
# through the columns keyword.
df_s.drop(columns = ["col1", "col3"])

Unnamed: 0,col2
A,2
B,3
C,4


* 소괄호(): 튜플의 생성, 연산의 우선순위 지정, 함수/메서드/클래스의 호출(이름 바로 뒤에 위치)
* 중괄호{}: 딕셔너리 객체의 생성, 집합(set) 객체의 생성
* 대괄호[]: 리스트 객체의 생성, 특정 객체의 하위 속성 추출(필터링 등)

대괄호는 보통 객체 바로 뒤에 위치한다.  
하나의 인자에 2개 이상의 원소를 전달해야 할 때는 원소들을 단순히 쉼표(,)로 나열하지 말고 리스트 객체로 묶어서 전달해야 한다(예: `df_s.drop(columns = ["col1", "col3"])`).