In [3]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Read the enrollee data set from the working directory and preview two rows.
csv_path = "edu_enrollees.csv"
df = pd.read_csv(csv_path)
df.head(n = 2)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,Xgrp
0,8949.0,city_103,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,,,1,36.0,1.0,train
1,29725.0,city_40,0.776,Male,No relevant experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47.0,0.0,train


### 전처리

In [5]:
# Remove the three columns that play no part in the rest of the analysis.
unused_cols = ["city", "company_size", "company_type"]
df = df.drop(unused_cols, axis = 1)
df.head(n = 2)

Unnamed: 0,enrollee_id,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,last_new_job,training_hours,target,Xgrp
0,8949.0,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,1,36.0,1.0,train
1,29725.0,0.776,Male,No relevant experience,no_enrollment,Graduate,STEM,15,>4,47.0,0.0,train


In [16]:
# Missing-value count for every non-numeric column.
df_cat = df.select_dtypes(exclude = ["number"])
df_cat.isnull().sum()

In [17]:
# Missing-value count for every numeric column.
df_num = df.select_dtypes(include = ["number"])
df_num.isnull().sum()

In [10]:
# Keep only fully populated rows (a row is dropped if any column is missing).
df = df.dropna(axis = 0, how = "any")

In [12]:
# Inspect the distinct labels of "experience" to spot non-numeric categories.
df["experience"].unique()

array(['>20', '15', '13', '7', '5', '16', '4', '11', '<1', '18', '19',
       '12', '10', '9', '2', '6', '14', '3', '8', '20', '17', '1'],
      dtype=object)

In [13]:
# Inspect the distinct labels of "last_new_job" to spot non-numeric categories.
df["last_new_job"].unique()

array(['1', '>4', '4', '3', '2', 'never'], dtype=object)

In [14]:
# Discard rows whose "experience" or "last_new_job" carries an open-ended /
# non-numeric label, then renumber the remaining rows from 0.
experience_excluded = [">20", "<1"]
last_job_excluded = [">4", "never"]
keep_rows = (~df["experience"].isin(experience_excluded)
             & ~df["last_new_job"].isin(last_job_excluded))
df = df.loc[keep_rows].reset_index(drop = True)

In [15]:
# Number of rows left after preprocessing.
df.shape[0]

7522

In [18]:
# Independent snapshot of the preprocessed frame; the question cells start from it.
df_base = df.copy(deep = True)

### Q1.

In [19]:
# Q1 only needs the experience flag and the outcome.
q1_cols = ["relevant_experience", "target"]
df_q1 = df_base.loc[:, q1_cols].copy()
df_q1.head(n = 2)

Unnamed: 0,relevant_experience,target
0,Has relevant experience,1.0
1,Has relevant experience,0.0


In [20]:
# List the distinct labels of "relevant_experience" used to split the groups below.
df_q1["relevant_experience"].unique()

array(['Has relevant experience', 'No relevant experience'], dtype=object)

In [21]:
# Split Q1 rows into the two experience groups.
has_experience = df_q1["relevant_experience"] == "Has relevant experience"
no_experience = df_q1["relevant_experience"] == "No relevant experience"
df_q1_has = df_q1.loc[has_experience]
df_q1_no  = df_q1.loc[no_experience]

In [25]:
# Share of target == 1 among enrollees with relevant experience.
share_by_target = df_q1_has["target"].value_counts(normalize = True)
share_by_target.loc[1.0]

0.21591095105581928

In [26]:
# Share of target == 1 among enrollees without relevant experience.
df_q1_no["target"].value_counts(normalize = True).loc[1.0]

0.38287331917905165

In [28]:
# Ratio of the target == 1 share: no relevant experience (A) over
# relevant experience (B), rounded to two decimals.
val_A = df_q1_no["target"].value_counts(normalize = True).loc[1.0]
val_B = df_q1_has["target"].value_counts(normalize = True).loc[1.0]
ratio_no_vs_has = val_A / val_B
round(ratio_no_vs_has, 2)

1.77

In [29]:
# Cross-check of the two shares above: because target is coded 0/1, the group
# mean equals the share of target == 1. Uses the Q1 frame (derived from the
# df_base snapshot) instead of the mutable global `df`, so the check stays
# consistent with the other Q1 cells even if `df` is modified later.
df_q1.groupby("relevant_experience")["target"].mean()

relevant_experience
Has relevant experience    0.215911
No relevant experience     0.382873
Name: target, dtype: float64

### Q2.

In [31]:
# Take the contiguous run of categorical columns by label slice
# (both end labels are inclusive with .loc).
first_cat_col, last_cat_col = "gender", "major_discipline"
df_q2_cat = df_base.loc[:, first_cat_col:last_cat_col]
df_q2_cat.head(n = 1)

Unnamed: 0,gender,relevant_experience,enrolled_university,education_level,major_discipline
0,Male,Has relevant experience,no_enrollment,Graduate,STEM


In [32]:
# One-hot encode every categorical column (one indicator column per level).
df_q2_dum = pd.get_dummies(data = df_q2_cat)
df_q2_dum.head(n = 1)

Unnamed: 0,gender_Female,gender_Male,gender_Other,relevant_experience_Has relevant experience,relevant_experience_No relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,enrolled_university_no_enrollment,education_level_Graduate,education_level_Masters,education_level_Phd,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other,major_discipline_STEM
0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1


In [34]:
# Drop one indicator per categorical variable so that level acts as the
# reference category. Dummy names follow get_dummies' "<column>_<level>" pattern.
reference_levels = {
    "gender": "Other",
    "relevant_experience": "No relevant experience",
    "enrolled_university": "no_enrollment",
    "education_level": "Phd",
    "major_discipline": "STEM",
}
target_drop = [f"{col}_{level}" for col, level in reference_levels.items()]
df_q2_dum = df_q2_dum.drop(target_drop, axis = 1)
df_q2_dum.head(n = 1)

Unnamed: 0,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,0,1,1,0,0,1,0,0,0,0,0,0


In [36]:
# Assemble the modelling frame: outcome, split flag, numeric predictors and
# the dummy-encoded categorical predictors.
target_cols = ["target", "Xgrp", "city_development_index",
               "experience", "last_new_job", "training_hours"]

# "experience" and "last_new_job" are still string (object) columns. After the
# open-ended labels were filtered out only digit strings remain, so cast them
# to integers explicitly instead of relying on the estimators to coerce
# object data to float implicitly.
df_q2_base = df_base[target_cols].astype({"experience": int,
                                          "last_new_job": int})

df_q2 = pd.concat([df_q2_base, df_q2_dum], axis = 1)
df_q2 = df_q2.reset_index(drop = True)

In [37]:
# Preview the first row of the assembled modelling frame.
df_q2.head(1)

Unnamed: 0,target,Xgrp,city_development_index,experience,last_new_job,training_hours,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,1.0,train,0.92,7,1,46.0,0,1,1,0,0,1,0,0,0,0,0,0


In [38]:
# Independent working copy of the modelling frame used by the model cells.
df_job2 = df_q2.copy(deep = True)

In [42]:
# Fit a logistic regression on all rows (train and test together).
# C is the inverse regularisation strength, so C = 100000 leaves the
# coefficients almost unpenalised.
lr_features = df_job2.drop(columns = ["target", "Xgrp"])
lr_outcome = df_job2["target"]
model_lr = LogisticRegression(
    C = 100000,
    max_iter = 1000,
    solver = "liblinear",
    random_state = 123,
)
model_lr.fit(lr_features, lr_outcome)

In [44]:
# Raw fitted coefficients, in the column order of the features passed to fit().
model_lr.coef_

In [51]:
# Label each coefficient with its feature name. The names come from the same
# column selection that was passed to fit(), so they stay correct even if
# "target" / "Xgrp" are not the first two columns of df_job2.
feature_names = df_job2.drop(columns = ["target", "Xgrp"]).columns
pd.Series(model_lr.coef_[0],
          index = feature_names).reset_index()

In [52]:
# One-row table of coefficients. The column labels come from the same column
# selection that was passed to fit(), not from positional slicing, so they
# cannot drift out of sync with the coefficient order.
feature_names = df_job2.drop(columns = ["target", "Xgrp"]).columns
pd.DataFrame(model_lr.coef_, columns = feature_names)

Unnamed: 0,city_development_index,experience,last_new_job,training_hours,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,-6.117324,-0.028502,0.095653,-0.000926,-0.162976,-0.136724,-0.760567,0.51411,-0.281497,0.323116,0.016742,0.261665,0.107607,0.243479,0.3979,-0.439388


In [58]:
# Largest odds ratio across the predictors: exponentiate the coefficients,
# round to two decimals, then take the maximum.
# The intercept (model_lr.intercept_) is deliberately left out.
odds_ratios = np.exp(model_lr.coef_)
odds_ratios.round(2).max()

1.67

### Q3.

In [59]:
# List the split labels stored in "Xgrp" before dividing the data on them.
df_job2["Xgrp"].unique()

array(['train', 'test'], dtype=object)

In [60]:
# Divide the rows using the predefined "Xgrp" flag; the flag itself is not a
# feature, so it is dropped from both parts.
in_train = df_job2["Xgrp"] == "train"
in_test = df_job2["Xgrp"] == "test"
df_train = df_job2.loc[in_train].drop(columns = "Xgrp")
df_test  = df_job2.loc[in_test].drop(columns = "Xgrp")

In [61]:
# Row counts of the training and test partitions.
df_train.shape[0], df_test.shape[0]

(4706, 2816)

In [63]:
# Fit a 5-nearest-neighbour classifier on the training rows and predict the
# outcome for every test row.
knn_train_X = df_train.drop(columns = "target")
knn_train_y = df_train["target"]
knn_test_X = df_test.drop(columns = "target")

model_knn = KNeighborsClassifier(n_neighbors = 5)
model_knn.fit(knn_train_X, knn_train_y)
pred = model_knn.predict(knn_test_X)

In [64]:
# Peek at the first four predicted labels.
pred[:4]

array([0., 0., 0., 0.])

In [65]:
from sklearn.metrics import accuracy_score

In [66]:
# Test-set accuracy of the KNN predictions, rounded to two decimals.
test_accuracy = accuracy_score(df_test["target"], pred)
round(test_accuracy, 2)

0.71

In [68]:
# Confusion matrix: actual outcome in rows, predicted outcome in columns.
df_cross = pd.crosstab(index = df_test["target"], columns = pred)
df_cross

col_0,0.0,1.0
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1899,193
1.0,616,108


In [73]:
# Recompute accuracy from the confusion matrix: the diagonal holds the
# correctly classified counts, so accuracy = trace / total.
arr_cross = df_cross.values
n_correct = np.trace(arr_cross)
round(n_correct / arr_cross.sum(), 2)

0.71