In [1]:
# Import libraries and Packages 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import Series
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

%matplotlib inline

###########################################################

# Load the SECOM data set.
# Both files are whitespace-separated and carry no header row.
# `sep=r"\s+"` is the documented equivalent of the deprecated
# `delim_whitespace=True` option.
label = pd.read_csv("secom_labels.txt", sep=r"\s+", header=None)
features = pd.read_csv("secom_data.txt", sep=r"\s+", header=None)

# Name the feature columns F0, F1, ... by position. The names are derived from
# the columns actually read instead of a hard-coded count, so a file with a
# different width neither raises IndexError nor leaves columns unnamed.
features = features.rename(
    columns={col: 'F' + str(pos) for pos, col in enumerate(features.columns)}
)
# Name the two label-file columns.
label = label.rename(columns={0: 'L0', 1: 'date'})

############################################################

# Concatenate the feature and label frames column-wise into a single frame.
df = pd.concat([features, label], axis=1, ignore_index=False)

# Drop every column with more than ~10% missing values: a column is kept only
# if it has at least len(df) - int(0.1 * len(df)) non-null entries.
df = df.dropna(thresh=len(df) - int(0.1 * len(df)), axis=1)

# Impute the remaining missing values with the per-column median.
# numeric_only=True is required because the frame still contains the
# non-numeric 'date' column; without it, recent pandas versions raise TypeError.
df = df.fillna(df.median(numeric_only=True))

# Convert the label column from {-1, 1} to binary {0, 1}.
# Assign the result back instead of calling replace(..., inplace=True) on the
# attribute-accessed Series: that chained in-place form does not update `df`
# once pandas Copy-on-Write is active.
df['L0'] = df['L0'].replace(-1, 0)

# Build the label vector (y) and the feature matrix (X).
y = df['L0']                          # binary label only
X = df.drop(['L0', 'date'], axis=1)   # feature columns only

# Use Lasso (L1) regularisation to reduce the dimension of the feature matrix:
# features whose fitted coefficient is exactly zero are discarded.
# `alpha` is the regularisation strength. The former `normalize=False` argument
# is omitted because it was removed from scikit-learn's Lasso (passing it raises
# TypeError on current releases); False was its default, so behaviour is the same.
lasso = Lasso(alpha=0.2)
lasso_coef = lasso.fit(X, y).coef_

# Boolean mask of the coefficients that survived the shrinkage.
selected_mask = lasso_coef != 0.0

print('Total number of remaining features:')
print(len(lasso_coef[selected_mask]))

# Collect the selected features: coefficient values, positional indices, names.
val = lasso_coef[selected_mask]
key, = np.where(selected_mask)        # positional indices of the kept columns
feature_list = X.columns[key]
val_plt = np.multiply(val, 1000)      # coefficients scaled by 1000
feature_list = feature_list.tolist()  # Index -> plain list
feature_column = key.tolist()
val = val.tolist()
print('List of selected features via Lasso dimenssion reduction:')
print(feature_list)

# Restrict the feature matrix to the Lasso-selected columns.
X = X[feature_list]

ImportError: cannot import name 'artist' from 'matplotlib' (c:\Users\82104\AppData\Local\Programs\Python\Python310\lib\site-packages\matplotlib\__init__.py)

In [None]:
# Statistical Analysis & Hypothesis Testing
# Split the rows on one feature at a threshold, compare the share of rows with
# L0 == 1 between the two groups, and bootstrap the difference of those shares.
num_replica = 3000
bs_replica = np.empty(num_replica)
ht_feature = 'F484'
threshold = 680
bs_seed = 42
rng = np.random.default_rng(bs_seed)  # seeded so the resampling is reproducible

# NOTE(review): `df1` is not defined in the cells visible here; it must be
# created before this cell runs, otherwise a fresh-kernel run fails with NameError.
# NOTE(review): `lower_range` holds the rows ABOVE the threshold and
# `higher_range` the rows at or below it; confirm the naming is intended.
lower_range = df1[df1[ht_feature] > threshold]
higher_range = df1[df1[ht_feature] <= threshold]

# Fail early with a clear message instead of a ZeroDivisionError further down.
if lower_range.empty or higher_range.empty:
    raise ValueError(
        "Threshold %r on %r leaves one of the two groups empty" % (threshold, ht_feature)
    )

# Share of rows with L0 == 1 in each group, and the observed difference.
lower_range_ratio = (lower_range['L0'] == 1).mean()
higher_range_ratio = (higher_range['L0'] == 1).mean()
ratio_diff = higher_range_ratio - lower_range_ratio

print('Higher fail ratio:', "%.3f" % higher_range_ratio)
print('Lower fail ration:', "%.3f" % lower_range_ratio)
print('Ratio difference:', "%.3f" % ratio_diff)

# Bootstrapping: resample each group with replacement at its original size and
# recompute the difference of shares for every replicate.
for i in range(num_replica):
    lr_bs = lower_range.sample(frac=1, replace=True, random_state=rng)
    hr_bs = higher_range.sample(frac=1, replace=True, random_state=rng)

    lr_bs_r = (lr_bs['L0'] == 1).mean()
    hr_bs_r = (hr_bs['L0'] == 1).mean()
    bs_replica[i] = hr_bs_r - lr_bs_r

# Histogram of the bootstrap replicates with the observed difference marked.
plt.hist(bs_replica, bins=20)
plt.axvline(ratio_diff, color='r', linestyle='dashed', linewidth=3)
plt.xlabel('Yield Ratio Difference', fontweight="bold", size=12)
plt.savefig('Hypothesis.png');
plt.show()

# Summary statistics of the bootstrap distribution.
print('Mean ratio decrease:', "%.3f" % np.mean(bs_replica))
print('95% Confidence interval:', (np.percentile(bs_replica, [2.5, 97.5])))

# One-sided p-value for H0 "no difference between the groups".
# The replicates above are centred on the observed difference, so comparing
# them directly with `ratio_diff` yields ~0.5 regardless of the data. They are
# therefore re-centred on zero to approximate the null distribution, and the
# p-value is the fraction of null replicates at least as large as the observed
# difference.
null_replica = bs_replica - np.mean(bs_replica)
print('P-value:', np.sum(null_replica >= ratio_diff) / num_replica)
