In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,KNNImputer,IterativeImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV

In [2]:
#Reading csv file 1
labels=pd.read_csv('secom_labels.data',header=None,sep=' ',names=['status','timestamp'])

In [3]:
#Generate an empty list to put column names in
column_list=[]

In [4]:
#Naming columns 0-590
for x in range(591):
    column_list.append(f'col_{x}')

In [5]:
#Remove the first column name as it starts with 0
column_list.remove('col_0')

In [6]:
#Reading csv file 2
data=pd.read_csv('secom.data',header=None,sep=' ',names=column_list)

In [7]:
#Data file has 590 columns and 1567 rows
print(data.shape)
data.head()

(1567, 590)


Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,...,col_581,col_582,col_583,col_584,col_585,col_586,col_587,col_588,col_589,col_590
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [8]:
#Replace -1 with 0 in status column
labels['status']=labels['status'].replace(-1,0)

In [9]:
data=data.merge(labels,right_index=True,left_index=True)

### Manual combinations

In [11]:
#Test sample for validation
train_data, test_data = train_test_split(data,test_size=0.2,stratify=data['status'])
#Generate a list of columns excluding status and timestamp
ls_cont=[x for x in data.columns if (x!='status')&(x!='timestamp')]
#Separating target column
train_x=train_data[ls_cont].copy()
train_y=train_data['status'].copy()
test_x=test_data[ls_cont].copy()
test_y=test_data['status'].copy()

In [12]:
train_x

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,...,col_581,col_582,col_583,col_584,col_585,col_586,col_587,col_588,col_589,col_590
791,2958.83,2488.50,2197.5222,1373.0077,1.1369,100.0,106.0733,0.1240,1.3886,0.0107,...,,,0.4975,0.0142,0.0031,2.8592,0.0246,0.0064,0.0022,25.9900
1296,2951.06,2503.18,2228.4778,1721.1108,1.4301,100.0,93.6222,0.1221,1.3841,-0.0264,...,,,0.4997,0.0131,0.0034,2.6132,0.0308,0.0183,0.0063,59.3775
1549,3183.63,2498.00,2195.4444,2914.1792,1.5978,100.0,85.1011,0.1235,1.4129,-0.0081,...,,,0.5037,0.0117,0.0030,2.3203,0.0253,0.0224,0.0071,88.5812
40,2962.14,2545.71,2221.5778,1503.6230,1.1878,100.0,111.3444,0.1211,1.5424,-0.0177,...,0.0032,45.4264,0.5068,0.0201,0.0046,3.9635,0.0189,0.0086,0.0032,45.4264
1505,3026.67,2529.82,2192.7889,1268.5852,1.9935,100.0,104.5867,0.1268,1.4241,0.0008,...,0.0041,122.5713,0.4995,0.0178,0.0039,3.5600,0.0104,0.0127,0.0041,122.5713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,2855.80,2537.35,2183.4333,1582.5646,1.3601,100.0,99.0267,0.1240,1.4912,-0.0004,...,0.0050,69.4220,0.5011,0.0122,0.0032,2.4250,0.0218,0.0152,0.0050,69.4220
1086,2888.58,2481.37,2200.2000,1121.1875,1.3171,100.0,103.8978,0.1191,1.4342,0.0168,...,,,0.4980,0.0209,0.0044,4.1921,0.0389,0.0172,0.0052,44.2355
1259,3190.78,2424.11,2191.2111,1437.5003,2.2073,100.0,97.6444,0.1235,1.3873,-0.0135,...,0.0108,237.4625,0.4982,0.0118,0.0035,2.3664,0.0137,0.0326,0.0108,237.4625
172,2954.44,2576.94,2221.9444,1551.6947,1.5296,100.0,99.2678,0.1222,1.4767,-0.0073,...,0.0051,62.9443,0.5043,0.0112,0.0032,2.2230,0.0254,0.0160,0.0051,62.9443


In [13]:
%%time
#Null values handling
#We need to impute null values before feature selection because multivariate methods calculate them using multiple columns.
#Reducing the columns before doing this might impute values that are inaccurate
#Median
#si=SimpleImputer(missing_values=np.nan, strategy='median')
#train_x=pd.DataFrame(si.fit_transform(train_x))
#test_x=pd.DataFrame(si.fit_transform(test_x))

#KNN
kn=KNNImputer(n_neighbors=4)
train_x=pd.DataFrame(kn.fit_transform(train_x),columns=ls_cont)
test_x=pd.DataFrame(kn.fit_transform(test_x),columns=ls_cont)

#MICE
#mc=IterativeImputer(max_iter=5, random_state=500)
#train_x=pd.DataFrame(mc.fit_transform(train_x))
#test_x=pd.DataFrame(mc.fit_transform(test_x))

CPU times: user 1.78 s, sys: 1.48 s, total: 3.26 s
Wall time: 870 ms


In [14]:
#Feature Selection
ls_iv=['col_60','col_65','col_130','col_29','col_511','col_104','col_39','col_1','col_131','col_26','col_125','col_211','col_73','col_22','col_112','col_248','col_126','col_128','col_92','col_346','col_432','col_91','col_81','col_80','col_295','col_418','col_23','col_133','col_160','col_79','col_469','col_512','col_181','col_279','col_15','col_77','col_498','col_113','col_572','col_41','col_96','col_138','col_317','col_411','col_446','col_31','col_543','col_562','col_32','col_134','col_274','col_82','col_10','col_62','col_479','col_210','col_288','col_153','col_141','col_276','col_68','col_341']
ls_bor=[]
ls_lasso=[]
ls_ridge=[]
ls_rfecv=['col_22', 'col_206', 'col_489', 'col_495', 'col_15', 'col_351', 'col_525', 'col_240', 'col_105', 'col_441', 'col_65', 'col_60', 'col_165', 'col_76', 'col_334', 'col_133', 'col_339', 'col_288', 'col_302', 'col_574', 'col_217', 'col_337', 'col_350', 'col_130', 'col_153', 'col_212', 'col_26', 'col_355', 'col_62']
tgt=['status']

#data[ls_rfecv].corr().to_clipboard()


In [15]:
#PCA
#2 components for visualization

#Xpca=pd.DataFrame(si.fit_transform(data[ls_cont]))
#pca = PCA(n_components=2)
#pca.fit(Xpca)

#xp = pd.DataFrame(pca.transform(Xpca), columns = ['p1', 'p2'])
#pca.explained_variance_ratio_.cumsum()

In [16]:
#sns.lmplot(x = 'p1', y = 'p2', data = xp, fit_reg=False)

In [17]:
#Merge Xt and yt for balancing data
temp_df=train_x[ls_iv].merge(train_y,left_index=True,right_index=True)
#Status column value counts, proportion of 0 and 1 must be balanced
temp_df['status'].value_counts()

status
0    923
1     69
Name: count, dtype: int64

In [18]:
#Balancing the data. We separate the rows with status 1 from rows with status 0
pos=temp_df.loc[temp_df['status']==1].copy()
neg=temp_df.loc[temp_df['status']==0].copy()
#First non null row to avoid warning
aux=pd.DataFrame([pos.iloc[0]].copy())

In [19]:
a=pos.sample(10)
b=pos.sample(10)

In [20]:
#Oversampling, we repeat random samples of 10 rows and attach them together in one big table then add these rows to the neg table
for i in range (117):
    aux=pd.concat([aux,pos.sample(10)])
bal_df=pd.concat([aux,neg])

#Undersampling
#new_df=pd.concat([pos,neg.sample(104)])

#SMOTE

#ADASYN

In [21]:
#Value counts after balancing, now both 1 and 0 are almost the same (also change datatype to int because it turns into float in the concat)
bal_df['status'].value_counts()

status
1.0    1171
0.0     923
Name: count, dtype: int64

In [22]:
#Generate train target variable, necessary because the number of rows changed after balancing
train_y=bal_df['status'].astype(int)

In [23]:
#Outlier handling
#Missing + outlier handling
ls_out=[2,3]
for col in ls_iv:
    mn=bal_df[col].mean()
    sd=bal_df[col].std()
    mdn=bal_df[col].median()
    #replace outliers with median
    l=mn-(sd*3)
    r=mn+(sd*3)
    bal_df[col].loc[bal_df[col]<l]=mdn
    bal_df[col].loc[bal_df[col]>r]=mdn

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  bal_df[col].loc[bal_df[col]<l]=mdn
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Seri

In [24]:
#Scaling data
#Standard
#sc = StandardScaler()
#Xs = pd.DataFrame(sc.fit_transform(new_df[ls_iv]),columns=ls_iv)

#Minmax
mm=MinMaxScaler()
bal_df=pd.DataFrame(mm.fit_transform(bal_df[ls_iv]),columns=ls_iv)
test_x=pd.DataFrame(mm.fit_transform(test_x[ls_iv]),columns=ls_iv)


In [25]:
#Models
#Decision tree
tr = tree.DecisionTreeClassifier()
tr = tr.fit(bal_df, train_y)

#LogisticRegression
#lr = LogisticRegression()
#lr = lr.fit(Xs, new_df['status'])

#RandomForestClassifier

In [64]:
test_x

Unnamed: 0,col_60,col_65,col_130,col_29,col_511,col_104,col_39,col_1,col_131,col_26,...,col_10,col_62,col_479,col_210,col_288,col_153,col_141,col_276,col_68,col_341
0,0.311841,0.199062,0.833414,0.476467,0.340249,0.846154,0.299260,0.426402,0.871932,0.923866,...,0.517647,0.378105,0.0,0.0,0.110617,0.122983,0.145072,0.102303,0.000083,0.000014
1,0.155078,0.499482,0.461793,0.350095,0.063380,0.230769,0.359844,0.525983,0.374173,0.676864,...,0.362353,0.413790,0.0,0.0,0.111537,0.114525,0.022895,0.015789,0.000069,0.000003
2,0.247460,0.322442,0.469376,0.843971,0.096956,0.401099,0.485078,0.760253,0.407471,0.932991,...,0.556471,0.490272,0.0,0.0,0.038302,0.046708,0.015661,0.008816,0.000084,0.000003
3,0.191485,0.429711,0.605900,0.535782,0.102222,0.395604,0.272262,0.350439,0.608111,0.760560,...,0.243529,0.297857,0.0,0.0,0.052551,0.047301,0.028021,0.016711,0.000075,0.000009
4,0.207297,0.399409,0.605900,0.515798,0.169811,0.571429,0.220542,0.373701,0.866382,0.666261,...,0.452941,0.425491,0.0,0.0,0.177034,0.197958,0.055255,0.046250,0.000089,0.000003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,0.216178,0.382390,0.583149,0.408123,0.127434,0.620879,0.195820,0.415274,0.658485,0.960629,...,0.390588,0.477271,0.0,0.0,0.049334,0.062004,0.015001,0.007434,0.000082,0.000005
310,0.158833,0.492285,0.446625,0.789814,0.090090,0.478022,0.341872,0.610479,0.501814,0.939771,...,0.567059,0.391644,0.0,0.0,0.112456,0.129685,0.020316,0.012368,0.000094,0.000005
311,0.252507,0.312770,0.530046,0.047066,0.398809,0.780220,0.168578,0.443502,0.723586,0.603250,...,0.162353,0.365014,0.0,0.0,0.047418,0.059086,0.058148,0.039868,0.000104,0.000015
312,0.187835,0.436706,0.583149,0.423598,0.150867,0.335165,0.452997,0.388327,0.833938,0.930645,...,0.363529,0.306913,0.0,0.0,0.101195,0.118720,0.017234,0.008618,0.000087,0.000012


In [72]:
data['status'].value_counts(True)

status
0    0.933631
1    0.066369
Name: proportion, dtype: float64

In [78]:
tr.predict(test_x[ls_iv])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [26]:
tr.predict(test_x[ls_iv]).sum()

4

In [27]:
tr.score(X=test_x[ls_iv], y=test_y)

0.9203821656050956

In [28]:
roc_auc_score(y_true=test_y, y_score=tr.predict(X=test_x))

0.4931740614334471

### Testing all combinations