In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.linear_model import SGDClassifier
f1 = {}

Import and shuffle data

In [2]:
df = pd.read_csv('../input/pulsar_stars.csv')
df = df.sample(frac=1)
df.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
11866,117.695312,47.618201,0.11136,0.187767,3.345318,20.663501,8.693813,85.277517,0
9477,155.898438,50.891865,-0.293039,-0.219576,0.67893,10.138288,18.839142,400.997616,0
7788,106.882812,45.665499,0.309543,0.627811,7.25,34.5967,5.044549,25.287163,0
8187,119.054688,56.361044,0.49635,-0.238969,2.690635,18.040866,9.734548,106.960269,0
5593,128.609375,47.140123,0.072144,0.213887,5.01505,29.717254,6.231137,38.86395,0


Training on full dataset

In [3]:
X = df.drop('target_class', axis=1)
Y = df['target_class']

In [4]:
split = int(len(X)*0.6)

In [5]:
x_train = X[:split]
y_train = Y[:split]
x_test = X[split:]
y_test = Y[split:]

In [6]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
x_train = scalar.fit_transform(x_train)
x_test = scalar.transform(x_test)

In [7]:
sgd = SGDClassifier(loss='log',random_state=10, class_weight='balanced', alpha=0.01,n_jobs=-1)
sgd.fit(x_train, y_train)
print('Train accuracy = {0}, Test accuracy = {1}'.format(sgd.score(x_train, y_train), sgd.score(x_test, y_test)))
f1['full'] = [f1_score(sgd.predict(x_train), y_train),
          f1_score(sgd.predict(x_test), y_test)]

Train accuracy = 0.9688955112683926, Test accuracy = 0.9652234636871508


Replacing 30% of values with Na

In [8]:
p=0.3
ind = np.array(np.random.choice(X.shape[0]*X.shape[1],
                                size=(int(X.shape[0]*X.shape[1]*p)), 
                                replace=False))
for i in ind:
    X.iloc[i%X.shape[0],i%X.shape[1]]=np.nan

In [9]:
Xm = X.copy()

Filling na by replacing with mean

In [10]:
Xm_filled = Xm.fillna(Xm.mean())
x_train = Xm_filled[:split]
y_train = Y[:split]
x_test = Xm_filled[split:]
y_test = Y[split:]

In [11]:
x_train = scalar.fit_transform(x_train)
x_test = scalar.transform(x_test)

In [12]:
sgd.fit(x_train, y_train)
print('Train accuracy = {0}, Test accuracy = {1}'.format(sgd.score(x_train, y_train), sgd.score(x_test, y_test)))
f1['mean'] = [f1_score(sgd.predict(x_train), y_train),
          f1_score(sgd.predict(x_test), y_test)]

Train accuracy = 0.9559508288321847, Test accuracy = 0.957122905027933


Filling na using KNN regressor. <br>
The algorithm is based on an iterative approach.
For each column containing missing values:
1. Select a column containing missing values.
2. In all other columns, fill in the missing values with an average.
2. Fill the selected column using KNN regressor, using all other columns as a training dataset and, as indications, not missing values of the target column.
3. Calculate the quality of the regression.

This process is repeated until the average value of the regression quality metric for all columns containing missing values in the new itteration is greater than the value in the previous one by the value of delta.


In [13]:
def knn_na_filler(Xin, delta=0.01, n_neighbors=5):
    Xin = Xin.copy()
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    print('Filling Na with KNN...')
    missindxs = {col:Xin[Xin.loc[:,col].isnull()].index for col in Xin.columns}
    
    prev_mean_score = 0
    i=0
    while True:
        i+=1
        mean_score = 0 #mean score for column values prediction on test
        for col in Xin.columns:
            missindx = missindxs[col] #get indeces of missing values
            
            xtarget = Xin.loc[Xin.index.isin(missindx),:].drop(axis=1, columns=col) # features for missing values
            xtarget = xtarget.fillna(np.mean(xtarget)) # Na values from other columns filling with mean
            xtr = Xin.loc[~Xin.index.isin(missindx),:].drop(axis=1, columns=col) # features for training data
            xtr = xtr.fillna(np.mean(xtr))
            ytr = Xin.loc[~Xin.index.isin(missindx),col]
            
            knnna = KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance')
            scalar = StandardScaler()
            pipe = Pipeline([('scalar', scalar), ('model', knnna)])
            pipe.fit(xtr[:int(len(xtr)*0.7)], ytr[:int(len(xtr)*0.7)])
            train_score = pipe.score(xtr[:int(len(xtr)*0.7)], ytr[:int(len(xtr)*0.7)])
            test_score = pipe.score(xtr[int(len(xtr)*0.7):], ytr[int(len(xtr)*0.7):])
            mean_score+=test_score
            
            Xin.loc[Xin.index.isin(missindx), col] = pipe.predict(xtarget) # filling Nan in target column with predictions
        
        
        mean_score /= len(Xin.columns)
        print('Mean score at itteration {0} eq {1}'.format(i, mean_score))
        if (mean_score-prev_mean_score)<delta:
            break
        prev_mean_score = mean_score
        
    return Xin

In [14]:
Xm_filled = Xm.fillna(knn_na_filler(Xm, 0.001, 5))
x_train = Xm_filled[:split]
y_train = Y[:split]
x_test = Xm_filled[split:]
y_test = Y[split:]
x_train = scalar.fit_transform(x_train)
x_test = scalar.transform(x_test)

Filling Na with KNN...
Mean score at itteration 1 eq 0.8708636592103532
Mean score at itteration 2 eq 0.9193770991201302
Mean score at itteration 3 eq 0.9226233824018291
Mean score at itteration 4 eq 0.9235247198523738


In [15]:
sgd.fit(x_train, y_train)
print('Train accuracy = {0}, Test accuracy = {1}'.format(sgd.score(x_train, y_train), sgd.score(x_test, y_test)))
f1['knn'] = [f1_score(sgd.predict(x_train), y_train),
             f1_score(sgd.predict(x_test), y_test)]

Train accuracy = 0.9639597690445149, Test accuracy = 0.9621508379888268


In [16]:
print('F1 score in full dataset: Train={}, Test={}'.format(f1['full'][0],f1['full'][1]))
print('F1 score with mean replacing: Train={}, Test={}'.format(f1['mean'][0],f1['mean'][1]))
print('F1 score with KNN replacing: Train={}, Test={}'.format(f1['knn'][0],f1['knn'][1]))

F1 score in full dataset: Train=0.8418560606060606, Test=0.8222698072805139
F1 score with mean replacing: Train=0.7860696517412935, Test=0.789293067947838
F1 score with KNN replacing: Train=0.8185654008438819, Test=0.8076650106458482
