In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Preprocessing steps as described in exercise 1:
# index the rows by date and discard the "id" and "partlybad" columns.
train_df = (
    pd.read_csv("data/npf_train.csv")
    .set_index("date")
    .drop(["id", "partlybad"], axis=1)
)
train_df["class4"] = train_df["class4"].astype("category")

# Binary label: every class4 value other than "nonevent" collapses to "event".
is_nonevent = train_df["class4"] == "nonevent"
class2 = np.where(is_nonevent, "nonevent", "event").astype("object")
train_df["class2"] = pd.Series(class2, index=train_df.index).astype("category")

train_df.head()

Unnamed: 0_level_0,class4,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,CO2504.mean,CO2504.std,Glob.mean,...,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std,class2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,nonevent,384.462,2.284996,384.164462,2.135062,385.274688,2.211695,383.885077,1.955198,19.245511,...,0.525698,-12.422972,0.376324,1.635563,0.856948,0.026438,0.014617,0.003374,0.000733,nonevent
2000-01-20,nonevent,374.884615,0.415185,374.703333,0.385179,375.621266,0.66572,374.674177,0.43548,31.107659,...,0.373927,-8.351043,0.575679,1.441109,0.741088,0.022649,0.012479,0.001501,0.000572,nonevent
2000-01-23,nonevent,373.496585,0.189497,373.382593,0.172958,373.961481,0.235107,373.275062,0.1655,29.800885,...,0.965988,-9.651155,1.238891,2.677545,1.261612,0.044759,0.023748,0.000764,4.8e-05,nonevent
2000-02-17,nonevent,378.600367,1.93418,378.464862,1.946536,379.785872,2.865022,378.316909,1.98343,23.795211,...,0.122651,-0.829524,0.134191,2.261805,1.345651,0.030893,0.021903,0.002038,0.000751,nonevent
2000-03-25,Ib,373.128684,1.096617,372.98,1.04775,373.70183,1.259198,372.91,1.004164,252.480327,...,1.695622,-1.095864,2.090111,12.906779,7.0223,0.333523,0.239981,0.000662,0.00021,event


In [3]:
# Load the hidden test set and discard the two columns that are not used as features.
test_df = pd.read_csv('data/npf_test_hidden.csv').drop(['date', 'partlybad'], axis=1)

In [4]:
# (total number of columns, number of columns whose dtype is float)
is_float_column = train_df.dtypes == float
len(is_float_column), sum(is_float_column)

(102, 100)

All 100 feature columns are float; the two remaining columns are the categorical labels `class4` and `class2`.

In [5]:
# Candidate feature names: every column except the two label columns.
features = list(train_df.columns)
for _label_column in ('class4', 'class2'):
    features.remove(_label_column)

Feature selection

In [6]:
# Greedy correlation filter: walk the features in column order and keep a
# feature only if its Pearson correlation with every earlier feature
# (kept or not) is at most `threshold`.
threshold = 0.97

# Compute the full pairwise correlation matrix once instead of calling
# Series.corr separately for every pair of columns.
corr_matrix = train_df[features].corr()

seen = []
selected = []
for feature in features:
    # Only the signed correlation is tested, so strongly negatively correlated
    # pairs are both kept. NaN correlations compare False and never reject.
    too_similar = (corr_matrix.loc[feature, seen] > threshold).any()
    if not too_similar:
        selected.append(feature)
    seen.append(feature)

# Keep the surviving features plus the two label columns.
train_df = train_df[selected + ['class2','class4']]
len(selected)

44

In [7]:
# Restrict the test set to the features that survived selection on the training set.
test_df = test_df.loc[:, selected]
test_df.head()

Unnamed: 0,CO2168.mean,CO2168.std,CO242.std,Glob.mean,Glob.std,H2O168.mean,H2O168.std,NET.mean,NO168.mean,NO168.std,...,RPAR.std,SO2168.mean,SO2168.std,SWS.mean,SWS.std,T168.mean,T168.std,UV_B.mean,CS.mean,CS.std
0,376.610169,0.526617,0.56436,37.115592,24.180794,4.357458,0.097849,-3.470806,0.040423,0.036152,...,2.37263,0.18493,0.090473,926.607143,0.685257,-2.391043,0.168389,0.040709,0.000644,0.000119
1,390.624932,0.453585,0.505836,69.134531,59.895057,1.19863,0.038453,5.110229,0.573151,0.337284,...,20.140791,1.934384,0.176139,919.862069,1.940621,-18.146499,0.458852,0.103446,0.003707,0.000115
2,375.280258,1.249087,1.423506,276.485371,201.722672,4.560452,0.538227,205.803609,0.024645,0.054894,...,17.792897,0.322838,0.276723,930.580645,0.66649,4.351654,1.741449,0.557332,0.001637,0.000428
3,382.642176,3.222805,3.466259,308.904304,287.444652,5.962471,0.694415,144.066851,0.020588,0.03687,...,16.646713,0.058,0.084115,921.036275,1.012996,7.566353,3.825442,0.646806,0.00248,0.00051
4,381.492971,4.386929,4.491875,111.456879,104.807009,12.808,2.265853,100.454354,0.054114,0.063952,...,8.552627,0.638448,0.592653,866.171429,159.262701,12.422717,2.385826,0.275559,0.009429,0.004438


New features

In [8]:
# For every variable whose ".mean" and ".std" columns were both selected, add
# two derived columns to the train and test frames:
#   <name>.quantile1 = mean - 2 * std
#   <name>.quantile2 = mean + 2 * std
# Work on copies, taken once up front rather than once per derived feature.
train_df = train_df.copy()
test_df = test_df.copy()

for feature in selected:
    # Match the exact ".mean" suffix: a name that merely ends in "mean" must
    # not be paired with itself, which would overwrite the original column.
    if not feature.endswith('.mean'):
        continue
    base_name = feature[:-len('.mean')]
    std_feature = base_name + '.std'
    if std_feature not in selected:
        continue

    lower_name = base_name + '.quantile1'
    upper_name = base_name + '.quantile2'
    train_df[lower_name] = train_df[feature] - 2 * train_df[std_feature]
    train_df[upper_name] = train_df[feature] + 2 * train_df[std_feature]
    test_df[lower_name] = test_df[feature] - 2 * test_df[std_feature]
    test_df[upper_name] = test_df[feature] + 2 * test_df[std_feature]

In [9]:
# Standardise every feature column and rebuild the train/test frames.
features = test_df.columns
labels = train_df[['class2','class4']]
n_train = len(train_df)

# NOTE: the scaler is fitted on the train and test rows together, so test-set
# statistics influence the scaled training features. Fit on the training rows
# only if that is not intended.
combined_X = pd.concat([train_df[features],test_df])
scaler = StandardScaler()
combined_X = scaler.fit_transform(combined_X)

# fit_transform returns a plain array, so the rebuilt frames get a fresh
# 0..n-1 row index and integer column names (feature names are not kept).
train_df = pd.DataFrame(combined_X[:n_train])
test_df = pd.DataFrame(combined_X[n_train:])

# Re-attach the labels by position in one step. `labels` is still indexed by
# date, so integer lookups such as labels['class2'][i] would rely on the
# deprecated positional fallback of Series.__getitem__.
train_df['class2'] = labels['class2'].astype(object).to_numpy()
train_df['class4'] = labels['class4'].astype(object).to_numpy()

In [10]:
# Preview the first five rows of the scaled training frame.
train_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,68,69,70,71,72,73,74,75,class2,class4
0,0.23771,-0.309011,-0.44704,-1.323625,-1.34572,-1.253581,-1.051472,-1.19394,-0.045588,-0.21501,...,2.052707,0.927877,0.324054,-0.018559,-1.744173,-1.915203,0.116689,0.138621,nonevent,nonevent
1,-0.622351,-0.878554,-0.80768,-1.229485,-1.210446,-1.036006,-0.906886,-1.318914,0.084745,0.266698,...,0.088352,-0.256184,-2.775113,3.25475,-1.321825,-1.521228,-0.750197,-0.558482,nonevent,nonevent
2,-0.746998,-0.947298,-0.908132,-1.239856,-1.228877,-1.133512,-0.628584,-1.266641,-0.306451,-0.570413,...,-0.1069,-0.264368,0.624072,-0.228494,-1.61313,-1.51376,-0.576771,-1.12515,nonevent,nonevent
3,-0.288672,-0.415869,-0.294635,-1.287518,-1.300299,-0.365133,-0.991164,-1.144773,0.565953,1.151443,...,0.08067,-0.217254,-0.638016,1.102341,-0.408554,-0.929189,-0.650076,-0.274089,nonevent,nonevent
4,-0.780036,-0.67099,-0.669236,0.527349,0.005534,-1.001033,-0.53222,0.292039,-0.292112,-0.471555,...,-0.119527,-0.304354,0.505167,-0.416789,-0.84378,-0.626883,-0.814273,-1.054749,event,Ib


In [11]:
# Preview the first five rows of the scaled test frame.
test_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,-0.467394,-0.844612,-0.831325,-1.181806,-1.215169,-0.715284,-0.954526,-1.395449,-0.218031,-0.50602,...,0.987728,-1.189677,0.148885,-0.307758,0.561095,-0.409494,-0.569197,-1.037969,-0.722484,-1.11807
1,0.791149,-0.866857,-0.844977,-0.927701,-0.835214,-1.533052,-1.078775,-1.296271,2.130504,1.992439,...,-0.742018,0.699835,4.548434,1.872111,0.473505,-0.486268,-2.36532,-2.388677,0.994464,-0.147984
2,-0.586822,-0.624548,-0.630906,0.717856,0.673655,-0.662732,-0.033306,1.02329,-0.287586,-0.35052,...,-0.728472,0.411539,-0.505118,0.271496,0.599053,-0.33813,-0.173767,-0.15775,-0.513011,-0.606934
3,0.074288,-0.023355,-0.154381,0.975136,1.58563,-0.299773,0.293422,0.309753,-0.305471,-0.50006,...,-0.579328,0.296396,-0.169522,-0.46624,0.502173,-0.498616,-0.278577,0.499394,-0.133755,-0.286908
4,-0.028912,0.331237,0.084871,-0.591826,-0.357407,1.472421,3.580687,-0.194309,-0.157671,-0.275366,...,-0.08018,-0.616217,-1.386771,1.346765,-3.012171,4.245149,0.571922,0.675382,-0.64018,4.412803


In [12]:
# Persist both cleaned frames; the row index is written as the first, unnamed column.
train_df.to_csv(path_or_buf='data/cleaned_train.csv')
test_df.to_csv(path_or_buf='data/cleaned_test.csv')