In [3]:
import pandas as pd

In [4]:
train_pre = pd.read_csv('train.csv',header=None)
final = pd.read_csv('test.csv',header=None)
trainLabels_pre = pd.read_csv('trainLabels.csv',header=None)

In [5]:
# Some analysis

print("Are the labels balanced?")
print((trainLabels_pre.iloc[:,0]).value_counts())


Are the labels balanced?
1    510
0    490
Name: 0, dtype: int64


In [6]:
# Selecting test + training set
from random import random
test_select = pd.Series([True  if random()<0.2 else False for x in range(len(train_pre))])

train, test = train_pre[~test_select], train_pre[test_select]
trainLabels, testLabels = trainLabels_pre[~test_select], trainLabels_pre[test_select]

In [7]:
train.shape, test.shape

((794, 40), (206, 40))

# Classifiers

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
lr = LogisticRegression(solver='liblinear').fit(train,trainLabels.to_numpy().ravel())

In [10]:
accuracy_score(testLabels,lr.predict(test))

0.8446601941747572

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100).fit(train,trainLabels.to_numpy().ravel())

In [12]:
accuracy_score(testLabels,rf.predict(test))

0.8980582524271845

In [13]:
from sklearn.svm import SVC
svc = SVC(gamma='auto').fit(train,trainLabels.to_numpy().ravel())

In [14]:
accuracy_score(testLabels,svc.predict(test))

0.9320388349514563

In [15]:
## Best score 0.90658 / 0.90797

# Clustering + Classifiers

In [16]:
from sklearn.decomposition import PCA

In [17]:
pca = PCA(n_components=0.95, svd_solver='full')

train_reduced = pca.fit_transform(train)
test_reduced = pca.transform(test)

In [18]:
svc = SVC(gamma='auto').fit(train_reduced,trainLabels.to_numpy().ravel())

In [19]:
accuracy_score(testLabels,svc.predict(test_reduced))

0.9223300970873787

In [20]:
predictions = svc.predict(pca.transform(final))

In [21]:
## Best score 0.90896 / 0.90648

# Classifiers + grid search

In [23]:
from sklearn.model_selection import GridSearchCV

In [31]:
# param grid

param_grid = {
    'n_estimators': [50,100,200,500],
    'max_depth' : [100,50,20],
    'criterion' : ['entropy'],
    'min_samples_split' : [0.1,0.3,0.5,0.75,1.0],
    'min_samples_leaf' : np.linspace(0.1,0.5,num=5),
    'max_features' : np.linspace(0.5,1.0,num=5+1)
}

rf = RandomForestClassifier()

In [32]:
gscv = GridSearchCV(rf,param_grid=param_grid,scoring='accuracy',cv=10,n_jobs=-1,verbose=10)

In [33]:
import numpy as np
result = gscv.fit(X=train_pre,y=np.asarray(trainLabels_pre).ravel())
result.best_params_

Fitting 10 folds for each of 1800 candidates, totalling 18000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:  

[Parallel(n_jobs=-1)]: Done 9361 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 9498 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 9637 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 9776 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 9917 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 10058 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 10201 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 10344 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 10489 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 10634 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 10781 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 10928 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 11077 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 11226 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 11377 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 11528 tasks      

{'criterion': 'entropy',
 'max_depth': 100,
 'max_features': 0.5,
 'min_samples_leaf': 0.1,
 'min_samples_split': 0.1,
 'n_estimators': 100}

In [40]:
best_rf_model = result.best_estimator_

In [41]:
best = best_rf_model.fit(train,trainLabels.to_numpy().ravel())
accuracy_score(testLabels,best.predict(test))

0.8106796116504854

# Gaussian mixture models

In [44]:
from sklearn.mixture import GaussianMixture

In [64]:
# Reimporting to do things a little differently

train = np.asarray(pd.read_csv('train.csv',header=None))
test = np.asarray(pd.read_csv('test.csv',header=None))
trainLabels = np.asarray(pd.read_csv('trainLabels.csv',header=None)).ravel()

all_data = np.r_[train,test]

In [65]:
# Finding Gaussian mixture model based on all data
n_components = [x for x in range(1,20)]
covariant_type = ['full','tied','diag','spherical']

lowest_score = np.infty

for c in n_components:
    for t in covariant_type:
        gm = GaussianMixture(n_components=c,covariance_type=t)
        model = gm.fit(all_data)
        
        score = gm.bic(all_data)

        if score<lowest_score:
            lowest_score = score
            best_gm = gm
            
            print('Found new best one!')
            print('Score: ',score)
            print('n_components: ',c)
            print('covariance_type: ',t)

best_gm.fit(all_data)

train = best_gm.predict_proba(train)
test = best_gm.predict_proba(test)

Found new best one!
Score:  915700.7005552652
n_components:  1
covariance_type:  full
Found new best one!
Score:  915700.7005452963
n_components:  1
covariance_type:  tied
Found new best one!
Score:  906832.1377218185
n_components:  2
covariance_type:  full
Found new best one!
Score:  885402.0752501116
n_components:  3
covariance_type:  full
Found new best one!
Score:  854029.7689210605
n_components:  4
covariance_type:  full


In [66]:
# Apparently 4 components is enough 
print(train.shape)
print(test.shape)

(1000, 4)
(9000, 4)


In [73]:
## Now we do a random forest gridsearch like above, but on the reduced dataset

# param grid

param_grid = {
    'n_estimators': [2,4,8,16,32,64,128],
    'max_depth' : [200,100,50,20],
    'criterion' : ['entropy'],
    'min_samples_split' : [0.1,0.3,0.5,0.75,1.0],
    'min_samples_leaf' : np.linspace(0.1,0.5,num=5),
    'max_features' : np.linspace(0.5,1.0,num=5+1)
}

rf = RandomForestClassifier()

In [74]:
gscv = GridSearchCV(rf,param_grid=param_grid,scoring='accuracy',cv=10,n_jobs=-1,verbose=10)

In [75]:
import numpy as np
result = gscv.fit(X=train,y=trainLabels)
result.best_params_

Fitting 10 folds for each of 4200 candidates, totalling 42000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1791s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1175s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1165s.) Setting batch_size=20.
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 286 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 584 tasks      | elapsed:

{'criterion': 'entropy',
 'max_depth': 200,
 'max_features': 0.5,
 'min_samples_leaf': 0.1,
 'min_samples_split': 0.1,
 'n_estimators': 4}

In [76]:
print(result.best_score_)

0.996


In [77]:
best_rf_model = result.best_estimator_

In [78]:
best = best_rf_model.fit(train,trainLabels)
predictions = best.predict(test)

# Submit to Kaggle

In [79]:
file_name = "solution_5.csv"
message = "New attempt of gaussian mixture models + rf CV"
header = ['Id','Solution']



pd.DataFrame(
    data=list(zip([x for x in range(1,len(final)+1)], predictions.tolist()))
).to_csv('{}'.format(file_name), index=False, header=header)

In [80]:
%%bash -s "$file_name" "$message"
kaggle competitions submit -c data-science-london-scikit-learn -f $1 -m "$2"

Successfully submitted to Data Science London + Scikit-learn

  0%|          | 0.00/60.5k [00:00<?, ?B/s] 13%|█▎        | 8.00k/60.5k [00:00<00:00, 71.9kB/s]100%|██████████| 60.5k/60.5k [00:04<00:00, 12.8kB/s]
