In [40]:
import pandas as pd
import numpy as np
import seaborn as sns

np.random.seed(42)

In [41]:
train = pd.read_csv('train.csv')
num_columns = len(train.columns)
pd.set_option("display.max_columns", num_columns)

In [42]:
test = pd.read_csv('test.csv')
num_columns = len(test.columns)
print(test.shape)
train.shape

(2999, 66)


(8677, 66)

In [43]:
train.drop(columns = 'C', inplace = True)
test.drop(columns = 'A', inplace = True)

In [45]:
test.dropna(axis=0, how='any', inplace = True)
train.dropna(axis=0, how='any', inplace = True)

In [47]:
features = train[[x for x in train if x != '0.4']]
target = train['0.4']

In [48]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(
    features,
    target, 
    test_size = .2,
    random_state = 42
)

## Gaussian

In [49]:
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB().fit(train_X, train_y)
gaussian.score(test_X, test_y)

0.875

## Bernoulli

In [50]:
from sklearn.naive_bayes import BernoulliNB

bernoulli = BernoulliNB().fit(train_X, train_y)
bernoulli.score(test_X, test_y)

0.4619815668202765

## Robust

In [51]:
from sklearn.preprocessing import RobustScaler

scalar = RobustScaler().fit(train_X)
knn_robust = KNeighborsClassifier()
knn_robust.fit(scalar.transform(train_X), train_y.values.ravel())
knn_robust.score(scalar.transform(test_X), test_y)

0.7753456221198156

## KNN

In [52]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(train_X, train_y.values.ravel())
knn.score(test_X, test_y)

0.6866359447004609

### RandomForestClassifier

In [54]:
from sklearn.ensemble import RandomForestClassifier

ks_rf = RandomForestClassifier().fit(train_X, train_y)
predictions = ks_rf.predict(test_X)
ks_rf.score(test_X, test_y)



0.8922811059907834

In [56]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100).fit(train_X, train_y)

predicitions = rf.predict(test_X)
# predicitions
rf.score(test_X, test_y)

0.9170506912442397

### LogisticRegression

In [57]:
from sklearn.linear_model import LogisticRegression

log_regr = LogisticRegression(max_iter = 1000)
log_regr.fit(train_X, train_y.values.ravel())
log_regr.score(test_X, test_y)



0.33986175115207373

## TEST

In [80]:
features = test[[col for col in test if col != '0.4']]
target = test['0.4']

In [81]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(
    features,
    target, 
    test_size = .2,
    random_state = 42
)

In [82]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100).fit(train_X, train_y)

predicitions = rf.predict(test_X)

predicitions

array([1, 1, 3, 3, 0, 3, 0, 2, 1, 1, 3, 0, 3, 0, 0, 1, 0, 3, 2, 2, 0, 2,
       0, 1, 1, 0, 1, 1, 3, 2, 1, 3, 3, 2, 1, 3, 3, 1, 2, 3, 3, 2, 3, 0,
       2, 2, 1, 2, 0, 3, 2, 1, 3, 3, 2, 2, 1, 0, 2, 0, 3, 3, 2, 0, 3, 2,
       1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 0, 2, 3, 2, 0, 2, 2, 3, 1, 3, 2, 2,
       0, 0, 3, 1, 1, 3, 1, 0, 1, 2, 3, 3, 0, 3, 3, 1, 3, 2, 0, 1, 1, 3,
       1, 2, 2, 3, 2, 1, 0, 0, 0, 0, 0, 2, 3, 3, 0, 2, 1, 0, 3, 0, 1, 1,
       2, 0, 1, 1, 1, 1, 0, 2, 2, 2, 0, 2, 3, 3, 2, 3, 1, 1, 0, 1, 3, 2,
       3, 1, 1, 2, 3, 3, 0, 3, 1, 0, 0, 2, 3, 3, 1, 0, 2, 3, 3, 3, 2, 2,
       1, 2, 2, 3, 0, 3, 1, 1, 3, 3, 3, 1, 2, 1, 2, 3, 2, 2, 0, 0, 2, 2,
       1, 1, 3, 0, 0, 0, 2, 0, 0, 0, 1, 2, 0, 3, 3, 1, 3, 0, 0, 0, 0, 0,
       0, 1, 0, 3, 0, 2, 2, 2, 0, 2, 1, 0, 2, 1, 3, 1, 1, 0, 1, 1, 1, 2,
       2, 2, 3, 0, 1, 1, 2, 0, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 0, 0,
       1, 1, 3, 1, 2, 2, 3, 0, 2, 1, 0, 1, 2, 2, 0, 2, 3, 2, 2, 2, 0, 0,
       1, 3, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 1, 2,

In [84]:
rf.score(test_X, test_y)

0.8896321070234113

In [85]:
# clf = Pipeline([
#   ('feature_selection', SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))),
#   ('classification', RandomForestClassifier())
# ])
# pipe = clf.fit(train_X, train_y)
# pipe.score(test_X, test_y)

In [64]:
#predictions2 = pipe.predict(test_X)
# predictions2

In [79]:
from sklearn import metrics

metrics.r2_score(prediction, test_y)


-6.393340434669295