In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline

## Step1: Load Data

In [2]:
""" Load train data """
def load_data():
    train_df = pd.read_csv('Data/train.csv')
    test_df = pd.read_csv('Data/test.csv')
    return train_df, test_df

train_df, test_df = load_data()

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3751 entries, 0 to 3750
Columns: 1777 entries, Activity to D1776
dtypes: float64(942), int64(835)
memory usage: 50.9 MB


In [4]:
# Let's examine the first 5 records
train_df.head(5)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [5]:
def generate_train_data(df, target_column='Activity'):
    """Split a labelled frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column alongside the feature columns.
    target_column : str, optional
        Name of the label column. Defaults to ``'Activity'``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        ``(X_train, y_train)``: every column of ``df`` except
        ``target_column``, and ``target_column`` itself. ``df`` is left
        unmodified because ``drop`` returns a new frame.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df``.
    """
    # The target is removed by column label, not by position.
    X_train = df.drop(target_column, axis=1)
    y_train = df[target_column]
    return X_train, y_train

X_train, y_train = generate_train_data(train_df)

In [6]:
print('X_train: {}, y_train: {}'.format(X_train.shape, y_train.shape))

X_train: (3751, 1776), y_train: (3751,)


In [7]:
X_train.head(5)

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,0.243144,...,0,0,0,0,0,0,0,0,0,0
1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,0.10648,...,1,1,1,1,0,1,0,0,1,0
2,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,0.352308,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,0.208989,...,0,0,0,0,0,0,0,0,0,0
4,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,0.125177,...,0,0,0,0,0,0,0,0,0,0


In [8]:
y_train.head(5)

0    1
1    1
2    1
3    1
4    0
Name: Activity, dtype: int64

## Step2: Train

In [9]:
def train_with_RandomForestClassifier(X_train, y_train, test,
                                      n_estimators=100, n_splits=5,
                                      random_state=0):
    """Fit a random forest, score ``test`` and cross-validate on the training data.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels. Column 1 of ``predict_proba`` is reported as the
        predicted probability, which assumes two classes with the positive
        class sorted second (e.g. labels 0/1).
    test : array-like
        Feature matrix to predict on; must have the same columns as
        ``X_train``.
    n_estimators : int, optional
        Number of trees in the forest. Defaults to 100.
    n_splits : int, optional
        Number of stratified cross-validation folds. Defaults to 5.
    random_state : int, optional
        Seed used for both the forest and the fold shuffling so that a
        re-run reproduces the same numbers. Defaults to 0.

    Returns
    -------
    tuple of (list, numpy.ndarray)
        ``predicted_probs`` is a list of ``[row_id, probability]`` pairs with
        1-based row ids in the order of ``test``; ``scores`` holds the
        per-fold accuracy returned by ``cross_val_score``.
    """
    cfr = RandomForestClassifier(n_estimators=n_estimators, n_jobs=2,
                                 random_state=random_state)

    cfr.fit(X_train, y_train)
    predicted_probs = [
        [molecule_id, class_probs[1]]
        for molecule_id, class_probs in enumerate(cfr.predict_proba(test), start=1)
    ]

    # cross validation
    # random_state only takes effect when shuffle=True; newer scikit-learn
    # releases raise an error if a seed is passed while shuffle is False.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True,
                          random_state=random_state)
    scores = cross_val_score(cfr, X_train, y_train, cv=skf, scoring='accuracy')
    print('Accuracy: {}'.format(scores.mean()))

    return predicted_probs, scores
    
predicted_probs, scores = train_with_RandomForestClassifier(X_train, y_train, test_df)

Accuracy: 0.7944520482110488


In [10]:
def prepare_submission(predicted_probs):
    """Build the submission table: a header row followed by the predictions.

    Parameters
    ----------
    predicted_probs : iterable
        Rows to place under the header, e.g. ``[id, probability]`` pairs.

    Returns
    -------
    list
        A new list whose first element is
        ``['MoleculeId', 'PredictedProbability']`` and whose remaining
        elements are the items of ``predicted_probs`` in order.
    """
    header_row = ['MoleculeId', 'PredictedProbability']
    return [header_row] + list(predicted_probs)
result = prepare_submission(predicted_probs)

In [11]:
result[0:5]

[['MoleculeId', 'PredictedProbability'],
 [1, 0.91000000000000003],
 [2, 0.81000000000000005],
 [3, 0.47999999999999998],
 [4, 0.97999999999999998]]

## Step3: Save data for submission

In [12]:
def save_submission(submission_data, path='Submissions/submission.csv'):
    """Write the submission rows to a comma-separated text file.

    Parameters
    ----------
    submission_data : sequence of rows
        Rows to write, one per line; each value is formatted with ``'%s'``
        and values are joined with commas by ``np.savetxt``.
    path : str, optional
        Destination file. Defaults to ``'Submissions/submission.csv'``
        relative to the current working directory.
    """
    # np.savetxt does not create missing directories, so make sure the
    # parent folder exists before writing.
    directory = os.path.dirname(path)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)

    np.savetxt(path, submission_data, delimiter=',', fmt='%s')
    # Report success only after the file has actually been written.
    print('data saved')
    
save_submission(result)

data saved
