In [9]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline

In [11]:
# Read the training and test sets from the working directory
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

# Discard every training row that contains at least one missing value
# (rebinding instead of inplace=True; the resulting frame is the same)
train_data = train_data.dropna(axis=0)

# Keep the test ids aside: they are needed for the Kaggle submission,
# but 'id' must not be used as a feature
id_col = test_data['id']

# The last column holds the label; every column before it is an attribute
y = train_data.iloc[:, -1]
X = train_data.iloc[:, :-1].drop(columns=['id'])
test_data = test_data.drop(columns=['id'])

# Restrict both sets to the numeric attributes found in the training data,
# which drops categorical/nominal columns
numeric_attributes = X.select_dtypes(include='number').columns
X = X.loc[:, numeric_attributes]
test_data = test_data.loc[:, numeric_attributes]

In [12]:
# Single seed shared by the CV splitter and the forest so that a fresh
# "Restart & Run All" reproduces the same scores and predictions.
SEED = 42

# Standardise the features, then classify with a 200-tree random forest
# using the entropy split criterion. Without random_state the forest's
# bootstrap samples and feature sub-sampling differ on every run, which made
# the reported CV scores non-reproducible even though the folds were fixed.
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=SEED),
)

# Estimate accuracy with 10-fold stratified cross-validation.
# cross_val_score clones the pipeline for each fold, so the scaler and the
# forest are refit on each training split and this step does not affect
# the final fit below.
kfolds = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
cv_scores = cross_val_score(pipeline, X, y, cv=kfolds, scoring='accuracy')

print("CV Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

# Fit the final model on the full training set and predict the test set
pipeline.fit(X, y)
predictions = pipeline.predict(test_data)


CV Scores: [0.74086379 0.74418605 0.72425249 0.68       0.74       0.72
 0.72333333 0.72666667 0.73333333 0.73      ]
Mean CV Score: 0.7262635658914729
