In [2]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

%matplotlib inline 

# define project path
# NOTE(review): the file names are appended with plain string concatenation,
# so projectpath has to end with a path separator for the files to resolve
# inside that directory - confirm the value used here.
projectpath = 'projectpath'
train_data_csv = projectpath + 'train.csv'
test_data_csv = projectpath + 'test.csv'

# read the training CSV once; header=None labels the columns 0, 1, 2, ...
df_train = pd.read_csv(train_data_csv, header=None)

# target values are in column 1; take them from the frame already in memory
# (instead of parsing the file a second time) and keep them as a plain array
df_labels = df_train[1].values

# feature matrix: every column except 0 and 1
df_training_data_raw = df_train.drop(columns=[0, 1])

# split dataset into train (80%) and test data (20%); random_state=1 makes the
# split reproducible (could be any number); stratify keeps the class
# proportions the same in the training and test slices
data_train, data_test, labels_train, labels_test = train_test_split(
    df_training_data_raw,
    df_labels,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    stratify=df_labels,
)

## PCA

In [3]:
from sklearn.decomposition import PCA

# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction of the variance (here 95%).
variance_to_keep = 0.95
pca = PCA(n_components=variance_to_keep)

# Fit the projection on the training slice only and reduce it in one step.
data_train_reduced = pca.fit_transform(data_train)

## Gradient Boost Classifier

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

## fit model on training data

In [None]:
# Hyper-parameters for the gradient boosting model, gathered in one place.
# NOTE(review): 9300 trees of depth 7 will take a long time to fit - confirm
# this value is intentional.
gb_params = {
    'n_estimators': 9300,
    'learning_rate': 0.1,
    'max_depth': 7,
    'random_state': 0,  # fixed seed for reproducible fits
}
clf = GradientBoostingClassifier(**gb_params)

# Train on the PCA-reduced training features.
clf.fit(data_train_reduced, labels_train)

## check results

In [None]:
# clf was trained on PCA-reduced features, so the held-out slice has to go
# through the same fitted projection before it can be scored.
data_test_reduced = pca.transform(data_test)
clf.score(data_test_reduced, labels_test)

In [None]:
# predict the labels for the test slice
# (project it with the fitted PCA first - clf was trained on reduced features)
data_test_reduced = pca.transform(data_test)
labels_pred = clf.predict(data_test_reduced)
print(labels_pred[:100])

In [None]:
# Evaluate the predictions on the held-out slice: overall accuracy first,
# then precision and recall with average=None (no averaging across classes).
accuracy = metrics.accuracy_score(labels_test, labels_pred)
precision_per_class = metrics.precision_score(labels_test, labels_pred, average=None)
recall_per_class = metrics.recall_score(labels_test, labels_pred, average=None)

print("Accuracy:", accuracy)
print("Precision:", precision_per_class)
print("Recall:", recall_per_class)

In [None]:
# load the unlabeled data; header=None labels the columns 0, 1, 2, ...
df_test = pd.read_csv(test_data_csv, sep=',', header=None)
# column 0 is not a feature and is dropped before prediction
test_data_new = df_test.drop(columns=[0])
# predict values for new data
# (clf was trained on PCA-reduced features, so apply the fitted projection first)
test_data_new_reduced = pca.transform(test_data_new)
predicted = clf.predict(test_data_new_reduced)
print(predicted[0:100])

In [None]:
# Wrap the predictions in a Series so they can be assigned as a column later.
predicted = pd.Series(predicted)

# Empty frame that already carries the two submission columns.
submission_columns = ['id', 'target']
df_submission = pd.DataFrame(columns=submission_columns)
df_submission.head()

In [None]:
# Destination of the submission file (same concatenation scheme as the inputs).
submission_file = projectpath + 'submission.csv'

# Fill the frame: predictions become 'target', the row index becomes 'id'.
# NOTE(review): 'id' is the positional row index, not column 0 that was
# dropped from the test CSV - confirm this matches the expected submission ids.
df_submission['target'] = predicted
df_submission['id'] = df_submission.index
df_submission.head()

In [None]:
# Write only the 'id' and 'target' columns, in that order, without the index.
df_submission.to_csv(
    submission_file,
    columns=['id', 'target'],
    index=False,
)