procedure:
    0. prepare the training data for training a model
    1. use SVC
    2. use Random Forest
    3. use the model on new data (test)
    4. write to submission file (csv)

# import statements

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# declare paths and variables for training data

In [52]:
# locations of the training features and of the matching target labels
train_data_csv = "path_to_training_data.csv"
labels = "path_to_target_info_training_data.csv"

# read the comma-separated training features
train_data = train_data_csv
df_train = pd.read_csv(train_data, sep=',')

# feature matrix: every column except img_name, which is not used as a feature
# (the target labels are read from their own file, see the `labels` path)
X = df_train.drop('img_name', axis=1)
# preview the first rows to confirm img_name is gone
X.head()

Unnamed: 0,x000,x001,x002,x003,x004,x005,x006,x007,x008,x009,...,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099
0,1,1,1,0,1,1,1,1,1,1,...,1,1,1,1,0,1,1,1,1,1
1,1,1,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,1,1,1
2,0,0,1,1,1,1,1,1,0,0,...,1,0,0,1,1,1,1,1,0,0
3,0,1,1,1,1,1,1,1,1,1,...,1,0,0,1,1,1,1,0,0,1
4,1,1,1,1,0,1,1,1,1,1,...,1,1,1,1,0,1,1,1,1,1


# prepare the target labels and split the data for training

In [53]:
# read the target labels; this file is semicolon-separated
df_labels = pd.read_csv(labels, delimiter=';')
# preview the first rows
df_labels.head()

Unnamed: 0,id,target
0,0,2
1,1,1
2,2,0
3,3,0
4,4,2


In [55]:
# pull the target column out of the labels frame as a NumPy array
y = df_labels.loc[:, 'target'].values

# peek at the first five targets
y[:5]

array([2, 1, 0, 0, 2], dtype=int64)

In [56]:
# hold out 20% of the samples for testing and train on the remaining 80%;
# random_state=1 makes the split reproducible and stratify=y keeps the class
# proportions the same in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# SVC

In [57]:
from sklearn.svm import SVC

# build an SVC with gamma="auto" and fit it on the training slice in one step
# (fit returns the estimator itself)
svm_clf = SVC(gamma="auto").fit(X_train, y_train)
# show the fitted estimator
svm_clf

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [58]:
# predict the labels for the held-out test slice
y_pred = svm_clf.predict(X_test)
print(y_pred)

# score the predictions against the true labels of the held-out slice;
# without this the test split is created but never used for evaluation
print('hold-out accuracy:{}'.format(np.mean(y_pred == y_test)))

[2 0 0 2 1 1 0 1 1 1 1 0 2 2 2 0 1 0 2 0 0 0 0 1 1 2 0 1 2 0 2 0 2 2 0 2 0
 2 0 2 2 1 0 0 1 2 1 0 0 2 1 1 0 1 0 1 2 2 1 0 2 1 2 1 2 2 1 2 1 1 2 2 0 2
 0 2 2 0 0 2 0 0 1 2 1 1 1 0 0 0 0 2 1 0 0 0 2 2 2 1 2 2 1 0 0 1 1 2 2 2 2
 1 1 2 1 1 1 1 0 2 1 0 1 1 2 2 0 1 1 2 0 1 0 0 0 1 2 0 2 1 1 0 0]


In [59]:
# estimate the SVC's accuracy with 10-fold cross-validation on the training slice
# (cross_val_score is already imported in the import cell at the top)
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
# mean score over the 10 folds
svm_scores.mean()

0.9877484659925676

# Random Forest

In [60]:
from sklearn.ensemble import RandomForestClassifier

# random forest with 100 trees and a fixed seed
forest_clf = RandomForestClassifier(random_state=42, n_estimators=100)
# 10-fold cross-validation on the training slice, reported as the mean score
forest_scores = cross_val_score(forest_clf, X=X_train, y=y_train, cv=10)
forest_scores.mean()

0.9912572379223923

In [61]:
# a second random forest (100 trees, fixed seed), scored on the full data set
model_cv = RandomForestClassifier(random_state=42, n_estimators=100)

# 5-fold cross-validation over all of X and y
cv_scores = cross_val_score(model_cv, X, y, cv=5)

# show the per-fold scores (accuracy), then their average
print(cv_scores)
print('cv_scores mean:{}'.format(cv_scores.mean()))

[0.99305556 0.99305556 0.98601399 1.         1.        ]
cv_scores mean:0.9944250194250195


In [62]:
# grid search over the number of trees of a random forest
csv_model = RandomForestClassifier(random_state=42)

# candidate values: n_estimators = 1, 2, ..., 99
param_grid = dict(n_estimators=np.arange(1, 100))

# evaluate every candidate with 5-fold cross-validation on the full data set;
# fit returns the search object itself
csv_gscv = GridSearchCV(csv_model, param_grid, cv=5).fit(X, y)

# best-performing value of n_estimators
csv_gscv.best_params_



{'n_estimators': 19}

In [63]:
# mean cross-validated score obtained with the best n_estimators found by the grid search
csv_gscv.best_score_

0.9944055944055944

# Use the model on the test data

In [64]:
# read the new data the trained model will be applied to
test_data = 'path_to_new_data.csv'
df_test = pd.read_csv(test_data, delimiter=',')
# preview the first rows
df_test.head()

Unnamed: 0,img_name,x000,x001,x002,x003,x004,x005,x006,x007,x008,...,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099
0,0715-u014.png,0,1,1,1,1,1,1,1,0,...,1,0,1,1,1,1,1,1,1,0
1,0716-u001.png,1,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,1,1
2,0717-u014.png,1,1,1,1,1,1,1,0,0,...,1,0,1,1,1,1,1,1,1,0
3,0718-u003.png,1,1,1,0,0,1,1,1,1,...,0,1,1,1,1,1,1,0,0,1
4,0719-u027.png,1,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1


In [65]:
# feature matrix for the new data: everything except the unused img_name column
X_new = df_test.drop('img_name', axis=1)

# preview to confirm img_name is gone
X_new.head()

Unnamed: 0,x000,x001,x002,x003,x004,x005,x006,x007,x008,x009,...,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099
0,0,1,1,1,1,1,1,1,0,1,...,1,0,1,1,1,1,1,1,1,0
1,1,1,0,0,0,0,0,0,1,1,...,1,0,0,0,0,0,0,0,1,1
2,1,1,1,1,1,1,1,0,0,1,...,1,0,1,1,1,1,1,1,1,0
3,1,1,1,0,0,1,1,1,1,0,...,0,1,1,1,1,1,1,0,0,1
4,1,0,0,0,0,0,0,0,1,1,...,1,0,0,0,0,0,0,0,0,1


In [66]:
# preview the grid-search model's predictions for the first ten new samples
csv_gscv.predict(X_new)[:10]

array([0, 1, 0, 0, 1, 2, 0, 1, 1, 2], dtype=int64)

In [67]:
# refit the random forest on all training samples (X, y), then predict the new
# data once; the former extra predict(...)[0:5] call was a mid-cell expression
# whose result was neither shown nor stored, so it only repeated the work
model_cv.fit(X, y)
predicted = model_cv.predict(X_new)
print(predicted)

[0 1 0 0 1 2 0 1 1 2 0 1 1 0 0 2 2 0 0 1 2 1 0 0 2 0 1 1 1 0 1 0 1 2 2 2 1
 0 2 2 2 0 1 0 2 0 0 1 0 0 0 0 2 0 2 0 2 1 2 0 2 1 2 1 2 0 1 1 1 1 0 2 0 0
 0 2 0 1 0 0 1 0 1 1 1 2 1 0 0 1 1 0 2 1 0 1 2 0 0 2 0 2 0 0 1 0 2 0 1 1 2
 2 1 2 0 2 0 0 0 1 0 2 0 1 1 0 0 1 2 1 2 0 1 2 2 0 1 1 2 0 2 0 0 1 2 0 1 1
 0 1 0 1 0 2 2 2 2 2 2 1 2 2 2 0 0 1 0 0 2 0 1 2 1 0 0 0 0 0 2 1 1 1 0 0 1
 0 2 1 0 2 2 1 0 2 2 1 2 2 0 1 1 2 2 2 0 2 2 0 2 0 0 1 1 0 2 0 1 2 1 0 2 2
 0 0 2 0 2 0 0 2 1 2 0 2 2 2 0 0 2 2 1 2 2 2 0 0 1 2 1 2 0 2 2 0 0 2 2 0 2
 2 0 2 1 2 2 0 0 1 2 1 2 1 1 1 1 1 2 0 0 1 0 0 2 1 1 1 2 1 1 0 1 1 2 1 2 0
 2 2 1 2 0 0 0 1 1 0 1 1 1 1 0 0 0 1 0 0 2 0 2 0 0 1 2 2 0 2 0 2 2 2 0 0 0
 1 2 2 2 2 2 2 1 0 1 1 2 0 0 0 0 2 0 1 1 1 0 2 1 1 0 1 1 2 2 1 2 0 2 0 1 0
 0 1 0 2 2 1 2 0 1 1 1 0 0 0 2 0 1 2 2 2 2 0 2 0 1 2 0 1 2 1 1 1 0 0 2 1 2
 0 0 2 0 1 1 1 0 1 1 1 2 0 2 1 1 1 1 0 1 0 2 2 0 1 2 1 1 2 0 1 2 1 2 2 2 2
 1 0 0 0 1 0 2 0 1 0 2 0 1 2 2 1 1 2 1 1 2 1 1 2 2 2 1 0 2 0 2 1 0 1 2 0 2
 1 0 2 0 1 0 2 2 2 1 0 0 

In [68]:
# build the submission table (id, target) and write it to csv.
# The frame is created fresh from the loaded test data and the predictions:
# the name `df_new` used here before is not defined anywhere in this notebook,
# so the cell failed with a NameError on a fresh kernel.

# offset added to the 0-based row position to obtain the submission id
# NOTE(review): 715 presumably is the id of the first test sample (the first
# test image is named 0715-...) — confirm against the submission format
FIRST_TEST_ID = 715

submission_file = "path_to_submission_file.csv"

df_submission = pd.DataFrame({
    'id': df_test.index.values + FIRST_TEST_ID,
    'target': predicted,
})
df_submission.to_csv(submission_file, index=False, columns=['id', 'target'])