In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

%matplotlib inline 

# define project path
projectpath = 'projectpath'
train_data_csv = projectpath + 'train.csv'
test_data_csv = projectpath + 'test.csv'

df_train = pd.read_csv(train_data_csv,header=None)

# save target values with index into new dataframe 'labels'
df_labels = pd.read_csv(train_data_csv, names=['target'], header=None, usecols=[1])

# save as array
df_labels = df_labels['target'].values

# save data into new dataframe 'training_data'
df_training_data_raw = df_train.drop(columns=[0,1])

# split dataset into train (80%) and test data (20%); random_state=1 makes it reproducible (could be any number); stratify ensures that the proportion stays in test and training data sets
data_train, data_test, labels_train, labels_test = train_test_split(df_training_data_raw, df_labels, test_size=0.2, random_state=1, shuffle=True, stratify=df_labels)

# Stochastic Gradient Descent

In [2]:
from sklearn import linear_model

In [3]:
clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf.fit(data_train, labels_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=1000, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False)

## see how model performs on test data

In [4]:
# predict the labels for the test slice
labels_pred = clf.predict(data_test)
print(labels_pred[:100])

['E' 'I' 'D' 'B' 'H' 'B' 'H' 'H' 'G' 'D' 'F' 'D' 'G' 'E' 'E' 'F' 'C' 'I'
 'G' 'G' 'H' 'F' 'J' 'I' 'F' 'C' 'I' 'I' 'A' 'J' 'C' 'F' 'J' 'I' 'E' 'D'
 'I' 'C' 'G' 'B' 'A' 'H' 'H' 'C' 'J' 'I' 'J' 'A' 'C' 'B' 'F' 'D' 'J' 'J'
 'A' 'F' 'A' 'I' 'C' 'C' 'B' 'A' 'H' 'D' 'C' 'C' 'C' 'I' 'H' 'F' 'C' 'F'
 'I' 'A' 'A' 'E' 'G' 'E' 'J' 'H' 'H' 'C' 'H' 'I' 'I' 'B' 'H' 'D' 'D' 'I'
 'D' 'B' 'D' 'D' 'B' 'D' 'D' 'H' 'I' 'B']


In [5]:
# check the accuracy for the test data
print("Accuracy:", metrics.accuracy_score(labels_test, labels_pred))
# check the precision for the test data
print("Precision:",metrics.precision_score(labels_test, labels_pred, average=None))
# check the recall for the test data
print("Recall:",metrics.recall_score(labels_test, labels_pred, average=None))

Accuracy: 0.8370726495726496
Precision: [0.76444444 0.88636364 0.79512195 0.68897638 0.8951049  0.93103448
 0.96078431 0.96551724 0.74107143 0.9132948 ]
Recall: [0.9197861  0.8342246  0.87165775 0.93582888 0.68449198 0.86170213
 0.78609626 0.7486631  0.88297872 0.84491979]


In [6]:
df_test=pd.read_csv(test_data_csv, sep=',', header=None)
test_data_new = df_test.drop(columns=[0])
# predict values for new data
predicted = clf.predict(test_data_new)
print(predicted[0:100])

['A' 'A' 'A' 'A' 'D' 'B' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'D' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'I' 'A' 'A' 'A' 'A' 'A' 'A' 'D' 'A' 'A' 'B' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'H' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'J' 'A' 'A' 'I' 'E' 'A' 'A' 'A' 'A' 'A' 'A' 'D'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'I' 'A']


In [7]:
df_submission = pd.DataFrame(columns=['id','target'])
predicted = pd.Series(predicted)
df_submission.head()

Unnamed: 0,id,target


In [8]:
# add calculated target values to csv and format for submission
df_submission['target'] = predicted
df_submission['id'] = df_submission.index
submission_file = projectpath + 'submission.csv'
df_submission.head()

Unnamed: 0,id,target
0,0,A
1,1,D
2,2,A
3,3,D
4,4,D


In [9]:
df_submission.to_csv(submission_file, index=False, columns=['id','target'])