In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from scipy.ndimage import shift


In [2]:
train_df = pd.read_csv(r'D:\Kaggle\digit-recognizer\train.csv')
test_df = pd.read_csv(r'D:\Kaggle\digit-recognizer\test.csv')
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()

In [3]:
train_df.columns

Index(['label', 'pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5',
       'pixel6', 'pixel7', 'pixel8',
       ...
       'pixel774', 'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779',
       'pixel780', 'pixel781', 'pixel782', 'pixel783'],
      dtype='object', length=785)

In [6]:
def row_operation(row, direction):
    label = row.iloc[0]
    image = row.iloc[1:].values.reshape((28,28))
    
    if direction == 'left':
        shifted_img = shift(image, shift=[0, -1], mode='constant', cval=0)
    elif direction == 'right':
        shifted_img = shift(image, shift=[0, 1], mode='constant', cval=0)
    elif direction == 'top':
        shifted_img = shift(image, shift=[-1, 0], mode='constant', cval=0)
    elif direction == 'bottom':
        shifted_img = shift(image, shift=[1, 0], mode='constant', cval=0)
    
    
    return [label]+shifted_img.flatten().tolist()

# Apply the function row-wise for different shifts
df_left = train_df.apply(lambda row: row_operation(row, 'left'), axis=1,result_type='expand')
df_right = train_df.apply(lambda row: row_operation(row, 'right'), axis=1,result_type='expand')
df_top = train_df.apply(lambda row: row_operation(row, 'top'), axis=1,result_type='expand')
df_bottom = train_df.apply(lambda row: row_operation(row, 'bottom'), axis=1,result_type='expand')





In [7]:
df_left.columns= df_right.columns= df_top.columns= df_bottom.columns = train_df.columns

train_df = pd.concat([train_df,df_left,df_right,df_bottom,df_top],axis = 0).reset_index(drop=True)

In [8]:
X = train_df.drop(['label'],axis=1)
y = train_df['label']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
x_out = scaler.transform(test_df)

In [None]:
log_reg = LogisticRegression(max_iter=100, solver='lbfgs', multi_class='multinomial')
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
out = log_reg.predict(test_df)
submission = pd.DataFrame({
    'ImageId': range(1, len(out) + 1),
    'Label': out
})

submission.to_csv(r'D:\Kaggle\digit-recognizer\reg_submission.csv', index=False)

In [10]:
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier()

tree_clf.fit(X_train, y_train)

y_pred = tree_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

out = tree_clf.predict(test_df)

submission = pd.DataFrame({
    'ImageId': range(1, len(out) + 1),
    'Label': out
})

submission.to_csv(r'D:\Kaggle\digit-recognizer\tree_submission.csv', index=False)


Accuracy: 0.84


In [13]:
from sklearn.ensemble import RandomForestClassifier


# Train the model
rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")

# Predict on the competition test data
out = rf_clf.predict(test_df)

# Prepare submission
submission = pd.DataFrame({
    'ImageId': range(1, len(out) + 1),
    'Label': out
})

submission.to_csv(r'D:\Kaggle\digit-recognizer\random_submission.csv', index=False)


Accuracy: 0.97529


In [12]:
print(f"Accuracy: {accuracy:.5f}")


Accuracy: 0.97521


In [18]:
from sklearn.svm import SVC


# Train the model
svm_clf = SVC(kernel='rbf', C=1, max_iter=10000,verbose=True)
svm_clf.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")

# Predict on the competition test data
out = svm_clf.predict(x_out)

# Prepare submission
submission = pd.DataFrame({
    'ImageId': range(1, len(out) + 1),
    'Label': out
})

submission.to_csv(r'D:\Kaggle\digit-recognizer\svc_submission.csv', index=False)


[LibSVM]Accuracy: 0.97129


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Train the model
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = gb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Predict on the competition test data
out = gb_clf.predict(test_df)

# Prepare submission
submission = pd.DataFrame({
    'ImageId': range(1, len(out) + 1),
    'Label': out
})

submission.to_csv(r'D:\Kaggle\digit-recognizer\gb_submission.csv', index=False)


In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Train the model
knn_clf = KNeighborsClassifier(n_neighbors=3, weights = 'distance')
knn_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")

# Predict on the competition test data
out = knn_clf.predict(test_df)

# Prepare submission
submission = pd.DataFrame({
    'ImageId': range(1, len(out) + 1),
    'Label': out
})

submission.to_csv(r'D:\Kaggle\digit-recognizer\kn_submission.csv', index=False)


Accuracy: 0.98236


In [7]:
knn = KNeighborsClassifier()

param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance']
}

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy',verbose=2)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
best_knn = grid_search.best_estimator_ # or random_search.best_estimator_
y_pred = best_knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy:.2f}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .....................n_neighbors=3, weights=uniform; total time=   3.6s
[CV] END .....................n_neighbors=3, weights=uniform; total time=   3.7s
[CV] END .....................n_neighbors=3, weights=uniform; total time=   3.3s
[CV] END .....................n_neighbors=3, weights=uniform; total time=   3.9s
[CV] END .....................n_neighbors=3, weights=uniform; total time=   3.5s
[CV] END ....................n_neighbors=3, weights=distance; total time=   3.2s
[CV] END ....................n_neighbors=3, weights=distance; total time=   3.4s
[CV] END ....................n_neighbors=3, weights=distance; total time=   3.5s
[CV] END ....................n_neighbors=3, weights=distance; total time=   3.1s
[CV] END ....................n_neighbors=3, weights=distance; total time=   3.3s
[CV] END .....................n_neighbors=5, weights=uniform; total time=   3.3s
[CV] END .....................n_neighbors=5, wei

In [None]:
from sklearn.svm import SVC


# Train the model
svm_clf = SVC(kernel='poly', degree=9, C=1, max_iter=10000,verbose=True)
svm_clf.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")

# Predict on the competition test data
out = svm_clf.predict(x_out)

# Prepare submission
submission = pd.DataFrame({
    'ImageId': range(1, len(out) + 1),
    'Label': out
})

submission.to_csv(r'D:\Kaggle\digit-recognizer\svc_submission.csv', index=False)


In [None]:
SVC(kernel='poly', degree=9, C=1, random_state=42)