# Letter Recognition

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Read the letter-recognition CSV from its hosted location into a DataFrame
DATA_URL = 'https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_3/datasets/letter-recognition.csv'
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,D,2,2,3,3,2,7,7,7,6,6,6,4,2,8,3,7
19996,C,7,10,8,8,4,4,8,6,9,12,9,13,2,9,3,7
19997,T,6,9,6,7,5,6,11,3,7,11,9,5,2,12,2,4
19998,S,2,3,4,2,1,8,7,2,6,10,6,8,1,9,5,8


In [3]:
# Feature matrix: every column of df except the "lettr" label column
X = df.drop(columns=['lettr'])
X

Unnamed: 0,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,2,2,3,3,2,7,7,7,6,6,6,4,2,8,3,7
19996,7,10,8,8,4,4,8,6,9,12,9,13,2,9,3,7
19997,6,9,6,7,5,6,11,3,7,11,9,5,2,12,2,4
19998,2,3,4,2,1,8,7,2,6,10,6,8,1,9,5,8


In [4]:
# Target vector: the "lettr" column of df, selected as a Series
y = df.loc[:, "lettr"]
y

0        T
1        I
2        D
3        N
4        G
        ..
19995    D
19996    C
19997    T
19998    S
19999    A
Name: lettr, Length: 20000, dtype: object

In [5]:
# Partition features and labels into train/test subsets.
# No split size is passed, so train_test_split's defaults apply;
# random_state=1 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [6]:
# Convert the letter labels to integer codes with a label encoder
# Build the encoder and learn the label-to-integer mapping from the training labels only
le = LabelEncoder()
le.fit(y_train)

# Apply the learned mapping to the training and the testing labels
y_train_encoded, y_test_encoded = (
    le.transform(labels) for labels in (y_train, y_test)
)
y_train_encoded

array([ 1, 22, 19, ..., 19, 12,  5])

In [7]:
# Standardize the training features: fit a StandardScaler on X_train
# and transform X_train with it in a single step
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

array([[-1.05709539, -0.61854094, -0.55159253, ..., -0.86818471,
         0.51106956,  0.73411898],
       [ 0.51134711, -0.31504697, -0.05606112, ...,  1.71736562,
        -1.04162544, -0.49700458],
       [-1.05709539, -0.92203491, -1.54265534, ...,  1.71736562,
        -1.04162544, -0.49700458],
       ...,
       [ 1.55697544,  0.59543493,  0.43947029, ..., -0.22179713,
        -0.26527794, -2.34368991],
       [-0.53428122, -1.83251681, -0.55159253, ..., -2.16095988,
        -0.65345169,  0.1185572 ],
       [-0.01146706,  0.29194096, -0.05606112, ...,  1.07097804,
        -0.26527794, -1.72812814]])

In [8]:
# Transform the test dataset based on the fit from the training dataset.
# Only transform() is called here: the scaler is not refitted, so the test
# features are scaled with the statistics learned from X_train.
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-0.01146706, -0.31504697, -0.55159253, ..., -0.22179713,
        -0.26527794,  0.1185572 ],
       [-1.05709539, -1.22552887, -1.54265534, ..., -0.22179713,
        -0.65345169, -0.49700458],
       [-0.53428122, -0.92203491, -0.55159253, ...,  1.71736562,
        -1.04162544,  0.1185572 ],
       ...,
       [ 1.03416128,  1.20242287,  0.43947029, ...,  1.07097804,
         0.8992433 ,  0.1185572 ],
       [-0.53428122, -0.61854094, -0.05606112, ..., -0.22179713,
        -1.42979918,  0.1185572 ],
       [ 0.51134711,  0.29194096,  1.4305331 , ..., -0.22179713,
         0.8992433 , -1.72812814]])

In [9]:
# Define Models
# Candidate classifiers to compare with 5-fold cross-validation.
# Estimators that accept a seed get random_state=1 (the seed used elsewhere
# in this notebook) so the scores are repeatable across runs, and logistic
# regression gets max_iter=500 to match the standalone logistic regression
# model fitted later in the notebook.
models = [
    ("Logistic Regression", LogisticRegression(random_state=1, max_iter=500)),
    ("SVM", SVC()),
    ("KNN", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier(random_state=1)),
    ("Random Forest", RandomForestClassifier(random_state=1)),
    ("Extremely Random Trees", ExtraTreesClassifier(random_state=1)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=1)),
    ("AdaBoost", AdaBoostClassifier(random_state=1))
]

# Create a Pipeline and Evaluate Each Model
# `results` and `names` are parallel lists: results[i] holds the per-fold
# accuracy array for the model called names[i].
results = []
names = []

for name, model in models:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])

    # Feed the *unscaled* training features to cross_val_score: the pipeline's
    # own StandardScaler is then fitted on the training folds only, so no
    # statistics from the validation fold leak into the scaling step.
    cv_scores = cross_val_score(pipeline, X_train, y_train_encoded, cv=5, scoring='accuracy')

    results.append(cv_scores)
    names.append(name)

    print(f"{name}: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")



# Detailed Evaluation:
# Example for a single model, e.g., the Random Forest
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=1))
])

# The raw letter labels (y_train / y_test) are used here rather than the
# encoded ones, so the rows of the classification report are named by letter.
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

print(classification_report(y_test, predictions))

Logistic Regression: 0.7766 (+/- 0.0076)
SVM: 0.9366 (+/- 0.0045)
KNN: 0.9339 (+/- 0.0008)
Decision Tree: 0.8623 (+/- 0.0038)
Random Forest: 0.9561 (+/- 0.0031)
Extremely Random Trees: 0.9661 (+/- 0.0028)
Gradient Boosting: 0.9131 (+/- 0.0037)




AdaBoost: 0.2515 (+/- 0.0181)
              precision    recall  f1-score   support

           A       0.98      1.00      0.99       202
           B       0.89      0.96      0.93       200
           C       0.99      0.98      0.98       182
           D       0.91      0.97      0.94       197
           E       0.94      0.96      0.95       178
           F       0.95      0.95      0.95       177
           G       0.97      0.96      0.97       188
           H       0.95      0.90      0.92       181
           I       0.96      0.96      0.96       170
           J       0.98      0.95      0.97       174
           K       0.91      0.94      0.92       164
           L       0.99      0.98      0.99       174
           M       0.96      0.97      0.97       188
           N       0.98      0.93      0.95       200
           O       0.94      0.93      0.94       218
           P       0.97      0.95      0.96       215
           Q       0.94      0.99      0.96       1

## Model and Fit to a Logistic Regression Classifier

In [None]:
# Build the logistic regression classifier (random_state=1, up to 500 iterations)
# and fit it to the scaled training features and encoded training labels
lr_model = LogisticRegression(random_state=1, max_iter=500).fit(
    X_train_scaled, y_train_encoded
)

# Show the fitted estimator as the cell output
lr_model

In [None]:
# Check the logistic regression model's accuracy (model.score) on both splits
lr_train_score = lr_model.score(X_train_scaled, y_train_encoded)
lr_test_score = lr_model.score(X_test_scaled, y_test_encoded)
print(f"Training Data Score: {lr_train_score}")
print(f"Testing Data Score: {lr_test_score}")

## Model and Fit to a Support Vector Machine

In [None]:
# Create the support vector machine classifier model with an 'rbf' kernel.
# The kernel must be a valid kernel name (or a callable); an empty string
# is rejected by scikit-learn, so only the 'rbf' model is created here.
svm_model = SVC(kernel='rbf')

# Fit the model to the scaled training features and encoded training labels
svm_model.fit(X_train_scaled, y_train_encoded)

In [None]:
# Check the support vector machine's accuracy (model.score) on both splits
svm_train_score = svm_model.score(X_train_scaled, y_train_encoded)
svm_test_score = svm_model.score(X_test_scaled, y_test_encoded)
print(f"Training Data Score: {svm_train_score}")
print(f"Testing Data Score: {svm_test_score}")

## Model and Fit to a KNN Model

In [None]:
# Loop through different k values to find which has the highest accuracy.
# Note: odd k values are the usual convention for avoiding voting ties,
# although with more than two classes (as here) ties remain possible.
# NOTE(review): the k values are compared on the test split, so the test
# score of the k chosen this way is an optimistic estimate.
k_values = range(1, 20, 2)  # odd k from 1 to 19, shared by the loop and the plot
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train_encoded)
    train_score = knn.score(X_train_scaled, y_train_encoded)
    test_score = knn.score(X_test_scaled, y_test_encoded)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Plot the results on an explicit axes object so the figure stands alone
fig, ax = plt.subplots()
ax.plot(k_values, train_scores, marker='o', label="training scores")
ax.plot(k_values, test_scores, marker="x", label="testing scores")
ax.set_xlabel("k neighbors")
ax.set_ylabel("accuracy score")
ax.set_title("KNN accuracy by number of neighbors")
ax.legend()
plt.show()

In [None]:
# Build a 3-neighbor KNN classifier and fit it to the scaled training
# features and encoded training labels
knn_model = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train_encoded)

# Show the fitted estimator as the cell output
knn_model

In [None]:
# Check the KNN model's accuracy (model.score) on both splits
knn_train_score = knn_model.score(X_train_scaled, y_train_encoded)
knn_test_score = knn_model.score(X_test_scaled, y_test_encoded)
print(f"Training Data Score: {knn_train_score}")
print(f"Testing Data Score: {knn_test_score}")

## Model and Fit to a Decision Tree Classifier

In [None]:
# Create the decision tree classifier model.
# random_state=1 matches the seed used for the other models in this notebook
# and makes the fitted tree (and therefore its scores) repeatable across runs.
dt_model = DecisionTreeClassifier(random_state=1)

# Fit the model to the scaled training features and encoded training labels
dt_model.fit(X_train_scaled, y_train_encoded)

In [None]:
# Check the decision tree's accuracy (model.score) on both splits
dt_train_score = dt_model.score(X_train_scaled, y_train_encoded)
dt_test_score = dt_model.score(X_test_scaled, y_test_encoded)
print(f"Training Data Score: {dt_train_score}")
print(f"Testing Data Score: {dt_test_score}")

## Model and Fit to a Random Forest Classifier

In [None]:
# Build the random forest classifier (128 trees, random_state=1) and fit it
# to the scaled training features and encoded training labels
rf_model = RandomForestClassifier(n_estimators=128, random_state=1).fit(
    X_train_scaled, y_train_encoded
)

# Show the fitted estimator as the cell output
rf_model

In [None]:
# Check the random forest's accuracy (model.score) on both splits
rf_train_score = rf_model.score(X_train_scaled, y_train_encoded)
rf_test_score = rf_model.score(X_test_scaled, y_test_encoded)
print(f"Training Data Score: {rf_train_score}")
print(f"Testing Data Score: {rf_test_score}")

## Model and Fit to a Gradient Boosting Classifier

In [None]:
# Build the gradient boosting classifier (random_state=1), then fit it to the
# scaled training features and encoded training labels
clf = GradientBoostingClassifier(random_state=1)
clf.fit(X_train_scaled, y_train_encoded)

# Report the model's accuracy on the training and testing splits
gb_train_score = clf.score(X_train_scaled, y_train_encoded)
gb_test_score = clf.score(X_test_scaled, y_test_encoded)
print(f'Training Score: {gb_train_score}')
print(f'Testing Score: {gb_test_score}')

## Model and Fit to an Adaptive Boosting Classifier

In [None]:
# Build the AdaBoost classifier (random_state=1), then fit it to the scaled
# training features and encoded training labels.
# Note: this rebinds `clf`, so any model previously held under that name
# is no longer reachable through it.
clf = AdaBoostClassifier(random_state=1)
clf.fit(X_train_scaled, y_train_encoded)

# Report the model's accuracy on the training and testing splits
ada_train_score = clf.score(X_train_scaled, y_train_encoded)
ada_test_score = clf.score(X_test_scaled, y_test_encoded)
print(f'Training Score: {ada_train_score}')
print(f'Testing Score: {ada_test_score}')