In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Read the labelled training measurements into a DataFrame and preview the
# first five rows (five is also what a bare .head() shows).
df = pd.read_csv(filepath_or_buffer="grain-training.csv")
df.head(5)

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,11366,423.11499,171.906647,85.5793,0.867278,11599,0.611404,Osmancik
1,16523,531.892029,224.995422,94.417702,0.907689,16911,0.577041,Cammeo
2,11088,418.208008,172.02742,82.935669,0.876112,11284,0.624993,Osmancik
3,14528,475.447998,192.198563,97.417427,0.862029,14795,0.62949,Cammeo
4,8990,389.377014,157.749603,73.919182,0.883418,9297,0.625261,Osmancik


In [59]:
# Show each column's dtype to see which columns are already numeric and which
# (here, the object-typed Class column) still need encoding before modelling.
df.dtypes

Area                   int64
Perimeter            float64
Major_Axis_Length    float64
Minor_Axis_Length    float64
Eccentricity         float64
Convex_Area            int64
Extent               float64
Class                 object
dtype: object

In [60]:
# Replace the string class labels with integer category codes, in place.
# In the output shown below, 'Cammeo' rows become 0 and 'Osmancik' rows become 1.
df['Class'] = df['Class'].astype('category').cat.codes
print(df['Class'])

0       1
1       0
2       1
3       0
4       1
       ..
3043    0
3044    0
3045    1
3046    0
3047    1
Name: Class, Length: 3048, dtype: int8


In [61]:
# Pull the encoded label column out as the target; every remaining column is
# used as a model input.
y = df['Class']
X = df.drop(columns=['Class'])

In [62]:
# Hold back 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [63]:

# Hyper-parameter grids, narrowed over three rounds. The comment after each
# grid records the best combination noted for it. Only param_grid_3 is passed
# to the search below; the earlier grids are kept as a record of the search.

# Round 1: coarse sweep.
param_grid_1 = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
)
# Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 150}

# Round 2: more trees, smaller learning rates.
param_grid_2 = dict(
    n_estimators=[150, 200, 250],
    max_depth=[3, 2, 9],
    learning_rate=[0.01, 0.02, 0.03],
)
# Best Parameters: {'learning_rate': 0.02, 'max_depth': 2, 'n_estimators': 200}

# Round 3: tight window around the round-2 winner.
param_grid_3 = dict(
    n_estimators=[175, 200, 225],
    max_depth=[3, 2, 9],
    learning_rate=[0.015, 0.02, 0.025],
)
# Best Parameters: {'learning_rate': 0.02, 'max_depth': 2, 'n_estimators': 200}

# Base estimator with library defaults; the grid supplies the values to try.
model = XGBClassifier()

# Exhaustive search over param_grid_3 with 3-fold cross-validation, ranked by
# accuracy; n_jobs=-1 asks scikit-learn to run the fits in parallel.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid_3,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

# Run the search on the training split only, so the test split stays unseen.
grid_result = grid_search.fit(X_train, y_train)

# Report the winning combination; the next cell reuses best_params.
best_params = grid_result.best_params_
print("Best Parameters:", best_params)


Best Parameters: {'learning_rate': 0.02, 'max_depth': 2, 'n_estimators': 200}


In [64]:
# Build a fresh classifier configured with the grid-search winners and train
# it on the whole training split. This rebinds `model`, replacing the
# default-parameter instance that was handed to the grid search.
model = XGBClassifier(**best_params)
model.fit(X_train, y_train)

In [65]:
# Predict the class code for every row of the held-back test split.
y_pred = model.predict(X_test)

# Fraction of test rows whose predicted code matches the true code.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9180327868852459


In [69]:
import pandas as pd
from xgboost import XGBClassifier

# Score the unlabelled holdout file with the tuned model and export the
# predicted variety names, one per row, as a single-column CSV.
holdout_data = pd.read_csv('grain-holdout.csv')

# Take an independent copy of exactly the columns the model was trained on, in
# training order. A plain `X_holdout = holdout_data` would be an alias, so
# adding the 'predictions' column below would also change the feature frame
# and break any later model.predict(X_holdout). A missing feature column
# raises a KeyError here instead of failing inside the model.
X_holdout = holdout_data[X_train.columns].copy()

holdout_predictions = model.predict(X_holdout)

# Decode the integer class codes back to variety names. The codes come from
# the category encoding of the training target (Cammeo -> 0, Osmancik -> 1).
# An explicit lookup raises on an unexpected code instead of silently
# labelling it 'Cammeo'.
class_labels = {0: 'Cammeo', 1: 'Osmancik'}
prediction_labels = [class_labels[int(pred)] for pred in holdout_predictions]

holdout_data['predictions'] = prediction_labels

print(holdout_data['predictions'])
# index=False writes only the 'predictions' column (with its header).
holdout_data['predictions'].to_csv('Alec_Day-ice-grain-predictions.csv', index=False)

0        Cammeo
1      Osmancik
2      Osmancik
3      Osmancik
4      Osmancik
         ...   
757    Osmancik
758    Osmancik
759      Cammeo
760      Cammeo
761    Osmancik
Name: predictions, Length: 762, dtype: object
