## **Setup and Data Preparation**

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default theme globally so any matplotlib/seaborn figure
# drawn later in the notebook picks it up.
sns.set()

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error



In [None]:
# Read the train and test CSVs (in that order) into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)



In [None]:
# Show the first five rows of each dataset, train first, then test
print(train_data.head(), test_data.head(), sep='\n')



   id Sex  Length  Diameter  Height  Whole weight  Whole weight.1  \
0   0   F   0.550     0.430   0.150        0.7715          0.3285   
1   1   F   0.630     0.490   0.145        1.1300          0.4580   
2   2   I   0.160     0.110   0.025        0.0210          0.0055   
3   3   M   0.595     0.475   0.150        0.9145          0.3755   
4   4   I   0.555     0.425   0.130        0.7820          0.3695   

   Whole weight.2  Shell weight  Rings  
0          0.1465        0.2400     11  
1          0.2765        0.3200     11  
2          0.0030        0.0050      6  
3          0.2055        0.2500     10  
4          0.1600        0.1975      9  
      id Sex  Length  Diameter  Height  Whole weight  Whole weight.1  \
0  90615   M   0.645     0.475   0.155        1.2380          0.6185   
1  90616   M   0.580     0.460   0.160        0.9830          0.4785   
2  90617   M   0.560     0.420   0.140        0.8395          0.3525   
3  90618   M   0.570     0.490   0.145        0.874

In [None]:
# Split the training frame: 'Rings' is the regression target,
# every remaining column is used as a feature.
# NOTE(review): only 'Rings' is dropped, so the 'id' column stays in the
# feature matrix — confirm that is intended.
y_train = train_data['Rings']
X_train = train_data.drop(columns='Rings')



In [None]:
# Prepare the categorical feature, then fit the gradient-boosted regressor.

# 'Sex' holds string labels; cast it to pandas' categorical dtype, which is
# what XGBoost expects when enable_categorical=True.
X_train = X_train.astype({'Sex': 'category'})

# Sanity check: the column should now report the 'category' dtype
print("After conversion:", X_train.dtypes['Sex'])

# NOTE(review): learning_rate=0.001 with 1000 boosting rounds is a very small
# total step and may leave the model underfit — confirm against a validation
# split before relying on these settings.
model = XGBRegressor(
    learning_rate=0.001,
    n_estimators=1000,
    enable_categorical=True,
)

# Train on the full training set
model.fit(X_train, y_train)


After conversion: category


In [None]:
# Select features and target from the test data
X_test = test_data.copy()  # ensure this is the same in the training set
y_test = test_data['Sex']

In [None]:
# Pandas get_dummies for one-hot encoding
#X_test = pd.get_dummies(X_test, columns=['Sex'])
X_test['Sex'] = X_test['Sex'].astype('category')  # Assuming you will predict on this later
y_test['Sex'] = X_test['Sex'].astype('category')  # Assuming you will predict on this later
# Confirm the conversion
print("After conversion:", X_test['Sex'].dtype)



After conversion: category


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['Sex'] = X_test['Sex'].astype('category')  # Assuming you will predict on this later


In [None]:
# Predict the 'Rings' value for every row of the test set
y_pred = model.predict(X_test)

# Assemble the submission: one row per test id with its predicted 'Rings'.
# It gets its own name instead of rebinding `test_data`, so the raw test
# frame survives and the earlier cells can be re-run safely.
submission = pd.DataFrame({
    'id': X_test['id'],   # row identifier carried over from the test file
    'Rings': y_pred,      # model predictions
})

# Write under /content, the same location the verification cell reads from,
# rather than relying on the current working directory.
submission.to_csv('/content/predictions.csv', index=False)


In [10]:
# Read the saved predictions file back and show its first ten rows
# to confirm it has the expected 'id' / 'Rings' layout.
prediction = pd.read_csv('/content/predictions.csv')
print(prediction.iloc[:10])

      id      Rings
0  90615   9.803586
1  90616   9.738421
2  90617  10.107829
3  90618  10.145141
4  90619   8.347319
5  90620   9.638489
6  90621  10.809762
7  90622   7.574483
8  90623   8.739467
9  90624  10.756768
