In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from warnings import filterwarnings
filterwarnings('ignore')

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old aliases were dropped in later releases, so use whichever name this
# installation provides (falling back to the default style if neither exists).
_grid_style = next(
    (
        name
        for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
        if name in plt.style.available
    ),
    'default',
)
plt.style.use(_grid_style)

# Let figures tighten their own layout and make axis labels/titles bold.
plt.rc('figure', autolayout=True)
plt.rc(
    'axes',
    labelweight='bold',
    labelsize='large',
    titleweight='bold',
    titlesize=14,
    titlepad=10
)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [2]:
# Load the training data with 'id' as the row index.
# `verbose=True` only printed parser diagnostics and the keyword is deprecated
# in recent pandas releases, so it is omitted.
train = pd.read_csv('train.csv', index_col='id')

In [3]:
# Preview the first five rows of the training data.
train.head(n=5)

In [4]:
# Preview the last five rows of the training data.
train.tail(n=5)

In [5]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [6]:
# (rows, columns) of the training frame.
train.shape

In [7]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
train.describe(include='all').transpose()

In [8]:
# Bar chart of how many rows fall into each 'Sex' category.
sns.countplot(data=train, x='Sex')
plt.show()

In [9]:
# Interactive horizontal box plot of 'Age', one box per 'Sex' category.
px.box(train, x='Age', y='Sex')

In [10]:
# Scatter of 'Weight' against 'Diameter'; the column names are resolved via `data=train`.
plt.scatter('Diameter', 'Weight', data=train, marker='+', c='g')
plt.show()

In [11]:
# Pairwise correlations between the numeric columns.
# 'Sex' has not been label-encoded yet at this point in the notebook, and
# pandas >= 2.0 raises on non-numeric columns in `DataFrame.corr()`, so the
# frame is restricted to numeric dtypes first.
corr = train.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True)
plt.show()

Most of the features are linearly correlated with one another.

### Feature engineering

In [12]:
# Add the derived feature columns to the training frame.
# NOTE(review): 'Weight__sqrt' and 'Length_sqrt' hold the *square* of their
# source column, and 'Lenght/Weight' is misspelled. The names are kept as-is
# because the test-set cell must produce identically named columns.
train = train.assign(**{
    'Lenght/Weight': train['Length'] / train['Weight'],
    'Weight__sqrt': train['Weight'] ** 2,
    'Height/Weight': train['Height'] / train['Weight'],
    'Diameter_red': (train['Diameter'] * 0.5) * 5,
    'Weight_sum': np.sum(
        train[['Shucked Weight', 'Viscera Weight', 'Shell Weight']], axis=1
    ),
    'Length_sqrt': train['Length'] ** 2,
})

In [13]:
# Check the first five rows now that the derived columns exist.
train.head(n=5)

In [14]:
# Learn the integer code for each 'Sex' label from the training data, then
# replace the column with those codes. `encoder` is kept for the test set.
encoder = LabelEncoder().fit(train['Sex'])
train['Sex'] = encoder.transform(train['Sex'])

In [15]:
# List the column names, including the engineered features.
train.columns

In [16]:
# Summary statistics again, now covering the engineered and encoded columns.
train.describe(include='all').transpose()

In [17]:
# Sample variance (ddof=1) of every column.
train.var(ddof=1)

In [18]:
# Target is 'Age'; every other column is a feature.
y = train['Age']
X = train.drop(columns=['Age'])

In [19]:
# Standardise every feature column; `scale_X` is a plain NumPy array here.
scaler = StandardScaler().fit(X)
scale_X = scaler.transform(X)

In [20]:
# Wrap the scaled array back into a DataFrame. Passing `index=X.index` keeps
# the original 'id' row labels so the scaled frame stays aligned with `X`/`y`
# instead of silently switching to a fresh 0..n-1 index.
scale_X = pd.DataFrame(scale_X, columns=X.columns, index=X.index)
scale_X

In [21]:
# Pearson correlation matrix of the standardised features.
scale_X.corr(method='pearson')

In [22]:
# Hold out 20% of the rows for evaluation. A fixed `random_state` makes the
# split (and every metric computed from it) reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [23]:
def get_mae(X_train, y_train, X_test, y_test, models):
    """Fit each model and report its error on the hold-out set.

    Parameters
    ----------
    X_train, y_train : training features and target passed to ``model.fit``.
    X_test, y_test : hold-out features and target used for scoring.
    models : dict mapping a display name to an estimator exposing
        ``fit`` and ``predict``. Each estimator is fitted in place.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model_name``, ``mae`` and ``mse``,
        in the iteration order of ``models``.
    """
    results = []
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)

        results.append([model_name, mae, mse])
    return pd.DataFrame(results, columns=['model_name', 'mae', 'mse'])

In [24]:
# Candidate regressors to compare. All of them involve randomness (subsampling,
# feature/tie selection), so each is seeded to keep the comparison reproducible.
models = {
    'gradBoost': GradientBoostingRegressor(random_state=42),
    'randomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'tree': DecisionTreeRegressor(random_state=42)
}

In [None]:
# Fit every candidate model and tabulate its hold-out MAE / MSE.
get_mae(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    models=models,
)

In [None]:
# Hyper-parameter grid for GradientBoostingRegressor.
# - The duplicated 100 in `n_estimators` was removed: it only repeated fits.
#   NOTE(review): if 900 or 1000 was intended, add it back explicitly.
# - `alpha` was removed: it is the quantile used by the 'huber' and 'quantile'
#   losses only, so with the default squared-error loss it changed nothing
#   while multiplying the number of fits by nine.
# NOTE(review): this is still a very large exhaustive grid; consider trimming
# the value lists before running it.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.4],  # The learning rate controls the contribution of each tree in the ensemble.
    'n_estimators': [100, 300, 500, 700],  # The number of boosting stages to perform.
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8],  # The minimum number of samples required to split an internal node.
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8],  # The minimum number of samples required to be at a leaf node.
    'max_depth': [2, 3, 4, 5, 6, 7, 8],  # The maximum depth of the individual regression estimators.
    'max_features': [2, 3, 4, 5, 6, 7, 8, 9, 10],  # The number of features to consider when looking for the best split.
}

In [None]:
# Exhaustive 5-fold search over `param_grid`, scored by negative MSE.
# The estimator is seeded so repeated runs pick the same winner, and
# `n_jobs=-1` spreads the independent fits over all available cores.
model = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

In [None]:
# Pull the winning parameter combination and its mean cross-validated score
# (negative MSE, so closer to zero is better) out of the fitted search.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

In [None]:
# Show the winning parameters and score. `best_model` is only created in a
# later cell, so referencing it here raised NameError on a fresh run.
best_params, best_score

In [None]:
# Take the refitted best estimator and score it on the hold-out split, so the
# predictions computed here are actually evaluated.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
pd.Series({
    'mae': mean_absolute_error(y_test, y_pred),
    'mse': mean_squared_error(y_test, y_pred),
    'r2': r2_score(y_test, y_pred),
})

In [None]:
# Load the test data and encode 'Sex' with the mapping learned on the training
# set. Re-fitting the encoder here could assign different integers to the same
# label; `transform` reuses the train mapping and fails loudly on unseen labels.
test = pd.read_csv('test.csv')
test['Sex'] = encoder.transform(test['Sex'])

In [None]:
# Add the same derived feature columns to the test frame, with the same names
# and formulas as the training-side cell so the model sees identical inputs.
# NOTE(review): the '*_sqrt' columns hold squares, not square roots.
test = test.assign(**{
    'Lenght/Weight': test['Length'] / test['Weight'],
    'Weight__sqrt': test['Weight'] ** 2,
    'Height/Weight': test['Height'] / test['Weight'],
    'Diameter_red': (test['Diameter'] * 0.5) * 5,
    'Weight_sum': np.sum(
        test[['Shucked Weight', 'Viscera Weight', 'Shell Weight']], axis=1
    ),
    'Length_sqrt': test['Length'] ** 2,
})

In [None]:
# Preview the first five rows of the prepared test data.
test.head(n=5)

In [None]:
# Predict on exactly the columns the model was trained on, in the same order.
# This also leaves out 'id' and guards against any column-order drift.
predictions = best_model.predict(test[X_train.columns])
# Round to the nearest whole number with one vectorised call.
predictions = np.round(predictions)

In [None]:
# Build the submission table in one step: predicted 'Age' indexed by 'id'.
# Using the 'id' Series directly avoids assigning a one-column DataFrame to a
# single column, which depends on pandas' implicit coercion.
submission = pd.DataFrame(
    {'id': test['id'], 'Age': predictions}
).set_index('id')

In [None]:
# Write the submission file; the 'id' index is written as the first column.
submission.to_csv('submission.csv', index=True)