In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

In [2]:
# Read the train and test CSV files from the current working directory.
original_df, main_test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [3]:
# Inclusive upper bound per column; a row is kept only if it satisfies all of them.
outlier_upper_bounds = {
    'Height': 0.3,
    'Whole weight': 2.25,
    'Whole weight.1': 1,
    'Whole weight.2': 0.5,
    'Shell weight': 0.6,
}
within_bounds = (
    original_df[list(outlier_upper_bounds)]
    .le(pd.Series(outlier_upper_bounds), axis='columns')
    .all(axis=1)
)
no_outlier_df = original_df[within_bounds]

In [4]:
def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` that yields 0 where the denominator is 0.

    Covers both x/0 (+/-inf) and 0/0 (NaN). NaN values that come from missing
    inputs with a non-zero denominator are left untouched.
    """
    ratio = numerator / denominator
    return ratio.mask(denominator == 0, 0.0)


def generate_features(df, length_mean=None):
    """Add engineered feature columns to ``df`` and return it.

    The frame is modified in place (new columns are assigned on ``df``), so
    pass a copy if the original must stay untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Sex', 'Length', 'Diameter', 'Height',
        'Whole weight', 'Whole weight.1', 'Whole weight.2' and 'Shell weight'.
    length_mean : float, optional
        Value used to normalise 'Length' into 'CV Length'. Defaults to the
        mean of ``df['Length']``. Pass the training-set mean when transforming
        a different frame (e.g. the test set) so both are normalised by the
        same constant.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the additional feature columns.
    """
    weight_cols = ['Whole weight', 'Whole weight.1', 'Whole weight.2']
    if length_mean is None:
        length_mean = df['Length'].mean()

    df['Min Whole Weight'] = df[weight_cols].min(axis=1)
    df['Max Whole Weight'] = df[weight_cols].max(axis=1)
    df['CV Length'] = df['Length'] / length_mean

    # Every ratio goes through _safe_ratio so a zero denominator produces 0
    # instead of inf/NaN, which downstream scaling and fitting cannot handle.
    df['Aspect Ratio'] = _safe_ratio(df['Length'], df['Diameter'])
    df['Shell Weight Ratio'] = _safe_ratio(df['Shell weight'], df['Whole weight'])
    # Weight divided by the bounding-box volume (Length x Diameter x Height).
    df['Density'] = _safe_ratio(
        df['Whole weight'], df['Length'] * df['Diameter'] * df['Height']
    )
    df['Volume'] = np.pi * (df['Diameter'] / 2) ** 2 * df['Height']  # Assuming cylinder
    df['Length to Height Ratio'] = _safe_ratio(df['Length'], df['Height'])

    df['Weight Diff Max-Min'] = df['Max Whole Weight'] - df['Min Whole Weight']
    df['Avg Whole Weight'] = (df['Whole weight'] + df['Whole weight.1'] + df['Whole weight.2']) / 3
    weight_std = df[weight_cols].std(axis=1)
    df['Std Weight Measures'] = weight_std
    df['CV Whole Weight'] = np.where(weight_std == 0, 0, weight_std / df['Avg Whole Weight'])

    # One-hot encode 'Sex'; any value other than M/F/I gets 0 in all three columns.
    df['Is Male'] = (df['Sex'] == "M").astype(int)
    df['Is Female'] = (df['Sex'] == "F").astype(int)
    df['Is Infant'] = (df['Sex'] == "I").astype(int)

    df['Length x Diameter'] = df['Length'] * df['Diameter']
    df['Length x Height'] = df['Length'] * df['Height']
    df['Diameter x Height'] = df['Diameter'] * df['Height']
    df['Length x Shell Weight'] = df['Length'] * df['Shell weight']

    return df

In [5]:
# Engineer features on copies so the filtered training frame and the raw test
# frame are left unmodified.
# NOTE(review): 'CV Length' is normalised by each frame's own Length mean, so
# train and test end up on slightly different scales -- confirm this is intended.
df, test_df = (
    generate_features(frame.copy())
    for frame in (no_outlier_df, main_test_df)
)

In [6]:
# Columns that get min-max scaled. The order is significant: it determines the
# column order of the model's design matrix later on.
numerical_features = [
    # raw measurements
    'Length', 'Diameter', 'Height',
    'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight',
    # engineered features
    'Min Whole Weight', 'Max Whole Weight', 'CV Length', 'Aspect Ratio',
    'Shell Weight Ratio', 'Density', 'Volume', 'Length to Height Ratio',
    'Weight Diff Max-Min', 'Avg Whole Weight', 'Std Weight Measures',
    'CV Whole Weight',
    # pairwise products
    'Length x Diameter', 'Length x Height', 'Diameter x Height',
    'Length x Shell Weight',
]

# Learn per-column min/max from the training frame only, then apply the same
# mapping to both frames.
# NOTE(review): the scaler is fit before the train/validation split, so the
# validation rows contribute to the fitted ranges.
scaler = MinMaxScaler()
scaler.fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])
test_df[numerical_features] = scaler.transform(test_df[numerical_features])

In [7]:
# Model inputs: the three sex indicator columns followed by the scaled numeric columns.
features = ['Is Male', 'Is Female', 'Is Infant', *numerical_features]
target = ['Rings']

# Hold out 15% of the rows for validation (seeded for repeatability).
train_df, valid_df = train_test_split(df, test_size=0.15, random_state=42)

In [8]:
# Slice the feature columns out of each frame; targets exist only for train/valid.
X_train, X_valid, X_test = (
    frame[features] for frame in (train_df, valid_df, test_df)
)
y_train, y_valid = train_df[target], valid_df[target]

In [9]:
# Fit the random forest on the training split and score it on the validation split.
# n_jobs=-1 builds the 600 trees on all available cores; the trees themselves
# are still determined by random_state.
rf_regressor = RandomForestRegressor(n_estimators=600,
                                     max_depth=20,
                                     min_samples_split=10,
                                     min_samples_leaf=10,
                                     n_jobs=-1,
                                     random_state=42)
# y_train is a single-column frame; ravel() turns it into the 1-D target fit() expects.
rf_regressor.fit(X_train, y_train.values.ravel())
y_pred = rf_regressor.predict(X_valid)

# RMSLE = square root of the mean squared logarithmic error.
rmsle = mean_squared_log_error(y_valid, y_pred) ** 0.5

print("Root Mean Squared Logarithmic Error:", rmsle)

KeyboardInterrupt: 

In [None]:
# Rank features by the fitted forest's importance scores, most important first.
# The raw importance values are kept (not their argsort) so the bar heights are
# the real scores and each label lines up with its own feature.
feature_importance = rf_regressor.feature_importances_
sorted_indices = np.argsort(feature_importance)[::-1]
sorted_features = [features[i] for i in sorted_indices]
sorted_importances = feature_importance[sorted_indices]

plt.figure(figsize=(12, 6))
plt.bar(range(len(sorted_importances)), sorted_importances, tick_label=sorted_features, color = 'slategray', width=0.5)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Feature Importances')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


# Predict on the test features and round every prediction up to a whole number.
# NOTE(review): np.ceil always rounds upward -- confirm this bias is intended
# rather than rounding to the nearest integer.
y_test = np.ceil(rf_regressor.predict(X_test))

# Write the id / predicted Rings pairs to the submission file.
submission = pd.DataFrame({'id': test_df['id'], 'Rings': y_test})
submission.to_csv('submission.csv', index=False)