In [146]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
import itertools
import joblib 
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel

In [147]:
# Load the training data; the first CSV column is used as the row index.
df = pd.read_csv("train.csv", index_col=0)

In [148]:
# --- Data cleaning -----------------------------------------------------------
# Step 1: replace every missing value with the median of its column.
# The original inline notes record 0.9208 for median imputation versus 0.9153
# for mode imputation (presumably accuracy scores -- TODO confirm). Dropping
# incomplete rows and mean imputation were the other alternatives left
# commented out.
df_imputed = df.fillna(df.median())

# Step 2: flag rows to discard.
#   * negative pH readings
#   * values that looked like outliers when the plots were inspected by hand
#     (a z-score filter would be the automated alternative)
# A similar cut on "Luminescence Intensity (a.u.)" > 0.017 was left disabled.
rows_to_drop = (
    (df_imputed['pH'] < 0)
    | (df_imputed["Acoustic Firmness Index"] > 50)
    | (df_imputed["Odor index (a.u.)"] > 85)
)

# Step 3: remove the flagged rows by index label in a single pass.
df_cleaned = df_imputed.drop(index=df_imputed.index[rows_to_drop.to_numpy()])

# Display the cleaned frame as the cell output.
df_cleaned

Unnamed: 0,Acoustic Firmness Index,Atmospheric Pressure at Harvest (Pa),Bitterness Scale,Circumference (mm),Color Intensity (a.u.),Find Distance from Main Vulcano (km),Length (mm),Luminescence Intensity (a.u.),Magnetic orientation (degree),Odor index (a.u.),Seed Count,Skin Thickness (mm),Soil pH where Grown,Sugar Content (mg),Weight (mg),pH,Edible
0,28.7,98741.0,0.0,222.897985,70.28,103.145212,35.481123,0.002636,61.877407,2.700236,71.355713,2.5,8.14,13792.0,87411.1826,5.090000,1.0
1,19.4,96077.0,0.0,205.832386,66.48,102.005624,32.609013,0.015950,297.191998,9.618586,46.291493,2.0,7.63,12985.0,72608.1826,4.990000,1.0
2,22.1,109154.0,2.0,234.406576,69.92,37.847317,37.926057,0.000279,61.103057,7.970050,54.442385,1.5,8.03,10008.0,109375.1826,5.219156,0.0
3,28.0,105277.0,5.0,214.999623,64.05,100.118399,34.610539,0.002323,58.677047,68.116450,106.401880,4.5,7.43,11303.0,84958.7826,3.228828,0.0
4,22.6,103898.0,2.0,180.542626,53.63,63.590438,28.948902,0.008611,317.261190,7.953544,48.057754,5.0,6.71,11475.0,49717.1826,4.800000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1242,17.1,91630.0,4.0,232.216559,58.31,20.549281,37.306247,0.003299,159.307920,2.108987,46.818059,6.0,7.14,8689.0,104461.1826,3.563446,0.0
1243,16.8,99423.0,0.0,212.988102,61.08,40.910882,33.728356,0.003155,195.342578,25.475841,7.033815,3.5,6.67,12224.0,77045.1826,5.000000,1.0
1245,13.6,93981.0,0.0,181.435023,69.64,48.416290,29.071256,0.000355,118.170049,7.579661,7.756912,2.0,6.99,10423.0,51560.1826,4.880000,1.0
1246,26.7,98269.0,1.0,228.399719,76.16,46.333693,37.776037,0.002577,217.742170,14.873524,66.630218,2.0,6.95,6753.0,105907.1826,3.383773,0.0


In [149]:
# 'Edible' is the target; every other column is a feature.
y = df_cleaned['Edible']
X = df_cleaned.drop(columns='Edible')

# Hold out 30 % of the rows for testing, stratified on the target so both
# splits keep the same class proportions. The seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Standardise the features. The scaler learns mean and standard deviation from
# the training split only and reuses them on the test split, to avoid leakage.
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [150]:
# Baseline random forest with default hyperparameters and a fixed seed.
# It is trained on the unscaled features, since tree ensembles do not need
# standardised inputs.
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict on both splits.
y_pred_train = rfc.predict(X_train)
y_pred = rfc.predict(X_test)

# Score both splits; the gap between them indicates how much the forest overfits.
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy on training data: {accuracy_train:.2f}')
print(f'Accuracy on test data: {accuracy:.2f}')


Accuracy on training data: 1.00
Accuracy on test data: 0.92


In [151]:
# Importance scores of the fitted baseline forest, one per feature column.
feature_importances = rfc.feature_importances_

# Pair each score with its column name and rank from most to least important.
features_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
print("Feature importances:\n", features_df)


Feature importances:
                                  Feature  Importance
15                                    pH    0.234173
13                    Sugar Content (mg)    0.126118
2                       Bitterness Scale    0.117119
14                           Weight (mg)    0.080775
6                            Length (mm)    0.069948
7          Luminescence Intensity (a.u.)    0.068749
9                      Odor index (a.u.)    0.056343
3                     Circumference (mm)    0.047983
0                Acoustic Firmness Index    0.038648
8          Magnetic orientation (degree)    0.026284
1   Atmospheric Pressure at Harvest (Pa)    0.025742
4                 Color Intensity (a.u.)    0.025164
5   Find Distance from Main Vulcano (km)    0.024629
12                   Soil pH where Grown    0.023813
10                            Seed Count    0.018422
11                   Skin Thickness (mm)    0.016090


In [152]:
# How many of the top-ranked features to keep (adjust as needed).
n = 15

# features_df is ordered by descending importance, so its first n rows are the
# n most important features.
selected_features = features_df['Feature'].iloc[:n].tolist()

# Restrict both splits to the selected feature columns.
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

print("Selected features:", selected_features)

Selected features: ['pH', 'Sugar Content (mg)', 'Bitterness Scale', 'Weight (mg)', 'Length (mm)', 'Luminescence Intensity (a.u.)', 'Odor index (a.u.)', 'Circumference (mm)', 'Acoustic Firmness Index', 'Magnetic orientation (degree)', 'Atmospheric Pressure at Harvest (Pa)', 'Color Intensity (a.u.)', 'Find Distance from Main Vulcano (km)', 'Soil pH where Grown', 'Seed Count']


In [156]:
# Exhaustive search over a small random-forest hyperparameter grid.
# Alternative grids kept from the original notes (currently disabled):
#   n_estimators      : [100]
#   max_depth         : [8, 10, 12, 15, 20]
#   min_samples_split : [8, 10, 12, 15, 20]
#   random_states     : [10, 42, 50, 60, 100]
n_estimators_list = [100, 200]
max_depth_list = [5, 6, 7]
min_samples_split_list = [2, 3]
random_states = [42]
criterion_list = ["gini"]

# Best configuration seen so far and its accuracy.
best_accuracy = 0
best_config = {}

# NOTE(review): candidates are scored on X_test / y_test, the same split used
# to report test accuracy. The "validation" figure printed below is therefore
# an optimistic estimate; a separate validation split or cross-validation on
# X_train would give an unbiased one.
#
# itertools.product walks the combinations in the same order as the equivalent
# nested loops (the rightmost list varies fastest). Ties are still resolved in
# favour of the first configuration encountered.
for criterion, random_state, max_depth, min_samples_split, n_estimators in itertools.product(
    criterion_list,
    random_states,
    max_depth_list,
    min_samples_split_list,
    n_estimators_list,
):
    # Train one forest for this combination. `criterion` is passed through
    # explicitly so the value recorded in best_config is the one actually used.
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        criterion=criterion,
        random_state=random_state,
    )
    model.fit(X_train, y_train)

    # Score it on the held-out split.
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    # Keep it only if it is strictly better than the best so far.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_config = {
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'random_state': random_state,
            'criterion': criterion,
        }

# Print the best configuration and its accuracy
print("Best Configuration:")
print(best_config)
print(f"Best Validation Accuracy: {best_accuracy:.4f}")

Best Configuration:
{'n_estimators': 100, 'max_depth': 6, 'min_samples_split': 3, 'random_state': 42, 'criterion': 'gini'}
Best Validation Accuracy: 0.9187


In [157]:
# Final model: fixed hyperparameters, trained on ALL cleaned training rows
# (X, y) rather than only the training split. The tuned `best_config` from the
# grid search is deliberately not used here; swap in
# RandomForestClassifier(**best_config) to use it.
final_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=4, max_depth=100, criterion='gini')
final_model.fit(X, y)

# Load the unlabeled data to predict on; the first CSV column is the row index.
x_val = pd.read_csv("test.csv", index_col=0)

# Give the inference data the same preprocessing as the training data:
#   * fill missing values with the medians of the raw training frame `df`,
#     exactly as done when building df_cleaned (median entries for columns
#     absent from x_val, such as the target, are ignored by fillna);
#   * select the feature columns in the order the model was fitted on.
# Without this, any NaN in test.csv would reach the model without imputation.
x_val_prepared = x_val.fillna(df.median()).loc[:, X.columns]

predictions = final_model.predict(x_val_prepared).astype(int)


In [158]:
# Wrap the predicted labels in a one-column frame whose index is named 'index'.
# NOTE(review): the row labels are positional (0..n-1), not the index read from
# test.csv -- confirm that this matches the expected submission format.
predictions_df = pd.DataFrame({'Edible': predictions}).rename_axis('index')

# Write the predictions, including the index column, to disk.
predictions_df.to_csv('predictions.csv')