# In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

def _mask_missing_species(column):
    """Return *column* with the literal string 'None' turned into NaN.

    Missing parent species can show up either as the string 'None' or as NaN,
    depending on how the CSV reader treats the token 'None' (see the pandas
    ``read_csv`` ``na_values`` documentation). Normalising to NaN lets both
    cases share one code path.
    """
    return column.mask(column.eq('None'))


def _most_frequent_species(column):
    """Return the most frequent non-missing value of *column*.

    Raises:
        ValueError: if *column* holds no non-missing value at all.
    """
    modes = _mask_missing_species(column).mode()  # mode() skips NaN by default
    if modes.empty:
        raise ValueError("'parentspecies' has no non-missing values to impute from")
    return modes.iloc[0]


def _impute_species(column, fill_value):
    """Return *column* with missing entries ('None' or NaN) set to *fill_value*."""
    return _mask_missing_species(column).fillna(fill_value)


# --- Training data -----------------------------------------------------------
train_data = pd.read_csv('train.csv')

# The model is trained on the log10 of the target.
train_data['pSat_Pa_log10'] = np.log10(train_data['pSat_Pa'])
y = train_data['pSat_Pa_log10']
X = train_data.drop(['pSat_Pa', 'pSat_Pa_log10', 'Id'], axis=1)

# The categorical feature(s); every other column is treated as numerical.
categorical_features = ['parentspecies']
numerical_features = [col for col in X.columns if col not in categorical_features]

# Choose the 70/30 row partition BEFORE fitting any preprocessing, so that the
# hold-out rows cannot influence the imputation value, the encoder categories
# or the scaler statistics (otherwise the R^2 below is optimistically biased).
# NOTE(review): the partition is expected to match the one obtained by
# splitting the feature matrix directly with the same random_state, since it
# is derived from the row count and the seed.
train_rows, holdout_rows = train_test_split(
    np.arange(len(X)), test_size=0.3, random_state=136
)

# Impute missing parent species with the most frequent value seen in the
# training rows. The same value is reused for every data set further down.
species_fill = _most_frequent_species(X['parentspecies'].iloc[train_rows])
X['parentspecies'] = _impute_species(X['parentspecies'], species_fill)

# One-hot encode the categorical feature. Categories that were not present
# when fitting are encoded as all zeros instead of raising at transform time.
one_hot = OneHotEncoder(handle_unknown='ignore')
one_hot.fit(X.iloc[train_rows][categorical_features])
X_encoded = one_hot.transform(X[categorical_features]).toarray()

# Scale the numerical features only (crucial for the SVR in particular),
# using statistics estimated from the training rows.
scaler = StandardScaler()
scaler.fit(X.iloc[train_rows][numerical_features])
X_scaled = X.copy()
X_scaled[numerical_features] = scaler.transform(X_scaled[numerical_features])

# Final layout: scaled numerical columns first, then the one-hot columns.
X_encoded = np.concatenate([X_scaled[numerical_features].values, X_encoded], axis=1)

X_train, X_test = X_encoded[train_rows], X_encoded[holdout_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[holdout_rows]

# --- Model -------------------------------------------------------------------
# SVR trained on the 70% training partition.
model = SVR(kernel='rbf', C=0.3, gamma=0.1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# --- Predictions for the unlabeled test file ---------------------------------
test_data = pd.read_csv('test.csv')
X_test_data = test_data.drop(['Id'], axis=1)

# Apply exactly the preprocessing fitted above: the training imputation value,
# the fitted encoder and the fitted scaler.
X_test_data['parentspecies'] = _impute_species(X_test_data['parentspecies'], species_fill)
X_test_encoded = one_hot.transform(X_test_data[categorical_features]).toarray()
X_test_scaled = X_test_data.copy()
X_test_scaled[numerical_features] = scaler.transform(X_test_scaled[numerical_features])
X_test_encoded = np.concatenate([X_test_scaled[numerical_features].values, X_test_encoded], axis=1)

y_test_pred = model.predict(X_test_encoded)

test_data['target'] = y_test_pred
test_data[['Id', 'target']].to_csv('predictions_svr_c_value.csv', index=False)


# --- Hold-out evaluation -----------------------------------------------------
# The R^2 score on the 30% hold-out partition (log10 scale).
r2 = r2_score(y_test, y_pred)
print(f'R^2 Score: {r2}')