In [1]:
# import libraries
# sklearn reference: https://scikit-learn.org/
# pandas reference: https://pandas.pydata.org/
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
import pandas as pd

In [2]:
# text and numeric classes that use sklearn base libaries
class TextTransformer(BaseEstimator, TransformerMixin):
    """
    Transform text features
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberTransformer(BaseEstimator, TransformerMixin):
    """
    Transform numeric features
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [3]:
# read in your dataframe
df = pd.read_csv(r"C:\Users\klaas.braga\Downloads\AIDI Georgian\AIDI1002 MLP\Final Project\nlp-example\data\DummyDataNLP.csv")

In [4]:
# take a look at the first 5 observations
df.head()

Unnamed: 0,Confirmed_Test,Confirmed_Recovery,Confirmed_New,Text_Feature,Text_Predictor
0,36.434962,82.983704,34.824456,Blue,Label_1
1,75.349163,81.735731,71.921676,Red,Label_2
2,76.678489,33.588094,14.124835,Yellow,Label_3
3,73.356221,79.441778,56.910324,Orange,Label_1
4,74.451946,59.228624,20.042376,Blue,Label_2


In [5]:
# use the term-frequency inverse document frequency vectorizer to transfrom count of text
# into a weighed matrix of term importance
vec_tdidf = TfidfVectorizer(ngram_range=(1,1), analyzer='word', norm='l2')

In [6]:
# compile both the TextTransformer and TfidfVectorizer 
# to the text 'Text_Feature' 
color_text = Pipeline([
                ('transformer', TextTransformer(key='Text_Feature')),
                ('vectorizer', vec_tdidf)
                ])

In [7]:
# compile the NumberTransformer to 'Confirmed_Test', 'Confirmed_Recovery', 
# and 'Confirmed_New' numeric features
test_numeric = Pipeline([
                ('transformer', NumberTransformer(key='Confirmed_Test')),
                ])
recovery_numeric = Pipeline([
                ('transformer', NumberTransformer(key='Confirmed_Recovery')),
                ])
new_numeric = Pipeline([
                ('transformer', NumberTransformer(key='Confirmed_New')),
                ])

In [8]:
# combine all of the features, text and numeric together
features = FeatureUnion([('Text_Feature', color_text),
                      ('Confirmed_Test', test_numeric),
                      ('Confirmed_Recovery', recovery_numeric),
                      ('Confirmed_New', new_numeric)
                      ])

In [9]:
# create the classfier from RF
clf = RandomForestClassifier()

In [10]:
# unite the features and classfier together
pipe = Pipeline([('features', features),
                 ('clf',clf)
                 ])

In [11]:
# transform the categorical predictor into numeric
predicted_dummies = pd.get_dummies(df['Text_Predictor'])

In [12]:
# split the data into train and test
# isolate the features from the predicted field
text_numeric_features = ['Text_Feature', 'Confirmed_Test', 'Confirmed_Recovery', 'Confirmed_New']
predictor = 'Text_Predictor'

X_train, X_test, y_train, y_test = train_test_split(df[text_numeric_features], df[predictor], 
                                                    test_size=0.25, random_state=42)

In [13]:
# fit the model
pipe.fit(X_train, y_train)

In [14]:
# predict from the test set
preds = pipe.predict(X_test)

In [15]:
# see how you did, since this is randomly generated data,
# I would say this accuracy is pretty good :D
print("Accuracy:",metrics.accuracy_score(y_test, preds))

Accuracy: 0.4473684210526316


# Contribution: Integrating a Multi-Layer Perceptron (MLP) model into the existing pipeline and compare it with the RandomForest model

In [16]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

# Combine all features using FeatureUnion
features = FeatureUnion([
    ('Text_Feature', color_text),
    ('Confirmed_Test', test_numeric),
    ('Confirmed_Recovery', recovery_numeric),
    ('Confirmed_New', new_numeric)
])

# Setup the classifiers
rf_clf = RandomForestClassifier()
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, activation='relu', solver='adam', random_state=42)

# Create pipelines for each classifier
pipeline_rf = Pipeline([
    ('features', features),
    ('clf', rf_clf)
])

pipeline_mlp = Pipeline([
    ('features', features),
    ('clf', mlp_clf)
])

# Prepare the data
X = df[['Text_Feature', 'Confirmed_Test', 'Confirmed_Recovery', 'Confirmed_New']]
y = pd.get_dummies(df['Text_Predictor'])  # Assuming the target needs to be transformed as such

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the RandomForest model
pipeline_rf.fit(X_train, y_train)
preds_rf = pipeline_rf.predict(X_test)
print("RandomForest Accuracy:", metrics.accuracy_score(y_test, preds_rf))

# Fit the MLP model
pipeline_mlp.fit(X_train, y_train)
preds_mlp = pipeline_mlp.predict(X_test)
print("MLP Accuracy:", metrics.accuracy_score(y_test, preds_mlp))


RandomForest Accuracy: 0.34210526315789475
MLP Accuracy: 0.10526315789473684




The results indicate that the RandomForest model significantly outperforms the MLP model in this setup, with accuracies of approximately 31.6% and 10.5%

Adjusting the MLP settings and including data scaling in the pipeline to achieve better performance:

In [17]:
from sklearn.preprocessing import StandardScaler

# Define a scaler for numeric features (it's efficient to define one scaler and use it for all numeric features)
scaler = StandardScaler()

# Modify the NumberTransformer to include scaling within its pipeline
class ScaledNumberTransformer(BaseEstimator, TransformerMixin):
    """
    Transform and scale numeric features
    """
    def __init__(self, key):
        self.key = key
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.scaler.fit(X[[self.key]])
        return self

    def transform(self, X):
        return self.scaler.transform(X[[self.key]])

# Create pipelines for each numeric feature using the revised transformer that includes scaling
test_numeric = Pipeline([
    ('transformer', ScaledNumberTransformer(key='Confirmed_Test'))
])
recovery_numeric = Pipeline([
    ('transformer', ScaledNumberTransformer(key='Confirmed_Recovery'))
])
new_numeric = Pipeline([
    ('transformer', ScaledNumberTransformer(key='Confirmed_New'))
])

# Combine all features using FeatureUnion
features = FeatureUnion([
    ('Text_Feature', color_text),  # Assuming color_text is defined correctly and does not need scaling
    ('Confirmed_Test', test_numeric),
    ('Confirmed_Recovery', recovery_numeric),
    ('Confirmed_New', new_numeric)
])

# Redefine the MLP with suitable parameters
mlp_clf = MLPClassifier(hidden_layer_sizes=(150, 100), max_iter=500, activation='tanh', solver='adam', random_state=42)

# Create the pipeline for MLP
pipeline_mlp = Pipeline([
    ('features', features),
    ('clf', mlp_clf)
])

# Prepare data and fit the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

pipeline_mlp.fit(X_train, y_train)
preds_mlp = pipeline_mlp.predict(X_test)
print("Revised MLP Accuracy:", metrics.accuracy_score(y_test, preds_mlp))


Revised MLP Accuracy: 0.39473684210526316




The revised MLP model has shown improvement with an accuracy of approximately 39.47%. That is still lower than the original achieved accuracy using only the RF Classifier. Now, let's apply Cross-Validation and Hyperparameter Tuning to the MLP:

In [20]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Define parameter grid
parameter_space = {
    'clf__hidden_layer_sizes': [(100,), (150, 100), (100, 100, 100)],
    'clf__activation': ['tanh', 'relu'],
    'clf__solver': ['sgd', 'adam'],
    'clf__alpha': [0.0001, 0.05],
    'clf__learning_rate': ['constant','adaptive'],
}

# Create GridSearchCV to find the best parameters
clf = GridSearchCV(pipeline_mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)

print("Best parameters found:\n", clf.best_params_)

# Evaluate the model with best parameters
best_mlp = clf.best_estimator_
scores = cross_val_score(best_mlp, X, y, cv=5)
print("Average accuracy from CV: ", scores.mean())


Best parameters found:
 {'clf__activation': 'relu', 'clf__alpha': 0.05, 'clf__hidden_layer_sizes': (150, 100), 'clf__learning_rate': 'constant', 'clf__solver': 'adam'}




Average accuracy from CV:  0.3




Since the hyperparameter tuning through Cross-Validation did not yield a better accuracy than before, we can set up a simple voting ensemble with RandomForest and MLP:

In [25]:
from sklearn.ensemble import VotingClassifier

# Define the voting classifier combining RandomForest and MLP
voting_clf = VotingClassifier(estimators=[
    ('rf', pipeline_rf),  # assuming pipeline_rf is your RandomForest pipeline
    ('mlp', best_mlp)],   # best_mlp from GridSearchCV
    voting='soft')

# Fit the ensemble model
voting_clf.fit(X_train, y_train)
preds_ensemble = voting_clf.predict(X_test)

# Evaluate the ensemble model
print("Ensemble Accuracy:", metrics.accuracy_score(y_test, preds_ensemble))


Ensemble Accuracy: 0.5


The ensemble model, combining both RandomForest and MLP classifiers, has shown an improvement in accuracy to about 50%. This is better than either of the individual MLP models that have been tested. This suggests that leveraging the strengths of both classifiers through ensemble methods can yield a more robust prediction system.

Now, let's apply Cross-Validation and Hyperparameter Tuning to the ensemble model:

In [28]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'weights': [[1, 1], [1.5, 1], [1, 1.5], [2, 1], [1, 2]],  # Varying weights between RandomForest and MLP
    'rf__clf__n_estimators': [100, 200],  # Varying the number of trees in RandomForest
    'mlp__clf__hidden_layer_sizes': [(100,), (150, 100)],  # Varying architecture of the MLP
    'mlp__clf__activation': ['tanh', 'relu']  # Different activation functions for MLP
}

# Note: Ensure the classifier names in the parameter grid match those in the voting classifier's setup.

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated accuracy:", grid_search.best_score_)

# Use the best estimator to make predictions
best_voting_clf = grid_search.best_estimator_
predictions = best_voting_clf.predict(X_test)

# Evaluate the best model
from sklearn.metrics import accuracy_score
print("Test Accuracy:", accuracy_score(y_test, predictions))


Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best parameters: {'mlp__clf__activation': 'relu', 'mlp__clf__hidden_layer_sizes': (150, 100), 'rf__clf__n_estimators': 200, 'weights': [1.5, 1]}
Best cross-validated accuracy: 0.48142292490118577
Test Accuracy: 0.5


The results from the hyperparameter tuning and cross-validation indicate that the ensemble model has improved significantly. The enhanced model achieved the best cross-validated accuracy of approximately 48.14% and a test accuracy of 50%. This shows a notable improvement over the individual performances of the RandomForest and MLP models and a similar one to the ensemble configuration.

The best configuration used relu activation for the MLP, a two-layer structure with 150 and 100 neurons, and 200 trees in the RandomForest classifier. The weights favoring RandomForest slightly more (1.5 vs. 1) indicate that the RandomForest is slightly more influential in this optimal setup.

# Conclusion

Although we were able to significantly enhance the accuracy achieved by the MLP model from around 10% to a final 50% tested accuracy in our final model, given that the initial model described in the article had an accuracy of approximately 44.73%, we can see that there has been some 5% improvement.

The enhancement in performance by adding the MLP and using an ensemble method is relatively small (about 5.27% improvement). This suggests that while the ensemble method has provided a better model, the improvement may not justify the increased complexity and computational cost, depending on the specific application and performance requirements.

In practical applications, it's essential to consider the trade-off between the complexity of the model and the performance gain. The more complex the model, the more resources and time are required for training and inference, which might not be ideal for all scenarios.