In [40]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [13]:
# Load data csv format
dataset = pd.read_csv(r'C:\Users\Hp\Desktop\Chatbot\data\data_Labeled.csv', sep=';', header=0) 
#print(dataset)

In [33]:
# Labels are the values we want to predict
labels = dataset["label"].fillna(' ')
features = dataset["clean"].fillna(' ')

In [23]:
# One-hot encode the data using pandas get_dummies
labels = pd.get_dummies(labels)

In [34]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3, random_state = 42)

In [35]:
#    ______ Building model "Random Forest Model" ______

# Pipeline for transforming data, fitting to model and predicting
model_pipeline = Pipeline(steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()), 
        ('clf', RandomForestClassifier())])
# Instantiate model with 100 decision trees  
parameters = {
        'clf__min_samples_split': [10],
        'clf__n_estimators':      [100]
       #'clf__max_leaf_nodes':    [25],
       #'clf__max_depth':         [20],
       #'clf__min_samples_leaf':  [5]
}

#GridSearch with the above parameters
model = GridSearchCV(
        model_pipeline, param_grid=parameters, scoring='accuracy', verbose=2
   )

In [36]:
# Training model "Random Forest Model"
model.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] clf__min_samples_split=10, clf__n_estimators=100 ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] . clf__min_samples_split=10, clf__n_estimators=100, total=   4.9s
[CV] clf__min_samples_split=10, clf__n_estimators=100 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.8s remaining:    0.0s


[CV] . clf__min_samples_split=10, clf__n_estimators=100, total=   4.5s
[CV] clf__min_samples_split=10, clf__n_estimators=100 ................
[CV] . clf__min_samples_split=10, clf__n_estimators=100, total=   4.5s
[CV] clf__min_samples_split=10, clf__n_estimators=100 ................
[CV] . clf__min_samples_split=10, clf__n_estimators=100, total=   5.0s
[CV] clf__min_samples_split=10, clf__n_estimators=100 ................
[CV] . clf__min_samples_split=10, clf__n_estimators=100, total=   4.5s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   23.3s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        p

In [37]:
#    ______ Evaluating model ______

y_pred = model.predict(X_test)

print("Test accuracy:\n",accuracy_score(y_test, y_pred)*100)

Test accuracy:
 45.2


In [41]:
#    ______ Saving model ______

model_filepath = 'C:/Users/Hp/Desktop/Precious_Model/Random_Forest_Model.pkl'
def save_model(model: GridSearchCV, model_filepath: str) -> None:
    """
    save the model to the designated model_filepath
    :param model: GridSearchCV model to be saved
    :param model_filepath: the file path for the saved model
    :return: None
    """

    with open(model_filepath, 'wb') as file:
        pickle.dump(model, file)

print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)
print('Trained model saved!')

Saving model...
    MODEL: C:/Users/Hp/Desktop/Precious_Model/Random_Forest_Model.pkl
Trained model saved!


In [42]:
#Comparing the Real Values with Predicted Values

df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred})
df

Unnamed: 0,Real Values,Predicted Values
4122,java,java
4065,java,java
1731,cplusplus,cplusplus
4740,javascript,java
6391,php,php
...,...,...
5580,matlab,matlab
1074,system,system
3063,html,javascript
4554,javascript,no_meaning
