Importing libraries

In [31]:
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from DataProcessing import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix,  accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt 
import pickle
import seaborn as sns

Reading the data

In [2]:
# Load the manually classified training reviews and discard the 'id' column,
# which is not used anywhere else in this notebook.
df = pd.read_csv('Data/Training-Manually-Classified.csv').drop(columns=['id'])

Data Preprocessing

In [4]:
# Run the project's `preprocessing` helper over every review body and keep the
# result next to the raw text in a new 'processed' column.
df = df.assign(processed=df['body'].apply(preprocessing))

# Persist the preprocessed frame so it can be inspected outside the notebook.
df.to_csv('processed.csv')

Transforming the data

In [5]:
# Represent the processed reviews in a TF-IDF vector space model (VSM).
#   min_df=5          -> ignore terms that occur in fewer than 5 documents
#   stop_words        -> drop scikit-learn's built-in English stop words
#   sublinear_tf=True -> use 1 + log(tf) instead of the raw term frequency
tfidfvectorizer = TfidfVectorizer(stop_words="english", min_df=5, sublinear_tf=True)

# Fit on every review and densify the sparse result; in the visible notebook
# `features` is only used to inspect the (n_documents, n_terms) shape.
tfidf_matrix = tfidfvectorizer.fit_transform(df['processed'])
features = tfidf_matrix.toarray()
features.shape

(359, 220)

Splitting the data

In [6]:
# Separate the data into model input and target:
# the input is the processed review text, the target is the review category.
X = df['processed']
Y = df['category']

# Hold out 20% of the reviews for testing.
# A fixed random_state makes the split (and therefore every metric reported
# further down) reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

Building a pipeline

In [7]:
# Classification pipeline: TF-IDF vectorisation -> chi-squared feature scoring
# -> random forest.
#   * SelectKBest(chi2, k='all') keeps every feature, so the step currently
#     acts as a pass-through; lower `k` to actually prune features.
#   * random_state is fixed so the forest (bootstrap samples and feature
#     sub-sampling) is identical on every run and results can be reproduced.
pipeline = Pipeline([('vect', tfidfvectorizer),
                     ('chi',  SelectKBest(chi2, k='all')),
                     ('clf', RandomForestClassifier(random_state=42))])

Fitting the model and making predictions

In [8]:
# Train every pipeline step on the training reviews.
# Pipeline.fit returns the pipeline itself, so `model` is the fitted `pipeline`.
model = pipeline.fit(X=X_train, y=y_train)

# Predict a category for each held-out review.
prediction = model.predict(X_test)

In [9]:
# Attach each prediction to the review it was made for.
# `prediction` is ordered like X_test, whose index holds the original row
# labels of `df`. Aligning on that index puts every prediction on its own
# row; rows that were used for training get NaN. (Wrapping `prediction` in a
# fresh DataFrame would index it 0..n-1 and silently write the predictions
# onto the first n rows of `df` instead of the test rows.)
df['result'] = pd.Series(prediction, index=X_test.index)

# NOTE: the file name is spelled 'recults.csv' (sic); it is kept unchanged so
# that anything reading this output keeps working.
df.to_csv('recults.csv')

Evaluating the model

Confusion Matrix

In [None]:
# Use ONE explicit, sorted label list for both the matrix and the tick labels.
# confusion_matrix orders its rows/columns by sorted class label, whereas
# y_test.unique() returns labels in order of first appearance, so using the
# latter for the ticks would put the wrong class names on the axes.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(prediction)]))
CM = confusion_matrix(y_test, prediction, labels=class_labels)

# fmt='d' prints the cell counts as plain integers.
sns.heatmap(CM, annot=True, fmt='d', cmap='YlGnBu',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix', fontsize=15, pad=20)
plt.xlabel('Prediction', fontsize=11)
plt.ylabel('Actual', fontsize=11)

plt.show()


Performance Metrics

In [None]:
# zero_division=0 is passed to every precision/recall/F1 call so that a class
# with no predicted (or no true) samples scores 0 without emitting a warning.

# Accuracy: fraction of test reviews whose category was predicted correctly.
accuracy = accuracy_score(y_test, prediction)
print(f"Accuracy: {accuracy}")

# Precision per class (average=None returns one value for each class).
precision = precision_score(y_test, prediction, average=None, zero_division=0)
print(f"Precision: {precision}")

# Recall, averaged over classes weighted by each class's support.
recall = recall_score(y_test, prediction, average='weighted', zero_division=0)
print(f"Recall: {recall}")

# F1 score, averaged over classes weighted by each class's support.
f1 = f1_score(y_test, prediction, average='weighted', zero_division=0)
print(f"F1 Score: {f1}")

# Micro precision: computed globally over all individual predictions.
micro_precision = precision_score(y_test, prediction, average='micro', zero_division=0)
print(f"Micro Precision: {micro_precision}")

# Macro precision: unweighted mean of the per-class precisions.
# (This previously used average='micro' and just repeated the value above.)
macro_precision = precision_score(y_test, prediction, average='macro', zero_division=0)
print(f"Macro Precision: {macro_precision}")

Classification Report

In [None]:
# Build the sorted list of every class present in the truth or the predictions
# and pass it as both `labels` and `target_names`, so each report row carries
# the name of the class it was computed for. y_test.unique() is in order of
# first appearance (not sorted) and misses classes that were only predicted,
# which mislabels the rows or makes the label/name counts disagree.
report_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(prediction)]))
CR = classification_report(
    y_test,
    prediction,
    labels=report_labels,
    target_names=[str(label) for label in report_labels],
    zero_division=0,
)
print(CR)

Exporting the pipeline

In [28]:
# Serialise the fitted pipeline (vectoriser, feature selector and classifier)
# to disk so it can be reloaded later without retraining.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
with open('RandomForest.pickle', mode='wb') as pickle_file:
    pickle.dump(model, pickle_file)