# Random Forest

In [1]:
from sklearn.ensemble import RandomForestClassifier
#from imblearn.over_sampling import SMOTE
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [2]:
df1=pd.read_csv('df4.csv', encoding='ISO-8859-1')
df1.head()

Unnamed: 0,cleaned_text,sentiment
0,what said,neutral
1,plus youve added commercials to the experience...,positive
2,i didnt today must mean i need to take another...,neutral
3,its really aggressive to blast obnoxious enter...,
4,and its a really big bad thing about it,negative


## turn unlabled to -1 *rules of selfTrainClassifier

In [3]:
X = df1["cleaned_text"]
y = df1["sentiment"]

y[y.isna()] = -1
print(y.isna().sum())

0


In [4]:
y = y.map({
    -1:-1,
    "negative":0,
    "neutral": 1,
    "positive": 2
})

In [5]:
X_labeled = X[y != -1]
y_labeled = y[y != -1]

X_unlabeled = X[y == -1]
y_unlabeled = y[y == -1]

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X_labeled,y_labeled,test_size=0.2)
X_train = pd.concat([X_train,X_unlabeled])
y_train = pd.concat([y_train,y_unlabeled])

## Applying Random forest

In [7]:
model7 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("model", SelfTrainingClassifier(RandomForestClassifier(n_estimators=100)))
])

In [8]:
# Train the model
model7.fit(X_train, y_train)

In [9]:
# Make predictions on the test data
y_pred = model7.predict(X_test)

In [10]:
# Compute F1 score
f1 = f1_score(y_test, y_pred, average='weighted')  # You can change the average parameter based on your needs
print("F1 Score is : ", f1)

F1 Score is :  0.6768654681641824


In [11]:
# Print the accuracy score
print("Accuracy Score is : ", accuracy_score(y_test, y_pred))

Accuracy Score is :  0.7256687535571997


In [12]:
# Print the classification report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.97      0.83      1139
           1       0.65      0.21      0.32       369
           2       0.80      0.36      0.50       249

    accuracy                           0.73      1757
   macro avg       0.73      0.52      0.55      1757
weighted avg       0.72      0.73      0.68      1757



## Applying Random forest using Over-sampling with SMOTE

In [13]:
model7 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("smote", SMOTE()),  # Over-sampling using SMOTE
    ("model", SelfTrainingClassifier(RandomForestClassifier(n_estimators=100)))
])

In [14]:
# Train the model
model7.fit(X_train, y_train)

In [15]:
# Make predictions on the test data
y_pred = model7.predict(X_test)

In [16]:
# Compute F1 score
f1 = f1_score(y_test, y_pred, average='weighted')  # You can change the average parameter based on your needs
print("F1 Score is : ", f1)

F1 Score is :  0.7150548233867742


In [17]:
# Print the accuracy score
print("Accuracy Score is : ", accuracy_score(y_test, y_pred))

Accuracy Score is :  0.742743312464428


In [18]:
# Print the classification report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.94      0.84      1139
           1       0.64      0.37      0.47       369
           2       0.74      0.39      0.51       249

    accuracy                           0.74      1757
   macro avg       0.71      0.57      0.61      1757
weighted avg       0.73      0.74      0.72      1757

