# Logistic regression

In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import joblib

In [2]:
df1=pd.read_csv('df4.csv', encoding='ISO-8859-1')
df1.head()

Unnamed: 0,cleaned_text,sentiment
0,what said,neutral
1,plus youve added commercials to the experience...,positive
2,i didnt today must mean i need to take another...,neutral
3,its really aggressive to blast obnoxious enter...,
4,and its a really big bad thing about it,negative


# turn unlabled to -1 *rules of selfTrainClassifier

In [3]:
X = df1["cleaned_text"]
y = df1["sentiment"]

y[y.isna()] = -1
print(y.isna().sum())


0


In [4]:
y = y.map({
    -1:-1,
    "negative":0,
    "neutral": 1,
    "positive": 2
})

In [5]:
X_labeled = X[y != -1]
y_labeled = y[y != -1]

X_unlabeled = X[y == -1]
y_unlabeled = y[y == -1]

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X_labeled,y_labeled,test_size=0.2)
X_train = pd.concat([X_train,X_unlabeled])
y_train = pd.concat([y_train,y_unlabeled])

# Applying logistic regression

In [7]:
model1 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("model", SelfTrainingClassifier(LogisticRegression(max_iter=200)))
])

In [8]:
model1.fit(X_train, y_train)

In [9]:
y_pred = model1.predict(X_test)

In [10]:
print("Accuracy of logistic regression is :", accuracy_score(y_test, y_pred))

Accuracy of logistic regression is : 0.7865680136596471


In [11]:
f1 = f1_score(y_test, y_pred, average='weighted')

In [12]:
print("F1 Score of logistic regression is :", f1)

F1 Score of logistic regression is : 0.7666861673761528


In [13]:
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.97      0.86      1088
           1       0.76      0.42      0.54       369
           2       0.88      0.58      0.70       300

    accuracy                           0.79      1757
   macro avg       0.81      0.66      0.70      1757
weighted avg       0.79      0.79      0.77      1757



# Applying logistic regression using Over-sampling with SMOTE

In [14]:
model1 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),  # Text vectorization using TF-IDF
    ("smote", SMOTE()),  # Over-sampling using SMOTE
    ("model", SelfTrainingClassifier(LogisticRegression(max_iter=200)))  # Self-Training Classifier with Logistic Regression
])

In [15]:
# Train the model on the labeled training data
model1.fit(X_train, y_train)

In [16]:
# Make predictions on the test set
y_pred = model1.predict(X_test)

In [17]:
# Compute and print the F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score is : ", f1)

F1 Score is :  0.7986946998650918


In [18]:
# Evaluate the model's performance
print("Accuracy is : ", accuracy_score(y_test, y_pred))

Accuracy is :  0.7973819009675583


In [19]:
# Generate and print the classification report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87      1088
           1       0.60      0.64      0.62       369
           2       0.79      0.73      0.76       300

    accuracy                           0.80      1757
   macro avg       0.75      0.75      0.75      1757
weighted avg       0.80      0.80      0.80      1757



In [20]:
joblib.dump(model1, 'BestModel.joblib')

['BestModel.joblib']