Continuation of first notebook to improve accuracy of the logistic regression model

In [60]:
import pandas as pd
import numpy as np
import re
import emoji
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk

In [49]:
df = pd.read_csv('Tweets.csv', encoding='latin1')

In [50]:
df.drop_duplicates(inplace=True)
df = df.set_axis(['target','id','date','flag','user','text'], axis='columns')
df['text'] = df['text'].str.lower()

In [51]:
# Define a function to remove mentions
def remove_mentions(text):
    return re.sub(r'@\w+', '', text)

df['text'] = df['text'].apply(remove_mentions)

def clean_text_v2(text):
    text = re.sub(r"http\S+|www\.\S+", "", text)  # Remove URLs
    text = re.sub(r"\w+@\w+\.com", "", text)     # Remove emails
     # Normalize repeated punctuation (! and ?)
    text = re.sub(r"!{2,}", "!", text)  # Replace multiple exclamation marks with one
    text = re.sub(r"\?{2,}", "?",text)  # Replace multiple question marks with one
    text = re.sub(r"[.,;:\"'`]", "", text)     # Remove punctuation  but keep ! and ?
    text = re.sub(r"[@\$%^&*\(\)\\/\+-_=\[\]\{\}<>]", "", text)  # Remove special chars

    text = emoji.demojize(text, delimiters=(" ", " "))
    return text.strip()
df['text'] = df['text'].apply(clean_text_v2)

In [52]:
#80/20 training split
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['target'])

In [53]:
tfidf_1 = TfidfVectorizer(max_features=20000, ngram_range = (1,2)).fit(x_train)

In [54]:
x_train_vectorized = tfidf_1.transform(x_train)
x_test_vectorized = tfidf_1.transform(x_test)

In [55]:
model = LogisticRegression(solver='saga',max_iter=1000 )
model.fit(x_train_vectorized, y_train)
predictions = model.predict(x_test_vectorized)

print('Confusion Matrix: \n', confusion_matrix(y_test, predictions))

print('\nAccuracy: \n', round(accuracy_score(y_test, predictions) * 100, 2), '%')

Confusion Matrix: 
 [[159717  39998]
 [ 34980 165305]]

Accuracy: 
 81.26 %


In [57]:
df['text_length'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda x: len(x.split()))

In [63]:
#nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
df['sentiment_score'] = df['text'].apply(lambda x: sia.polarity_scores(x)['compound'])

In [64]:
df['exclamation_count'] = df['text'].apply(lambda x: x.count('!'))
df['question_count'] = df['text'].apply(lambda x: x.count('?'))

In [72]:
# Extract structured features for the training data
structured_features_train = df.loc[x_train.index, ['text_length', 'word_count', 'sentiment_score', 'exclamation_count', 'question_count']].values

# Extract structured features for the test data
structured_features_test = df.loc[x_test.index, ['text_length', 'word_count', 'sentiment_score', 'exclamation_count', 'question_count']].values

print("Shape of structured_features_train:", structured_features_train.shape)
print("Shape of structured_features_test:", structured_features_test.shape)

Shape of structured_features_train: (1199999, 5)
Shape of structured_features_test: (400000, 5)


In [83]:
from sklearn.preprocessing import StandardScaler

# Scale structured features
scaler = StandardScaler()
structured_features_train2 = scaler.fit_transform(structured_features_train)
structured_features_test2 = scaler.transform(structured_features_test)

In [85]:
from scipy.sparse import hstack

# Combine TF-IDF features with structured features
structured_features = df[['text_length', 'word_count', 'sentiment_score', 'exclamation_count', 'question_count']].values
x_train_combined = hstack([x_train_vectorized, structured_features_train2])
x_test_combined = hstack([x_test_vectorized, structured_features_test2])

# Train Logistic Regression
model3 = LogisticRegression(solver='saga', max_iter=1000)
model3.fit(x_train_combined, y_train)

# Predict and evaluate
prediction3 = model3.predict(x_test_combined)
accuracy = accuracy_score(y_test, prediction3)
print("Accuracy:", round(accuracy * 100, 2), "%")

Accuracy: 81.46 %


Too slow runtime for only a marginal increase in accuracy. Better to go with the first model