In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import time
from sklearn.feature_extraction.text import TfidfVectorizer

# Load your DataFrame
df = pd.read_csv('PreprocessedDataset.csv')  

# Drop rows with missing labels in 'Party' and missing text in 'preprocessed_tweet', then reset index
df = df.dropna(subset=['Party', 'preprocessed_tweet']).reset_index(drop=True)

# Check the shape of the DataFrame after dropping NaN values
print(f"Number of rows in DataFrame after dropping NaNs: {len(df)}")

# Get the labels and ensure they are in the correct format
y = df['Party'].values  # Extract labels as a numpy array

# Convert the 'preprocessed_tweet' column to strings, if not already
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda x: str(x))

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), lowercase=True)  # Set max features to 1000
X = tfidf_vectorizer.fit_transform(df['preprocessed_tweet']).toarray()  # Use the 'preprocessed_tweet' column for TF-IDF

# Check the shapes
print("Shape of X:", X.shape)
print("Number of labels (y):", len(y))

# Encode labels to numeric values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Check the shape of the labels after encoding
print("Shape of y_encoded:", y_encoded.shape)

# Ensure that the number of samples match before splitting
if X.shape[0] != len(y_encoded):
    print("Error: Inconsistent number of samples.")
else:
    # Split data into training and testing sets (80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # Initialize the Logistic Regression classifier
    logistic_classifier = LogisticRegression(max_iter=3000)

    # Start timing the training process
    start_time = time.time()

    print("Starting Logistic Regression training...")
    logistic_classifier.fit(X_train, y_train)
    print("Logistic Regression training completed.")

    # End timing
    end_time = time.time()

    # Calculate training time
    training_time = end_time - start_time
    print(f"Training time: {training_time:.2f} seconds")

    # Make predictions on the test set
    y_pred = logistic_classifier.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Print detailed classification report
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    
    # Compute and print the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)


Number of rows in DataFrame after dropping NaNs: 44432
Shape of X: (44432, 5000)
Number of labels (y): 44432
Shape of y_encoded: (44432,)
Starting Logistic Regression training...
Logistic Regression training completed.
Training time: 1.35 seconds
Accuracy: 96.25%
              precision    recall  f1-score   support

    Democrat       0.97      0.96      0.96      4485
  Republican       0.96      0.97      0.96      4402

    accuracy                           0.96      8887
   macro avg       0.96      0.96      0.96      8887
weighted avg       0.96      0.96      0.96      8887

Confusion Matrix:
[[4286  199]
 [ 134 4268]]


In [8]:
import joblib

# Save the trained model and the TF-IDF vectorizer
joblib.dump(logistic_classifier, 'logistic_model.pkl')
joblib.dump(tfidf_vectorizer, 'logistictfidf_vectorizer.pkl')
joblib.dump(label_encoder, 'logisticlabel_encoder.pkl')

print("Model, TF-IDF vectorizer, and label encoder saved successfully.")


Model, TF-IDF vectorizer, and label encoder saved successfully.


In [9]:
# Load the trained model and TF-IDF vectorizer
import joblib
logistic_classifier = joblib.load('logistic_model.pkl')
tfidf_vectorizer = joblib.load('logistictfidf_vectorizer.pkl')
label_encoder = joblib.load('logisticlabel_encoder.pkl')

# Example function for manual prediction
def predict_party(tweet):
    # Preprocess the tweet text using the loaded TF-IDF vectorizer
    tweet_transformed = tfidf_vectorizer.transform([tweet]).toarray()

    # Predict using the loaded model
    prediction_encoded = logistic_classifier.predict(tweet_transformed)

    # Decode the prediction back to the original label
    prediction = label_encoder.inverse_transform(prediction_encoded)

    return prediction[0]

# Test the function
sample_tweet = "I believe in the importance of limited government, personal responsibility, and a strong national defense."
predicted_party = predict_party(sample_tweet)
print(f"Predicted Party: {predicted_party}")


Predicted Party: Republican
