In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
file_path = 'Dataset.csv'
output_path = 'PreprocessedDataset.csv'
df = pd.read_csv(output_path)

# Ensure 'preprocessed_tweet' has no NaN values
df = df.dropna(subset=['preprocessed_tweet']).reset_index(drop=True)

# Convert the 'preprocessed_tweet' column to strings (if they are not already)
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2), lowercase=True)

# Fit and transform the text data
X = tfidf.fit_transform(df['preprocessed_tweet']).toarray()

# Check the shape
print("Shape of X:", X.shape)



Shape of X: (44432, 1000)


In [2]:
print(df['Party'].value_counts())

Party
Republican    22302
Democrat      22130
Name: count, dtype: int64


In [3]:
import numpy as np

# Save the TF-IDF matrix
np.save('tfidf_vectors.npy', X)

# Save the feature names (terms)
feature_names = tfidf.get_feature_names_out()  # Get the list of feature names (terms)
np.save('tfidf_feature_names.npy', feature_names)

# Later, when needed, load them:
X_loaded = np.load('tfidf_vectors.npy')  # Load TF-IDF matrix

# Load feature names, allowing pickle since it's an object array
feature_names_loaded = np.load('tfidf_feature_names.npy', allow_pickle=True)

# Example: Print terms and their TF-IDF values for the first tweet
for i, value in enumerate(X_loaded[0]):
    if value != 0:
        print(f"Term: {feature_names_loaded[i]}, TF-IDF Value: {value}")


Term: act, TF-IDF Value: 0.8561992486933349
Term: amendment, TF-IDF Value: 0.5166457650430989


In [4]:
print(X_loaded.shape)
print(X_loaded[0])
print(np.count_nonzero(X_loaded[0]))

(44432, 1000)
[0.         0.         0.         0.         0.         0.
 0.         0.85619925 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.51664577 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.      

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import time
from sklearn.feature_extraction.text import TfidfVectorizer

# Load your DataFrame
df = pd.read_csv(output_path)  # Load your actual dataset

# Drop rows with missing labels in 'Party' and missing text in 'preprocessed_tweet', then reset index
df = df.dropna(subset=['Party', 'preprocessed_tweet']).reset_index(drop=True)

# Check the shape of the DataFrame after dropping NaN values
print(f"Number of rows in DataFrame after dropping NaNs: {len(df)}")

# Get the labels and ensure they are in the correct format
y = df['Party'].values  # Extract labels as a numpy array

# Convert the 'preprocessed_tweet' column to strings, if not already
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda x: str(x))

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2), lowercase=True)  # Set max features to 1000
X = tfidf_vectorizer.fit_transform(df['preprocessed_tweet']).toarray()  # Use the 'preprocessed_tweet' column for TF-IDF

# Check the shapes
print("Shape of X:", X.shape)
print("Number of labels (y):", len(y))

# Encode labels to numeric values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Check the shape of the labels after encoding
print("Shape of y_encoded:", y_encoded.shape)

# Ensure that the number of samples match before splitting
if X.shape[0] != len(y_encoded):
    print("Error: Inconsistent number of samples.")
else:
    # Split data into training and testing sets (80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # Initialize the SVM classifier
    svm_classifier = SVC(kernel='rbf', C=1.0, gamma='scale')

    # Start timing the training process
    start_time = time.time()

    print("Starting SVM training...")
    svm_classifier.fit(X_train, y_train)
    print("SVM training completed.")

    # End timing
    end_time = time.time()

    # Calculate training time
    training_time = end_time - start_time
    print(f"Training time: {training_time:.2f} seconds")

    # Make predictions on the test set
    y_pred = svm_classifier.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Print detailed classification report
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    
    # Compute and print the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)


Number of rows in DataFrame after dropping NaNs: 44432
Shape of X: (44432, 1000)
Number of labels (y): 44432
Shape of y_encoded: (44432,)
Starting SVM training...
SVM training completed.
Training time: 69.23 seconds
Accuracy: 96.17%
              precision    recall  f1-score   support

    Democrat       0.97      0.96      0.96      4485
  Republican       0.96      0.97      0.96      4402

    accuracy                           0.96      8887
   macro avg       0.96      0.96      0.96      8887
weighted avg       0.96      0.96      0.96      8887

Confusion Matrix:
[[4291  194]
 [ 146 4256]]


In [6]:
import joblib

# Filenames for saving each component
model_filename = 'svm_classifier_model.joblib'
label_encoder_filename = 'svmlabel_encoder.joblib'
tfidf_filename = 'svmtfidf_vectorizer.joblib'

# Save the trained SVM model
joblib.dump(svm_classifier, model_filename)

# Save the label encoder
joblib.dump(label_encoder, label_encoder_filename)

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, tfidf_filename)

print("Model, label encoder, and TF-IDF vectorizer saved successfully.")


Model, label encoder, and TF-IDF vectorizer saved successfully.


In [7]:
import joblib
# Load the SVM model
loaded_svm_classifier = joblib.load('svm_classifier_model.joblib')

# Load the label encoder
loaded_label_encoder = joblib.load('svmlabel_encoder.joblib')

# Load the TF-IDF vectorizer
loaded_tfidf_vectorizer = joblib.load('svmtfidf_vectorizer.joblib')

# Example new tweet for prediction
new_tweet = ["I believe in the importance of limited government, personal responsibility, and a strong national defense."]

# Transform the tweet using the loaded TF-IDF vectorizer
X_new = loaded_tfidf_vectorizer.transform(new_tweet).toarray()

# Predict the label
y_pred_encoded = loaded_svm_classifier.predict(X_new)

# Decode the label to get the original class name
y_pred_label = loaded_label_encoder.inverse_transform(y_pred_encoded)

# Display the predicted label
print("Predicted Party:", y_pred_label[0])


Predicted Party: Republican
