In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Read the preprocessed tweets and normalise the two columns the model uses:
#   * 'Party' is replaced by the integer codes produced by `label_encoder`
#     (the fitted encoder is kept so predictions can be decoded later);
#   * missing 'preprocessed_tweet' entries become empty strings so the
#     vectorizer never receives NaN.
label_encoder = LabelEncoder()
df = pd.read_csv('PreprocessedDataset.csv').assign(
    Party=lambda frame: label_encoder.fit_transform(frame['Party']),
    preprocessed_tweet=lambda frame: frame['preprocessed_tweet'].fillna(''),
)

# Features are the tweet text, the target is the encoded party
X, y = df['preprocessed_tweet'], df['Party']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features, capped at 5000 terms (tune max_features as needed).
# The vocabulary is learned from the training split only and then reused,
# unchanged, to transform the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

# Fit a 100-tree random forest on the training features (seeded for reproducibility)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_tfidf,
    y_train,
)

# Score the held-out split: compute every metric first, then print them together
y_pred = rf_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
# The report is restricted to the classes that actually occur in y_test
report = classification_report(y_test, y_pred, labels=np.unique(y_test))
cm = confusion_matrix(y_test, y_pred)

# Overall accuracy as a percentage
print(f"Accuracy: {accuracy*100:.2f}%")

# Per-class precision / recall / F1
print("Classification Report:")
print(report)

# Confusion matrix as returned by sklearn's confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


Accuracy: 96.39%
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      4452
           1       0.96      0.97      0.96      4437

    accuracy                           0.96      8889
   macro avg       0.96      0.96      0.96      8889
weighted avg       0.96      0.96      0.96      8889

Confusion Matrix:
[[4253  199]
 [ 122 4315]]


In [7]:
import joblib

# Persist the three fitted objects needed at inference time, each to its own file:
# the classifier, the TF-IDF vectorizer, and the label encoder.
for _artifact, _filename in (
    (rf_model, 'random_forest_model.joblib'),
    (vectorizer, 'RFvectorizer.joblib'),
    (label_encoder, 'RFlabel_encoder.joblib'),
):
    joblib.dump(_artifact, _filename)

print("Model, vectorizer, and label encoder saved successfully.")


Model, vectorizer, and label encoder saved successfully.


In [8]:
import joblib

# Restore the classifier, the TF-IDF vectorizer and the label encoder from disk,
# in that order, rebinding the notebook-level names used by predict_party.
rf_model, vectorizer, label_encoder = (
    joblib.load(_filename)
    for _filename in (
        'random_forest_model.joblib',
        'RFvectorizer.joblib',
        'RFlabel_encoder.joblib',
    )
)

# Prediction function for a new tweet
def predict_party(tweet):
    """Predict the party label for a single tweet.

    The text is vectorized with the notebook-level ``vectorizer``, classified
    by ``rf_model``, and the encoded class is mapped back to its original
    label with ``label_encoder``.

    Parameters
    ----------
    tweet : str or None
        Text of the tweet. ``None`` and float NaN are treated as the empty
        string, matching the ``fillna('')`` applied to the training column,
        instead of raising inside the vectorizer.

    Returns
    -------
    The decoded party label predicted for ``tweet``.
    """
    # NOTE(review): the model was fitted on the 'preprocessed_tweet' column,
    # while this function vectorizes the text exactly as given. Confirm
    # whether the same preprocessing should be applied to `tweet` first.

    # Missing text is handled the same way as in training: it becomes ''.
    # `tweet != tweet` is true only for NaN.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        tweet = ''

    # The vectorizer expects an iterable of documents, hence the 1-element list
    tweet_tfidf = vectorizer.transform([tweet])
    prediction = rf_model.predict(tweet_tfidf)

    # One document in, one prediction out: decode it and unwrap the single label
    return label_encoder.inverse_transform(prediction)[0]

# Smoke-test the predictor on one hand-written example and show the decoded label
sample_tweet = "I believe in the importance of limited government, personal responsibility, and a strong national defense."
predicted_party = predict_party(sample_tweet)
print(f"Prediction for sample tweet: {predicted_party}")


Prediction for sample tweet: Republican
