In [23]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras import layers, Sequential

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
data_path = '/Users/kun/code/kun1887/TwitPol/data_cleaned.csv'
data = pd.read_csv(data_path)

In [27]:
data.y.astype(bool).value_counts()

y
False    34452
True     33015
Name: count, dtype: int64

In [28]:
X = data.tweet_clean
y = data.y

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [31]:
X_train

34473    good news dickinson county base company specia...
13316                  year wait guy bring job talk action
63261    congratulation name th president louisville se...
20217    tax cut pay republican want pay goptaxscam cut...
8304     release classified information nunesmemo egreg...
                               ...                        
37194    honor understatement humble thank vietnam vete...
6265         like offer special thank amp leadership issue
54886    colleague urge ambassador lighthizer maintain ...
860      want thank city council staff leadership commu...
15795    today send letter facebook founder ceo mark zu...
Name: tweet_clean, Length: 60720, dtype: object

In [30]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_tok = tokenizer.texts_to_sequences(X_train)
X_test_tok = tokenizer.texts_to_sequences(X_test)

AttributeError: 'float' object has no attribute 'lower'

In [None]:
tokenizer.index_word

In [None]:
np.average([len(x) for x in X_train_tok])

In [None]:
vocal_size = len(tokenizer.word_index)

In [None]:
X_train_pad = pad_sequences(X_train_tok, dtype='int', padding='post', maxlen=10)
X_test_pad = pad_sequences(X_test_tok, dtype='int', padding='post')

In [None]:
print(X_train_pad.shape)
print(X_test_pad.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
X_train_pad[:10]

In [None]:
def init_model():
    model = Sequential()

    model.add(layers.Embedding(input_dim=vocal_size + 1, output_dim=50))
    model.add(layers.LSTM(5))
    model.add(layers.Dense(12, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
model = init_model()

history = model.fit(
    X_train_pad,
    y_train == "True",
    epochs=5,
    validation_split=0.3,
    batch_size=128,
    verbose=2
    )

plot_loss_accuracy(history)

In [None]:
def plot_loss_accuracy(history, title=None):
    fig, ax = plt.subplots(1,2, figsize=(20,7))

    # --- LOSS ---

    ax[0].plot(history.history['loss'])
    ax[0].plot(history.history['val_loss'])

    ax[0].set_title('Model loss')
    ax[0].set_ylabel('Loss')
    ax[0].set_xlabel('Epoch')

    ax[0].set_ylim((0,1))

    ax[0].legend(['Train', 'Test'], loc='best')

    ax[0].grid(axis="x",linewidth=0.5)
    ax[0].grid(axis="y",linewidth=0.5)

    # --- ACCURACY

    ax[1].plot(history.history['accuracy'])
    ax[1].plot(history.history['val_accuracy'])

    ax[1].set_title('Model Accuracy')
    ax[1].set_ylabel('Accuracy')
    ax[1].set_xlabel('Epoch')

    ax[1].legend(['Train', 'Test'], loc='best')

    ax[1].set_ylim((0,1))

    ax[1].grid(axis="x",linewidth=0.5)
    ax[1].grid(axis="y",linewidth=0.5)

    if title:
        fig.suptitle(title)

In [None]:
model.summary()

In [None]:
embedding = model.layers[0].weights[0]._value


In [None]:
example = ["I vote for Trump"]

tokenized = tokenizer.texts_to_sequences(example)
tokenized

In [None]:
padded = pad_sequences(tokenized, dtype='int', padding='post', maxlen=10)

In [None]:
model.predict(padded)

In [None]:
import lime
from lime.lime_text import LimeTextExplainer

class KerasClassifierWrapper:
    def __init__(self, model, tokenizer, maxlen):
        self.model = model
        self.tokenizer = tokenizer
        self.maxlen = maxlen

    def predict_proba(self, texts):
        sequences = self.tokenizer.texts_to_sequences(texts)
        padded = pad_sequences(sequences, maxlen=self.maxlen, padding='post')
        return np.hstack((1 - self.model.predict(padded), self.model.predict(padded)))

wrapper = KerasCtweet = data.tweet_clean[0]lassifierWrapper(model, tokenizer, X_train_pad.shape[1])
ls_X_test = cleaned_texts  # Your test data should be prepared similarly
class_names = {0: 'non-democratic', 1: 'democratic'}

explainer = LimeTextExplainer(class_names=class_names)

idx = np.random.randint(200)
explanation = explainer.explain_instance(ls_X_test[idx], wrapper.predict_proba, num_features=6)
print('Document id: %d' % idx)
print('Text: ', ls_X_test[idx])
print('Probability democratic =', wrapper.predict_proba([ls_X_test[idx]]).round(3)[0,1])
print('True class: %s' % class_names[data.iloc[idx]['y']])

# Show the explainability results with highlighted text
print("1 = democratuc class, 0 = non-democratic class")
explanation.show_in_notebook(text=True)


In [20]:
tweet = data.tweet_clean[33]

In [21]:
tweet

'let face planet b'

In [None]:
from openai import OpenAI
import os 
# Set your OpenAI API key
api_key = os.environ['OPENAI_API_KEY']

client = OpenAI(api_key=api_key)
query = f"Classify this tweet as Democratic or Republican: {tweet}"
response = client.chat.completions.create(model='gpt-3.5-turbo', messages=[
    {'role':'system', 'content': 'You are a tweet classifier for political sentiment. Be brief and concise.'},
    {'role':'user', 'content':query}])