<h1>
<font color="#FA8072">Install Requirements</font>
</h1>

In [31]:
!pip install sentencepiece
!pip install simpletransformers
!pip install openai

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8


<h1>
<font color="#FA8072">Mount Drive</font>
</h1>

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


<h1>
<font color="#FA8072">Import Libraries</font>
</h1>

In [32]:
import os
import pandas as pd
import numpy as np
import nltk
import sentencepiece as spm
import shutil
import torch
import matplotlib.pyplot as plt
import tensorflow as tf
import openai
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from transformers import TFDistilBertModel, DistilBertTokenizer
from tensorflow.keras.layers import Dense, Input, Conv1D, GlobalMaxPooling1D, LSTM
from tensorflow.keras.models import Model
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from simpletransformers.language_modeling import LanguageModelingModel
from sklearn.model_selection import KFold
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [4]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

<h1>
<font color="#FA8072">1. Word2Vec</font>
</h1>

<h2>
<font color="#FA8072">1.1 Train Word2Vec Models for Each Data Category
</font>
</h2>

In [21]:
categories = ["true", "mostly-true", "half-true", "barely-true", "false", "pants-fire"]
data_dir = "/content/gdrive/MyDrive"
clean_data_dir = os.path.join(data_dir, "data/clean/")
wordbroken_dir = os.path.join(data_dir, "data/wordbroken/")
sentenceborken = os.path.join(data_dir, "data/sentencebroken/")
models_dir = os.path.join(data_dir, "models/word2vec/")
tokenization_dir = os.path.join(data_dir, "models/tokenization/")
language_dir = os.path.join(data_dir, "models/language/")
architecture_dir = os.path.join(data_dir, "models/architecture/")
stats_dir = os.path.join(data_dir, "stats/")

In [16]:
for category in categories:
    category_sentence_dir = os.path.join(sentenceborken, f"{category}-sentences.csv")
    model_save_path = f"{models_dir}{category}.word2vec.npy"
    sentencebroken_data = pd.read_csv(category_sentence_dir)
    sentences = [nltk.word_tokenize(sentence) for sentence in sentencebroken_data['Statement']]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    model.wv.save(model_save_path)

<h2>
<font color="#FA8072">1.2 Compare and Analyze Vectors of Common Words</font>
</h2>

In [33]:
category_models = {}
for category in categories:
    category_model_path = os.path.join(models_dir, f"{category}.word2vec.npy")
    if os.path.exists(category_model_path):
        category_model = KeyedVectors.load(category_model_path)
        category_models[category] = category_model

common_words = set.intersection(*[set(category_model.index_to_key) for category_model in category_models.values()])

same_vectors = []
different_vectors = []
for word in common_words:
    vectors = [category_models[category].get_vector(word) for category in category_models]
    similarity_scores = cosine_similarity(vectors)
    if np.all(similarity_scores >= 0.8):
        same_vectors.append(word)
    else:
        different_vectors.append(word)

print("Common words with the same vectors:")
print(same_vectors)

print("Common words with different vectors:")
print(different_vectors)

Common words with the same vectors:
['increase', 'men', 'Democrats', 'great', 'part', 'drug', 'home', 'state', 'including', 'things', 'done', 'Under', 'if', 'me', 'Hillary', 'Jim', 'won', '7', 'give', 'between', 'Parenthood', 'better', 'their', 'After', 'An', 'taxes', 'ID', 'County', 'girls', 'mayor', '5', 'I', 'poll', 'due', 'wants', 'she', 'family', 'Americas', 'youre', 'Mexico', 'came', 'around', 'bill', 'closed', 'these', '30', 'long', ',', 'Says', 'know', 'Medicare', 'everyone', 'Ohio', 'check', 'so', '60', 'cases', 'went', 'wouldnt', 'over', 'reduce', 'just', 'George', 'includes', 'attack', 'its', 'income', 'Act', 'money', 'study', 'nation', 'World', '6', 'votes', 'line', 'water', 'born', 'started', 'New', 'how', 'international', 'after', 'name', 'risk', 'using', 'proposed', 'gay', 'it', 'giving', '3', 'there', 'covered', 'think', 'Jersey', 'crime', 'thats', 'like', 'nearly', 'without', 'are', 'police', 'always', 'plan', 'lost', 'much', 'Carolina', 'We', 'time', '10', 'leader', '

<h2>
<font color="#FA8072">1.3 Train Word2Vec Model on All Data</font>
</h2>


In [35]:
all_model_save_path = os.path.join(models_dir, "all.word2vec.npy")

combined_sentences = []
for category in categories:
    category_sentence_dir = os.path.join(sentenceborken, f"{category}-sentences.csv")
    if os.path.exists(category_sentence_dir):
        category_sentence_data = pd.read_csv(category_sentence_dir)
        combined_sentences.extend(category_sentence_data['Statement'])

tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in combined_sentences]

model_combined = Word2Vec(tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)
model_combined.wv.save(all_model_save_path)

print("Model for all labels saved at", all_model_save_path)

Model for all labels saved at /content/gdrive/MyDrive/models/word2vec/all.word2vec.npy


<h1>
<font color="#FA8072">2. Tokenization</font>
</h1>

In [8]:
tokenizer_sizes = [100, 500, 1000, 5000]

In [9]:
def split_data(data, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    return list(kf.split(data))

In [10]:
def train_and_evaluate_tokenizer(sentences, tokenizer_size):
    tokenizer_results = []
    data_splits = split_data(sentences)
    for size in tokenizer_size:
        unk_percentages = []
        for i, (train_index, eval_index) in enumerate(data_splits):
            train_data = [sentences[idx] for idx in train_index]
            eval_data = [sentences[idx] for idx in eval_index]
            train_data_file = f"train_data_{size}_{i}.txt"
            with open(train_data_file, "w") as f:
                f.write("\n".join(train_data))
            model_prefix = f"tokenizer_{size}_{i}"
            spm.SentencePieceTrainer.train(
                f"--input={train_data_file} --model_prefix={model_prefix} --vocab_size={size} --hard_vocab_limit=false")
            tokenizer = spm.SentencePieceProcessor()
            tokenizer.load(f"{model_prefix}.model")
            total_tokens = 0
            unk_tokens = 0
            for text in eval_data:
                tokens = tokenizer.encode_as_pieces(text)
                total_tokens += len(tokens)
                unk_tokens += tokens.count("▁UNK")
            unk_percentage = (unk_tokens / total_tokens) * 100 if total_tokens != 0 else 0
            unk_percentages.append(unk_percentage)
        tokenizer_results.append({"Size": size, "Unk Percentages": sum(unk_percentages) / len(unk_percentages)})
    return tokenizer_results

In [67]:
cleaned_data = {}
for category in categories:
    category_csv = os.path.join(clean_data_dir, f"{category}-cleaned.csv")
    if os.path.exists(category_csv):
        cleaned_data[category] = pd.read_csv(category_csv)
combined_sentences = []
for category in categories:
    category_sentence_dir = os.path.join(sentenceborken, f"{category}-sentences.csv")
    if os.path.exists(category_sentence_dir):
        category_sentence_data = pd.read_csv(category_sentence_dir)
        combined_sentences.extend(category_sentence_data['Statement'])
tokenizer_results = train_and_evaluate_tokenizer(combined_sentences, tokenizer_sizes)
results_df = pd.DataFrame(tokenizer_results)
results_path = os.path.join(stats_dir, "tokenizer_results.csv")
results_df.to_csv(results_path, index=False)
mean_percentages = [np.mean(percentages) for percentages in results_df["Unk Percentages"]]
best_size = results_df["Size"].iloc[np.argmin(mean_percentages)]
print("Best Tokenizer Size:", best_size)
best_model_prefix = f"tokenizer_{best_size}_0"
best_tokenizer_path = os.path.join(tokenization_dir, f"best_tokenizer_{best_size}.model")
os.makedirs(os.path.dirname(best_tokenizer_path), exist_ok=True)
shutil.move(f"{best_model_prefix}.model", best_tokenizer_path)

Best Tokenizer Size: 100


'/content/gdrive/MyDrive/models/tokenization/best_tokenizer_100.model'

<h1>
<font color="#FA8072">3. Language Model</font>
</h1>

In [5]:
language_models = {
    "true": "gpt2",
    "mostly-true": "gpt2",
    "half-true": "gpt2",
    "barely-true": "gpt2",
    "false": "gpt2",
    "pants-fire": "gpt2"
}

In [9]:
for category in categories:
    clean_data_file = os.path.join(clean_data_dir, f"{category}-cleaned.csv")
    df = pd.read_csv(clean_data_file)
    language_model_save_path = os.path.join(language_dir, f"{category}.language_model")
    tokenizer = GPT2Tokenizer.from_pretrained(language_models[category])
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model = GPT2LMHeadModel.from_pretrained(language_models[category])
    tokenized_data = tokenizer(df["Statement"].tolist(), truncation=True, padding=True, return_tensors="pt")
    model.train()
    model.resize_token_embeddings(len(tokenizer))
    model.forward = model.generate
    model.save_pretrained(language_model_save_path)
    print(f"Language model for category '{category}' saved at {language_model_save_path}")

Language model for category 'true' saved at /content/gdrive/MyDrive/models/language/true.language_model
Language model for category 'mostly-true' saved at /content/gdrive/MyDrive/models/language/mostly-true.language_model
Language model for category 'half-true' saved at /content/gdrive/MyDrive/models/language/half-true.language_model
Language model for category 'barely-true' saved at /content/gdrive/MyDrive/models/language/barely-true.language_model
Language model for category 'false' saved at /content/gdrive/MyDrive/models/language/false.language_model
Language model for category 'pants-fire' saved at /content/gdrive/MyDrive/models/language/pants-fire.language_model


In [16]:
generated_sentences = []
for category in categories:
    generated_sentences = []
    for _ in range(5):
        input_ids = tokenizer.encode("Generate a sentence", return_tensors="pt")
        outputs = model.generate(input_ids, max_length=100, num_return_sequences=1)
        generated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_sentences.append(generated_sentence)
    output_file = os.path.join(stats_dir, "txts", f"{category}_sentences.txt")
    with open(output_file, "w") as f:
        for sentence in generated_sentences:
            f.write(sentence + "\n")
    print(f"Generated sentences for category '{category}' saved at {output_file}")

Generated sentences for category 'true' saved at /content/gdrive/MyDrive/stats/txts/true_sentences.txt
Generated sentences for category 'mostly-true' saved at /content/gdrive/MyDrive/stats/txts/mostly-true_sentences.txt
Generated sentences for category 'half-true' saved at /content/gdrive/MyDrive/stats/txts/half-true_sentences.txt
Generated sentences for category 'barely-true' saved at /content/gdrive/MyDrive/stats/txts/barely-true_sentences.txt
Generated sentences for category 'false' saved at /content/gdrive/MyDrive/stats/txts/false_sentences.txt
Generated sentences for category 'pants-fire' saved at /content/gdrive/MyDrive/stats/txts/pants-fire_sentences.txt


<h1>
<font color="#FA8072">4. Feature Engineering</font>
</h1>

In [6]:
def sentence_length_classification(X_train, X_test, y_train, y_test):
    X_train = [[len(sentence)] for sentence in X_train]
    X_test = [[len(sentence)] for sentence in X_test]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, model.predict(X_test))
    return train_acc, test_acc

In [7]:
def word_length_classification(X_train, X_test, y_train, y_test):
    X_train = [[len(word) for word in sentence.split()] for sentence in X_train]
    X_test = [[len(word) for word in sentence.split()] for sentence in X_test]
    X_train = [length for sublist in X_train for length in sublist]
    X_test = [length for sublist in X_test for length in sublist]
    model = DecisionTreeClassifier()
    model.fit(np.array(X_train).reshape(-1, 1), y_train)
    train_acc = accuracy_score(y_train, model.predict(np.array(X_train).reshape(-1, 1)))
    test_acc = accuracy_score(y_test, model.predict(np.array(X_test).reshape(-1, 1)))
    return train_acc, test_acc

In [13]:
data = {}
for category in categories:
    category_csv = os.path.join(clean_data_dir, f"{category}-cleaned.csv")
    if os.path.exists(category_csv):
        data[category] = pd.read_csv(category_csv)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)
train_accs = []
val_accs = []
test_accs = []
for feature in ['sentence_length', 'word_length']:
    train_acc, val_acc, test_acc = train_and_evaluate_tokenizer(train_data, val_data, test_data, feature)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
    test_accs.append(test_acc)
epochs = range(1, len(train_accs) + 1)
plt.plot(epochs, train_accs, label='Train')
plt.plot(epochs, val_accs, label='Validation')
plt.plot(epochs, test_accs, label='Test')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Classification Accuracy by Feature')
plt.legend()
plt.show()
plot_path = os.path.join(stats_dir, "classification_accuracy.png")
plt.savefig(plot_path)
plt.close()
print("Classification accuracy plot saved at", plot_path)

Classification accuracy plot saved at /content/gdrive/MyDrive/stats/classification_accuracy.png


<h1>
<font color="#FA8072">5. Model Architecture</font>
</h1>

In [15]:
features = ["sentence_length", "word2vec", "word2vec_bigram"]

In [26]:
data = {}
for category in categories:
    file_path = os.path.join(clean_data_dir, f"{category}-cleaned.csv")
    if os.path.exists(file_path):
        data[category] = pd.read_csv(file_path)

In [22]:
def create_transformer_model():
    input_ids = Input(shape=(100,), dtype=tf.int32)
    transformer = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
    output = transformer(input_ids)[0]
    output = tf.keras.layers.GlobalAveragePooling1D()(output)
    output = Dense(len(categories), activation="softmax")(output)
    model = Model(inputs=input_ids, outputs=output)
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [23]:
def create_cnn_model():
    input_shape = (100, len(features))
    model = tf.keras.Sequential()
    model.add(Conv1D(128, 5, activation='relu', input_shape=input_shape))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(len(categories), activation='softmax'))
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [24]:
def create_rnn_model():
    input_shape = (100, len(features))
    model = tf.keras.Sequential()
    model.add(LSTM(128, input_shape=input_shape))
    model.add(Dense(len(categories), activation='softmax'))
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [29]:
for category in categories:
    train_data, test_data = train_test_split(data[category], test_size=0.2, random_state=42)
    train_features = train_data[features].values
    train_labels = train_data["label"].values
    test_features = test_data[features].values
    test_labels = test_data["label"].values
    model = create_transformer_model()
    model.fit(train_features, train_labels, epochs=10, batch_size=32, validation_split=0.2)
    predictions = model.predict(test_features)
    report = classification_report(test_labels, predictions)
    print(report)
    model.save(os.path.join(architecture_dir, f"{category}_model"))

<h1>
<font color="#FA8072">6. Data Augmentation</font>
</h1>

In [33]:
openai.api_key = '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'
prompts = {
    'category1': 'Prompt for category 1',
    'category2': 'Prompt for category 2',
    'category3': 'Prompt for category 3',
    'category4': 'Prompt for category 4',
    'category5': 'Prompt for category 5',
    'category6': 'Prompt for category 6',
}
generated_data = {}
for category, prompt in prompts.items():
    response = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt,
        max_tokens=100,
        temperature=0.7,
        n=5,
        stop=None
    )
    generated_data[category] = response.choices

for category, generated_text in generated_data.items():
    print(f"Category: {category}")
    print("Original data: ...")
    print("Generated data: ...")
    print("Analysis: ...")
    print()

RateLimitError: ignored

<h1>
<font color="#FA8072">7. Rating by OpenAI</font>
</h1>

In [34]:
prompt = "Classify the following statement as Shot-Zero or Shot-Few:"
examples = [
    ['Shot-Zero', 'This is a statement with very few shots.'],
    ['Shot-Few', 'This statement has a reasonable number of shots.'],
]

In [35]:
statements = [
    'This is a statement with very few shots.',
    'This statement has a reasonable number of shots.',
    'The number of shots in this statement is quite high.',
]

In [38]:
classifications = openai.Completion.create(
    engine="davinci",
    prompt=prompt,
    logit_bias={
        "Shot-Zero": -10,
        "Shot-Few": 0,
    },
    max_tokens=2,
    n=1,
)
for i, classification in enumerate(classifications.choices):
    statement = statements[i]
    label = classification.text.strip()
    print(f"Statement: {statement}")
    print(f"Classification: {label}")
    print()

RateLimitError: ignored