# GPT for Sentiment Analysis

# Imports and installs

Installs for Simple Transformers (pinned transformers / simpletransformers versions, plus NVIDIA apex built from source)

In [None]:
# Install the pinned SimpleTransformers stack, then build NVIDIA apex from a fresh clone.
import os
# Higher versions have problems with CUDA
!pip install transformers==2.11.0
!pip install simpletransformers==0.41.1
# apex is installed from source; presumably it backs the 'fp16': True training option
# used in the SimpleTransformers section — TODO confirm.
!git clone https://github.com/NVIDIA/apex
# The pip command below targets "./", so the working directory has to be the clone.
os.chdir('apex')
# The two --global-option flags ask the apex build for its C++ and CUDA extensions.
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
# Return to the parent directory so the relative data/ paths used later still resolve.
os.chdir('..')

Installs for gpt-2-simple (GPT-2 fine-tuning on TensorFlow 1.x, without Simple Transformers)

In [None]:
# Switch the runtime to TensorFlow 1.x (the recorded output reports "TensorFlow 1.x selected.").
# NOTE(review): this magic looks Colab-specific — confirm before running elsewhere.
%tensorflow_version 1.x
# NOTE(review): gpt-2-simple is installed without a version pin, unlike transformers and
# simpletransformers in the first install cell.
!pip install gpt-2-simple 

In [None]:
import gpt_2_simple as gpt2

# Identifier of the pretrained GPT-2 checkpoint; the fine-tuning cell passes the same
# value as its model_name argument.
model_name = "345M" # The GPT-2 model we're using

# Fetches the checkpoint files for model_name (the recorded output lists them:
# checkpoint, encoder.json, hparams.json, model.ckpt.*, vocab.bpe).
gpt2.download_gpt2(model_name=model_name) # Download the model

In [1]:
import time
import numpy as np
import pandas as pd
import tensorflow as tf
import string
import re


from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split


TensorFlow 1.x selected.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



Fetching checkpoint: 1.05Mit [00:00, 67.8Mit/s]                                                     
Fetching encoder.json: 1.05Mit [00:00, 47.9Mit/s]                                                   
Fetching hparams.json: 1.05Mit [00:00, 229Mit/s]                                                    
Fetching model.ckpt.data-00000-of-00001: 1.42Git [00:09, 151Mit/s]                                  
Fetching model.ckpt.index: 1.05Mit [00:00, 188Mit/s]                                                
Fetching model.ckpt.meta: 1.05Mit [00:00, 98.2Mit/s]                                                
Fetching vocab.bpe: 1.05Mit [00:00, 170Mit/s]                                                       


# Read files
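Only execute one of the dataset sections below (Apple Sentiment, US Airline Sentiment, or T4SA). Each of them assigns the DataFrame `data`, so running more than one simply overwrites the previously loaded dataset.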

## Apple Sentiment

In [None]:
# Load the Apple Twitter sentiment CSV into `data`; the following cells read its
# `text` and `sentiment` columns.
data = pd.read_csv("data/sentiment/datasets_652925_1154930_apple-twitter-sentiment-texts.csv")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [None]:
# For SimpleTransformers: shift every label up by one.
# presumably the raw Apple labels start at -1 and need to become 0-based class ids —
# TODO confirm against the CSV. Re-running this cell shifts the labels again.
data.sentiment = data.sentiment + 1

## US Airline Sentiment

In [None]:
# Load the US Airline tweets and keep only the tweet text and its sentiment label.
data = pd.read_csv("data/sentiment/Tweets.csv")

# rename() must be given `columns=`: a bare mapping is matched against the row index,
# so 'airline_sentiment' was never renamed and `data.sentiment` did not exist afterwards.
# Chaining off the column selection (instead of inplace=True on a slice) also produces
# an independent frame, so later column assignments do not act on a copy of a slice.
data = data[['text', 'airline_sentiment']].rename(columns={'airline_sentiment': 'sentiment'})

In [None]:
# For SimpleTransformers: replace the airline label strings with integer class ids.
thisdict = {"negative": 0, "neutral": 1, "positive": 2}
# Plain indexing (rather than .get) keeps the KeyError for any label outside the mapping.
data.sentiment = data.sentiment.map(lambda label: thisdict[label])

## T4SA

In [None]:
# Load the T4SA tweets and their per-class sentiment scores, join them on the tweet id,
# and reduce the result to a (text, sentiment) frame.
tweets = pd.read_csv("data/sentiment/raw_tweets_text.csv")
# Path fixed from ".data/sentiment/..." so it matches the data/ directory used by every
# other read in this notebook; the scores file is tab-separated.
sentiments = pd.read_csv("data/sentiment/t4sa_text_sentiment.csv", delimiter="\t")

# Index both frames by tweet id (`id` / `TWID`) and left-join the scores onto the tweets.
# Setting the index by column name moves the key out of the columns, so no separate
# drop of 'id' / 'TWID' is needed. dropna() removes tweets that have no scores.
data = (
    tweets.set_index("id")
    .join(sentiments.set_index("TWID"))
    .dropna()
)

# The label is the name of the score column (NEU / NEG / POS) with the largest value.
data["sentiment"] = data[['NEU', 'NEG', 'POS']].idxmax(axis=1)

# .copy() gives later cells an independent frame to assign into, avoiding the
# "value is trying to be set on a copy of a slice" warning.
data = data[['text', 'sentiment']].copy()


In [None]:
# For SimpleTransformers: replace the T4SA column-name labels with integer class ids.
thisdict = {"NEG": 0, "NEU": 1, "POS": 2}
# Plain indexing (rather than .get) keeps the KeyError for any label outside the mapping.
data.sentiment = data.sentiment.map(lambda label: thisdict[label])

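# General Text Cleaning

Run this on whichever dataset was loaded into `data`; the cell ends by splitting the cleaned data into `df_train` and `df_test`.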

In [None]:
# Normalise the tweet text stored in `data`, then split it into train and test frames.
data.text = data.text.str.lower()

# Remove URLs: "http" followed by any run of non-whitespace characters.
data.text = data.text.apply(lambda x: re.sub(r'http\S+', '', x))

# Tokenise with handle stripping and length reduction enabled, then re-join the tokens
# with single spaces so the column holds plain strings again.
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
data.text = data.text.apply(lambda x: ' '.join(tokenizer.tokenize(x)))

# Delete every ASCII punctuation character. The translation table is built once
# instead of once per row.
punctuation_table = str.maketrans('', '', string.punctuation)
data.text = data.text.map(lambda x: x.translate(punctuation_table))

# Replace each digit with a space. regex=True has to be explicit: where str.replace
# defaults to literal matching, "[0-9]" would be searched for verbatim and no digit
# would ever be replaced.
data.text = data.text.str.replace("[0-9]", " ", regex=True)

data.text = data.text.str.strip(string.whitespace)

# Fixed random_state keeps the 67/33 split reproducible across runs.
df_train, df_test = train_test_split(data, test_size=0.33, random_state=42)

# Give both splits a fresh 0..n-1 index; the shuffled original index is discarded.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Training with gpt-2-simple (without Simple Transformers)

In [2]:
# Fine-tune the downloaded GPT-2 model on a plain-text training file.
# Needs a text file with a specific format.
# NOTE(review): judging by the prompts used in the evaluation cell, each line presumably
# looks like "// <tweet text> || <label>" — confirm against data/gpt_apple_train.txt.
# In the recorded run, training resumed from checkpoint/run1/model-6 and the result was
# saved as checkpoint/run1/model-16.
sess = gpt2.start_tf_sess()
gpt2.finetune(sess,
              'data/gpt_apple_train.txt',
              model_name=model_name,
              steps=10)   # steps is max number of training steps

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Please use tensorflow.python.ops.op_selector.get_backward_walk_ops.
Loading checkpoint checkpoint/run1/model-6
INFO:tensorflow:Restoring parameters from checkpoint/run1/model-6


  0%|          | 0/1 [00:00<?, ?it/s]

Loading dataset...


100%|██████████| 1/1 [00:00<00:00,  4.16it/s]


dataset has 24026 tokens
Training...
[7 | 103.57] loss=4.31 avg=4.31
[8 | 193.70] loss=4.50 avg=4.41
[9 | 285.07] loss=4.40 avg=4.40
[10 | 371.54] loss=4.14 avg=4.34
[11 | 458.29] loss=4.00 avg=4.27
[12 | 545.75] loss=3.91 avg=4.21
[13 | 632.53] loss=3.98 avg=4.17
[14 | 718.38] loss=3.87 avg=4.13
[15 | 805.41] loss=3.56 avg=4.07
[16 | 892.85] loss=3.76 avg=4.04
Saving checkpoint/run1/model-16


## Evaluation

In [3]:
def interact_model(
    model_name,
    seed,
    nsamples,
    batch_size,
    length,
    temperature,
    top_k,
    models_dir
):
    """Interactively prompt a GPT-2 checkpoint and print the sampled continuations.

    The function restores the latest checkpoint found in ``models_dir/model_name``
    and then reads prompts from standard input in an endless loop; it never returns
    on its own and is meant to be stopped by interrupting the kernel.

    Relies on the ``encoder``, ``model`` and ``sample`` modules of the gpt-2
    repository and on ``os``, ``np`` and ``tf`` being available in the notebook
    namespace when it is called.

    Args:
        model_name: Sub-directory of ``models_dir`` holding ``hparams.json`` and the
            checkpoint files.
        seed: Seed passed to both NumPy and TensorFlow; ``None`` is passed through.
        nsamples: Number of samples printed per prompt; must be a multiple of
            ``batch_size``.
        batch_size: Samples generated per session run; ``None`` means 1.
        length: Number of tokens to generate; ``None`` means half of the model's
            context window (``hparams.n_ctx // 2``).
        temperature: Forwarded unchanged to ``sample.sample_sequence``.
        top_k: Forwarded unchanged to ``sample.sample_sequence``.
        models_dir: Directory containing the model folders; ``~`` and environment
            variables are expanded.

    Raises:
        AssertionError: If ``nsamples`` is not divisible by ``batch_size``.
        ValueError: If ``length`` exceeds the model's context window.
    """
    # `json` is not imported anywhere else in the notebook, so import it here to keep
    # the function working on a freshly restarted kernel.
    import json

    models_dir = os.path.expanduser(os.path.expandvars(models_dir))
    if batch_size is None:
        batch_size = 1
    assert nsamples % batch_size == 0

    enc = encoder.get_encoder(model_name, models_dir)
    hparams = model.default_hparams()
    # Override the default hyper-parameters with the ones stored next to the checkpoint.
    with open(os.path.join(models_dir, model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx // 2
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

    with tf.Session(graph=tf.Graph()) as sess:
        # The prompt is fed at run time; its token length is left unspecified.
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = sample.sample_sequence(
            hparams=hparams, length=length,
            context=context,
            batch_size=batch_size,
            temperature=temperature, top_k=top_k
        )

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name))
        saver.restore(sess, ckpt)

        # No exit condition: the loop runs until the user interrupts it.
        while True:
            raw_text = input("Model prompt >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Model prompt >>> ")
            context_tokens = enc.encode(raw_text)
            generated = 0
            for _ in range(nsamples // batch_size):
                # Slice off the prompt tokens so only the continuation is decoded.
                out = sess.run(output, feed_dict={
                    context: [context_tokens for _ in range(batch_size)]
                })[:, len(context_tokens):]
                for i in range(batch_size):
                    generated += 1
                    text = enc.decode(out[i])
                    print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                    print(text)
            print("=" * 80)

In [4]:
!git clone https://github.com/openai/gpt-2.git
import os
os.chdir("gpt-2/src/")
import tensorflow as tf
import model, sample, encoder
os.chdir('../../')

fatal: destination path 'gpt-2' already exists and is not an empty directory.


#### This is for an interactive evaluation. It's enough to see that this training doesn't work very well...

In [5]:
!ls
interact_model(
    'run1',
    None,
    1,
    1,
    2,
    1,
    0,
    './checkpoint'
)

checkpoint  drive  gpt-2  models  sample_data




Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Use `tf.random.categorical` instead.
INFO:tensorflow:Restoring parameters from ./checkpoint/run1/model-16
Model prompt >>> // x bonus airmiles right now  airmilesshops ||
 noon 
Model prompt >>> // since when did stop replacing iphone units due to defect  just because i have some scratches on my phone doesnt mean i dropped it ||
  //
Model prompt >>> // zoonova whatif analysis aapl lnkd fb googl tsla stocks options bonds frn mm       bps vol      nasdaq ||
  
Model prompt >>> // another fucking software update  really ||
  
Model prompt >>> // i had to get my logic board completely replaced  if its under warranty theyll do it for free  make sure to back up b ||
  
Model prompt >>> // thanks for pushing me into yosemite it has been the worst so slow and clunky  cantgoback misssnowleopard whathappened ||
 || cm


KeyboardInterrupt: ignored

# Using SimpleTransformers

In [None]:
# ClassificationModel was used without ever being imported in this notebook; import it
# here so the cell runs on a fresh kernel.
from simpletransformers.classification import ClassificationModel

# Three-class GPT-2 (medium) classifier trained on the GPU.
# NOTE: binding the name `model` here replaces the gpt-2 `model` module imported for
# interact_model, so interact_model can no longer be called after this cell has run.
model = ClassificationModel('gpt2', 'gpt2-medium', num_labels=3, use_cuda=True, args={
    'learning_rate': 3e-5,
    'num_train_epochs': 5,
    # Rebuild the features from df_train rather than reusing a cached version.
    'reprocess_input_data': True,
    # Allow a previous run's output directory to be overwritten.
    'overwrite_output_dir': True,
    'process_count': 10,
    'train_batch_size': 4,
    'eval_batch_size': 4,
    'max_seq_length': 512,
    # presumably relies on the apex build from the install cell — TODO confirm.
    'fp16': True
})

# df_train comes from the text-cleaning cell and has the columns text and sentiment.
model.train_model(df_train)

## Evaluation

In [None]:
import numpy as np
# eval_model returns three values; only the middle one is kept.
# presumably it holds one row of class scores per test example — TODO confirm.
_, model_outputs_test, _ = model.eval_model(df_test)

# Predicted class id = position of the largest value along axis 1 of the outputs.
preds_test = np.argmax(model_outputs_test, axis=1)

In [None]:
from sklearn.metrics import f1_score, accuracy_score


# average=None makes f1_score report one score per class instead of a single aggregate.
print(f1_score(df_test.sentiment, preds_test, average=None))
# Accuracy of the predictions against the true labels of the test split.
print(accuracy_score(df_test.sentiment, preds_test))