In [18]:
import transformers as hug
import tensorflow as tf
import pandas as pd
import warnings
import os

from tqdm.auto import tqdm
import numpy as np
import seaborn as sns
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / TensorFlow deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Legacy TF1-style GPU memory-growth setup, left disabled.
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
# sess = tf.Session(config=config)
# Presumably a workaround for the "duplicate OpenMP runtime" abort — NOTE(review): confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'



In [19]:
# List the GPUs visible to TensorFlow; an empty list means none are visible.
tf.config.list_physical_devices('GPU') 

[]

In [None]:
%matplotlib inline
# Use seaborn's 'notebook' plotting context with fonts scaled by 1.5.
# NOTE(review): seaborn documents that set_context's `rc` only updates parameters that belong to the
# context definition; 'figure.figsize' may therefore be ignored here — confirm, and if so set it
# through matplotlib's rcParams instead.
sns.set_context('notebook', rc={'figure.figsize': (10, 6)}, font_scale=1.5)

In [None]:
def pre_process_input_data(filepath='./data/tweets.csv',encoding='cp1252'): #Change encoding if not on windows
    """Load the raw tweets CSV and keep only the sentiment label and the tweet text.

    The file is read without a header row and is expected to contain six
    columns in the order: target, id, date, flag, username, text.

    Parameters
    ----------
    filepath : str
        Location of the CSV file.
    encoding : str
        Text encoding handed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``target`` (0 = negative, 1 = positive) and ``text``.
    """
    column_names = ['target','id','date','flag','username','text']
    # Parse only the two columns that are kept instead of loading all six and
    # dropping four afterwards.
    tweets = pd.read_csv(
        filepath,
        encoding=encoding,
        header=None,
        names=column_names,
        usecols=['target','text'],
    )
    # Raw labels are 0 = negative and 4 = positive; remap 4 -> 1 so the target is 0/1.
    return tweets.replace({'target': {4: 1}})

In [None]:
# Load the tweets, then plot how many rows carry each target value.
input_data = pre_process_input_data(filepath= "../data/tweets.csv") #Change this to the filepath of the tweets file

input_data['target'].hist()
# input_data.visualize()

## HuggingFace Transformer
<hr>

In [None]:
from transformers import BertTokenizer, TFBertModel
# Load the pretrained 'bert-base-uncased' tokenizer and TensorFlow model once at notebook level;
# both serve as the default arguments of convert_df_to_encoded.
# from_pretrained presumably fetches the files on first use — needs network access or a local cache.
TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
MODEL = TFBertModel.from_pretrained("bert-base-uncased")

In [None]:
# Work on a slice of the data: the first and the last `num_samples` rows.
# NOTE(review): presumably the file is ordered by target, so this yields equal
# numbers of negative and positive tweets — the histogram below is the check.
num_samples = 30000
subset = pd.concat([
    input_data.iloc[:num_samples],
    input_data.iloc[-num_samples:],
])

# To run on everything instead:
# subset = input_data

subset['target'].hist()

In [None]:

# tens,attention = convert_to_encoded("Hello World I am")
def convert_df_to_encoded(df,text_col='text',model=MODEL,tokenizer=TOKENIZER,batch_size=500):
    """Return a copy of ``df`` with a ``features`` column of first-token embeddings.

    The text column is tokenized and passed through ``model`` in batches of
    ``batch_size`` rows. For each row, the vector at sequence position 0 of the
    model's ``last_hidden_state`` is stored as a Python list of floats.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    text_col : str
        Name of the column holding the strings to encode.
    model, tokenizer
        Model / tokenizer pair; default to the notebook-level MODEL and TOKENIZER.
    batch_size : int
        Number of rows encoded per forward pass.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as ``df`` with an added ``features`` column.
    """
    if len(df) == 0:
        # Nothing to encode (np.max of an empty list would raise).
        return df.assign(features=[])

    # The character count of the longest text is used as the truncation limit,
    # capped at the longest sequence the tokenizer says the model supports.
    max_twt_len = int(np.max([len(v) for v in df[text_col]]))
    max_length = min(max_twt_len, getattr(tokenizer, 'model_max_length', max_twt_len))

    encoded_chunks = []
    for lower in tqdm(range(0, len(df), batch_size)): #Split into smaller chunks
        chunk = df.iloc[lower:lower + batch_size]
        # padding=True pads only to the longest sequence in this batch; the
        # attention mask returned by the tokenizer hides the padding from the model.
        features = tokenizer(chunk[text_col].values.tolist(),padding=True, truncation=True, return_tensors='tf',max_length=max_length)
        features = model(**features).last_hidden_state[:,0,:]
        # assign() builds a new frame, so nothing is written onto a slice of df.
        encoded_chunks.append(chunk.assign(features=features.numpy().tolist()))
    # One concat at the end instead of re-copying the accumulated frame every batch.
    return pd.concat(encoded_chunks)
#TODO: PCA compression on vectors down to 250 space for memory reasons
# NOTE(review): the PCA cell further down uses n_components=300, not 250 — reconcile.


# input_data.sort_values(by='target')

# Encode every row of `subset`; this runs the model forward pass batch by batch.
tmp = convert_df_to_encoded(subset)
# tmp = input_data['text'].apply(convert_to_encoded)

#### PCA


In [None]:
# Stack the per-row feature lists into one 2-D array (rows = tweets).
x = np.array(tmp['features'].tolist())

In [None]:
# Sanity check: dimensions of the stacked feature array.
x.shape

In [None]:
from sklearn.model_selection import train_test_split
# Three parallel collections are split together so they stay row-aligned:
#   vectors -> model inputs, targets -> 0/1 sentiment, labels -> the raw tweet text
#   (despite its name, `labels` holds tmp['text']; it is kept for inspecting predictions later).
vectors = x
targets = tmp['target']
labels = tmp['text']

# Hold out 10% of the rows as the test set; random_state fixes the shuffle.
train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)

In [None]:
# Carve a validation set (20%) out of the remaining training rows, again with a fixed shuffle.
# NOTE: train_* are overwritten in place, so re-running this cell alone shrinks the training set again.
train_vectors, val_vectors, train_targets, val_targets, train_labels,val_labels = \
    train_test_split(train_vectors,train_targets,train_labels,test_size = 0.2, random_state=0)

In [None]:
from sklearn.decomposition import PCA

# Fit the projection on the training split only, then apply that same fitted
# projection to the validation and test splits.
# random_state pins the randomized SVD solver (which scikit-learn may select
# automatically) so reruns produce the same components, matching the fixed
# seeds used for the splits and the classifier.
# NOTE: the split variables are overwritten in place — re-run the split cells
# before re-running this one.
pca = PCA(n_components=300, random_state=0)
train_vectors = pca.fit_transform(train_vectors)
val_vectors = pca.transform(val_vectors)
test_vectors = pca.transform(test_vectors)


In [None]:
# Report the dimensions of the three splits after PCA.
print("\n".join([
    "",
    f"Train Vector Shape: {train_vectors.shape}",
    f"Validation Vector Shape: {val_vectors.shape}",
    f"Test Vector Shape: {test_vectors.shape}",
    "",
]))

In [None]:
# Linear baseline: SGD-trained classifier with logistic ('log_loss') loss on the PCA features.
# random_state fixes the shuffling; max_iter caps the number of passes at 500.
classifier = SGDClassifier(loss='log_loss', random_state=0, max_iter=500)
classifier.fit(train_vectors, train_targets)

In [None]:
# Test-set accuracy of the linear baseline. scikit-learn's signature is
# accuracy_score(y_true, y_pred), so the ground truth goes first.
accuracy_score(test_targets, classifier.predict(test_vectors))

In [16]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Input, Dropout, Embedding

In [17]:
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization
from tensorflow.keras.layers import Layer
import tensorflow as tf

In [None]:
def generateNetwork(input_dim=300):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Input(input_dim) -> Dense(64, relu) -> Dropout(0.5) -> Dense(1, sigmoid),
    compiled with Adam (learning rate 1e-3), binary cross-entropy loss and an
    accuracy metric.

    Parameters
    ----------
    input_dim : int
        Length of each input feature vector. Defaults to 300, the number of
        PCA components used in this notebook.

    Returns
    -------
    (model, callbacks)
        The compiled Keras model and a list holding one EarlyStopping callback.
    """
    model = Sequential()
    # Input layer: one flat feature vector per sample.
    model.add(tf.keras.Input(shape=(input_dim,)))
    model.add(layers.Dense(64,activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1,activation='sigmoid'))

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=1e-3,
    )
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    # Stop once val_loss has not improved by at least 0.001 for 20 epochs, and
    # roll back to the best epoch's weights so the returned model is not the
    # one from 20 epochs past the optimum.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=20,
            min_delta=0.001,
            restore_best_weights=True,
        )
    ]
    return model, callbacks
# Instantiate the dense classifier and its training callbacks.
mdl,callbacks = generateNetwork()

In [None]:
def cnn_seq_noGlove(vocab_size=30522, seq_len=300, num_classes=3):
    """Build and compile a 1-D CNN over integer token-id sequences.

    Token ids are mapped to trainable EMBED_SIZE-wide vectors (no pretrained
    GloVe weights), followed by two Conv1D/MaxPooling1D/Dropout stages, a dense
    layer and a softmax output.

    Parameters
    ----------
    vocab_size : int
        Number of distinct token ids the Embedding layer must cover.
        NOTE(review): the default is assumed to match the bert-base-uncased
        vocabulary — confirm against the tokenizer that produces the ids.
    seq_len : int
        Length of each input sequence. With two kernel-5 convolutions and two
        pool-5 stages it must be at least 49.
    num_classes : int
        Width of the softmax output. The model is compiled with
        categorical_crossentropy, so targets must be one-hot encoded with this
        many columns.

    Returns
    -------
    tensorflow.keras.Model
        The compiled model.
    """
    EMBED_SIZE = 100  # width of each learned token vector
    sequence_input = Input(shape=(seq_len,), dtype='int32')
    # Conv1D needs a (batch, steps, channels) input; the embedding supplies the
    # channel axis that the bare (batch, steps) id tensor lacks.
    x = Embedding(vocab_size, EMBED_SIZE)(sequence_input)
    x = Conv1D(128, 5, activation='relu', kernel_initializer='he_uniform')(x)
    x = MaxPooling1D(5)(x)
    x = Dropout(0.2)(x)
    x = Conv1D(128, 5, activation='relu', kernel_initializer='he_uniform')(x)
    x = MaxPooling1D(5)(x)
    x = Dropout(0.2)(x)

    x = Flatten()(x)
    x = Dropout(0.2)(x)
    x = Dense(128, activation='relu', kernel_initializer='he_uniform')(x)
    preds = Dense(num_classes, activation='softmax', kernel_initializer='glorot_uniform')(x)

    model = Model(sequence_input, preds)

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model

In [None]:
# Sanity check: shape of the network's output tensor.
mdl.output_shape


In [None]:
# Train for at most 200 epochs; the EarlyStopping callback from generateNetwork
# watches val_loss on the validation split and may end training sooner.
mdl.fit(train_vectors,train_targets,epochs=200,validation_data=(val_vectors,val_targets),callbacks=callbacks)


In [None]:
# accuracy_score(mdl.predict(test_vectors),test_targets)
# Evaluate on the held-out test split using the loss and metric the model was compiled with.
mdl.evaluate(test_vectors,test_targets)

In [None]:
# Show the labels of the values reported by evaluate().
mdl.metrics_names

In [None]:
# Raw sigmoid outputs of the network for every test row.
res = mdl.predict(test_vectors)

In [None]:
# Side-by-side table of predicted score, true target and the tweet text for the test rows.
# res is flattened to one value per row; target and sentence are the Series produced by the test split.
viewer = pd.DataFrame({'predict':res.flatten(),'target':test_targets,'sentence':test_labels})

In [None]:
# Widen text columns so whole tweets are readable. The full option key is used
# because pandas warns that abbreviated option names may stop resolving when
# new options are added.
pd.set_option('display.max_colwidth', 400)
# Absolute error between predicted score and true label; the 30 largest are the worst predictions.
viewer['diff'] = (viewer['predict'] - viewer['target']).abs()
viewer.sort_values(by='diff').tail(30)

In [None]:
# First 30 rows of the loaded data.
input_data.head(30)

In [None]:
# Last 30 rows of the loaded data.
input_data.tail(30)

In [None]:
#Stolen from Josh Sylvester
# def make_neural_net_no_sentiment():
#     # Create the model
#     model = keras.Sequential([
#         layers.Dense(units=128, activation='relu'),
#         layers.Dense(units=64, activation='relu'),
#         layers.Dense(units=1, activation='sigmoid')
#     ])

#     reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
#                                 patience=4, min_lr=0.000001, verbose=1)
#     early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, min_delta=0.001)

#     callbacks = [early_stop, reduce_lr]
#     optimizer = tf.keras.optimizers.Adam(
#         learning_rate=1e-3,
#     )

#     # Compile the model
#     model.compile(optimizer=optimizer,
#                 loss='binary_crossentropy',
#                 metrics=['accuracy'])

#     return model, callbacks