In [4]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



import os

import pandas as pd
import numpy as np

from tqdm import tqdm


import sentencepiece

import seaborn as sns
from pylab import *
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib import rc

from sklearn.metrics import confusion_matrix, classification_report
%matplotlib inline
%config InlineBackend.figure_format='retina'

from tensorflow.keras.models import model_from_yaml


# One shared seed value, applied to NumPy's and TensorFlow's global
# random generators.
RANDOM_SEED = 42

for _seed_fn in (np.random.seed, tf.random.set_seed):
    _seed_fn(RANDOM_SEED)

In [9]:
# Rebuild the trained network from disk: the architecture comes from
# model.json and the learned weights from model.h5.
from tensorflow.keras.models import model_from_json

# Read the serialized architecture; the context manager guarantees the file
# handle is closed even if reading fails.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Load the weights into the freshly created model.
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

Loaded model from disk


In [10]:
# Configure the reloaded model with a loss, optimizer and metric.
# This cell only compiles; it does not evaluate anything.
# Settings copied from the first notebook:
#   model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
loaded_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [11]:
# Read the dataset from the CSV file in the working directory.
dataset_csv = 'rand_dataset.csv'
rand_dataset = pd.read_csv(dataset_csv)

In [13]:
# Remove the 'Unnamed: 0' column. Using drop(..., errors='ignore') instead of
# `del` keeps this cell idempotent: re-running it after the column is already
# gone no longer raises KeyError.
rand_dataset = rand_dataset.drop(columns=['Unnamed: 0'], errors='ignore')

In [42]:
# Display summary statistics of the dataset as this cell's output.
rand_dataset.describe()

Unnamed: 0,Ideology
count,86.0
mean,0.5
std,0.502933
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [19]:
# Settings shared by the tokenization, padding and split cells below.
vocab_size = 10000      # passed to Tokenizer as num_words
embedding_dim = 32      # not referenced in the visible cells; presumably mirrors the trained model -- TODO confirm
max_length = 9120       # passed to pad_sequences as maxlen
trunc_type = padding_type = 'post'   # truncate and pad at the end of each sequence
oov_tok = "<OOV>"       # passed to Tokenizer as oov_token
training_size = 70      # number of leading rows used as the training split

In [20]:
# Pull the first two columns out of the frame by position: column 0 goes to
# `speeches`, column 1 to `ideology`.
dataset_rows = rand_dataset.values
speeches = [row[0] for row in dataset_rows]
ideology = [row[1] for row in dataset_rows]

# Ordered split: the first `training_size` rows train, the remainder test.
training_size = 70
training_speeches, testing_speeches = speeches[:training_size], speeches[training_size:]
training_ideologies, testing_ideologies = ideology[:training_size], ideology[training_size:]

In [24]:
# Build the vocabulary from the training speeches only.
# NOTE(review): the tokenizer is fitted here rather than restored from the
# notebook that trained the model; confirm that it yields the same word ids
# the saved weights were trained on.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_speeches)
word_index = tokenizer.word_index

# Same padding/truncation settings for both splits.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(training_speeches)
testing_sequences = tokenizer.texts_to_sequences(testing_speeches)

training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)



# Run the loaded model on every training speech, one sample at a time, and
# print the true label followed by the raw model output.
training_predictions = []

for padded_row, true_label in zip(training_padded, training_ideologies):
    # Reuse the row that was already tokenized and padded above instead of
    # re-tokenizing the speech; np.newaxis restores the leading batch axis so
    # predict() still receives a single-sample batch.
    padded = padded_row[np.newaxis, :]
    # Predict once and reuse the result for both storing and printing
    # (the model was previously invoked twice per sample).
    prediction = loaded_model.predict(padded)
    training_predictions.append(prediction)
    print(true_label)
    print(prediction)
    print("\n")

0
[[0.46674457]]


1
[[0.52235556]]


0
[[0.46737805]]


1
[[0.4890468]]


1
[[0.5153033]]


0
[[0.47180894]]


1
[[0.50166976]]


1
[[0.49034956]]


1
[[0.49778813]]


0
[[0.47333908]]


0
[[0.46786433]]


0
[[0.46954525]]


0
[[0.46614182]]


0
[[0.46076554]]


0
[[0.46753195]]


0
[[0.4761136]]


1
[[0.523831]]


0
[[0.46526924]]


0
[[0.465584]]


0
[[0.49178466]]


1
[[0.5086514]]


1
[[0.51066184]]


1
[[0.50956196]]


1
[[0.49425682]]


1
[[0.47956464]]


0
[[0.4655992]]


1
[[0.5256023]]


1
[[0.5235768]]


1
[[0.5113399]]


1
[[0.5230159]]


0
[[0.5090483]]


0
[[0.5077477]]


1
[[0.47758213]]


1
[[0.52385277]]


1
[[0.52210534]]


1
[[0.514751]]


0
[[0.4664477]]


0
[[0.5010162]]


1
[[0.46918887]]


0
[[0.49546155]]


1
[[0.49465472]]


0
[[0.47760347]]


1
[[0.5247227]]


1
[[0.5172978]]


1
[[0.48348343]]


1
[[0.4918564]]


1
[[0.5026001]]


1
[[0.51603854]]


0
[[0.48939362]]


1
[[0.52397275]]


0
[[0.47980157]]


1
[[0.47929353]]


0
[[0.47204566]]


0
[[0.4663048]]


In [39]:
# Unwrap each stored prediction array (indexed [0][0]) into a flat list of
# scalar values, one per training sample.
training_predictions_list = [
    training_predictions[idx][0][0] for idx in range(training_size)
]
#rand_dataset['predictions']= training_predictions

In [43]:
# Run the loaded model on every held-out speech. Iterating over
# testing_padded (already tokenized and padded above) replaces the hard-coded
# count of 16, so the loop follows the actual size of the test split.
testing_predictions = []

for padded_row in testing_padded:
    # np.newaxis restores the leading batch axis for a single-sample batch.
    padded = padded_row[np.newaxis, :]
    testing_predictions.append(loaded_model.predict(padded))

# Unwrap each prediction array (indexed [0][0]) into a flat list of scalars.
testing_predictions_list = [prediction[0][0] for prediction in testing_predictions]

In [44]:


# Convert the Python lists into NumPy arrays for scikit-learn.
# (numpy is already imported at the top of the notebook; the import is kept
# here so this cell behaves exactly as before.)
import numpy as np

# Training split: true labels and the network's scalar outputs.
labels = np.array(training_ideologies)
features = np.array(training_predictions_list)

# Held-out split, same layout.
test_features = np.array(testing_predictions_list)
test_labels = np.array(testing_ideologies)

In [51]:
# Fit a shallow random forest that maps the network's scalar output to a
# class label, then report how often it is wrong on the held-out split.
from sklearn.ensemble import RandomForestClassifier

# scikit-learn expects a 2-D (n_samples, n_features) matrix; each sample has
# a single feature here, so reshape the 1-D arrays into one column.
features = features.reshape(-1, 1)
test_features = test_features.reshape(-1, 1)

# Use the notebook-wide seed rather than repeating the literal 42.
clf = RandomForestClassifier(max_depth=2, random_state=RANDOM_SEED)
clf.fit(features, labels)

predictions = clf.predict(test_features)

# The displayed labels are 0/1, so |prediction - label| is 1 for a miss and 0
# for a hit; its mean is the fraction of misclassified test samples.
errors = np.abs(predictions - test_labels)

# The previous message reported this as "Mean Absolute Error ... degrees.",
# a unit that has no meaning for class labels.
print('Misclassification rate:', round(np.mean(errors), 2))


Mean Absolute Error: 0.31 degrees.


In [53]:
# Persist the fitted random forest to disk with pickle.
import pickle

filename = 'random_forest_model.sav'
# The context manager flushes and closes the file; the previous inline
# open() left the handle for the garbage collector to close.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)