## DSC650-T301 Big Data (2235-1)
## 5/16/2023
## Joshua Greenert

## 10.1.a

In [1]:
# Create a tokenize function that splits a sentence into words. Ensure that your tokenizer removes basic punctuation.
import string

testString = "the ! fat cat sat on the stoop, and then went up."

def tokenize(sentence):
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    tokens = sentence.split()
    
    return tokens

In [2]:
tokenize(testString)

['the', 'fat', 'cat', 'sat', 'on', 'the', 'stoop', 'and', 'then', 'went', 'up']

## 10.1.b

In [3]:
# Implement an `ngram` function that splits tokens into N-grams. 
def ngram(tokens, n):
    ngrams = []
    # Create n-grams using a sliding window approach
    for i in range(len(tokens) - n + 1):
        ngram = tokens[i:i+n]
        ngrams.append(ngram)
    return ngrams

## 10.1.c

In [4]:
# Implement an one_hot_encode function to create a vector from a numerical vector from a list of tokens.
def one_hot_encode(tokens, num_words):
    token_index = {}
    results = []
    
    # Assign unique index to each unique token
    for token in tokens:
        if token not in token_index:
            token_index[token] = len(token_index) + 1
    
    # Create the one-hot encoded vector
    for token in tokens:
        if token in token_index:
            encoding = [0] * num_words
            token_idx = token_index[token]
            if token_idx <= num_words:
                encoding[token_idx - 1] = 1
            results.append(encoding)
    
    return results

## 10.2

In [5]:
# Using listings 6.16, 6.17, and 6.18 in Deep Learning with Python as a guide, 
# train a sequential model with embeddings on the IMDB data found in data/external/imdb/. 
# Produce the model performance metrics and training and validation accuracy curves within the Jupyter notebook
import os
import pandas as pd

folder_path_pos = '../../../data/external/imdb/aclImdb/train/pos'
folder_path_neg = '../../../data/external/imdb/aclImdb/train/neg'

# Create an empty list to store the dataset
dataset = []
labels = []

# Iterate through the positive files
for filename in os.listdir(folder_path_pos):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path_pos, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            dataset.append(content)
            labels.append(1) 

# Iterate through the negative files
for filename in os.listdir(folder_path_neg):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path_neg, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            dataset.append(content)
            labels.append(0)
            
# Create a DataFrame from the dataset
df_train = pd.DataFrame({'Text': dataset, 'Label': labels})

df_train.head(5)

Unnamed: 0,Text,Label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [6]:
# Perform the same operation with the test data
folder_path_pos = '../../../data/external/imdb/aclImdb/test/pos'
folder_path_neg = '../../../data/external/imdb/aclImdb/test/neg'

# Create an empty list to store the dataset
dataset = []
labels = []

# Iterate through the positive files
for filename in os.listdir(folder_path_pos):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path_pos, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            dataset.append(content)
            labels.append(1) 

# Iterate through the negative files
for filename in os.listdir(folder_path_neg):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path_neg, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            dataset.append(content)
            labels.append(0)
            
# Create a DataFrame from the dataset
df_test = pd.DataFrame({'Text': dataset, 'Label': labels})

df_test.head(5)

Unnamed: 0,Text,Label
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [7]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['Text'])
sequences = tokenizer.texts_to_sequences(df_train['Text'])

# Pad sequences to ensure consistent length
max_seq_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, df_train['Label'], test_size=0.2, random_state=42
)

# Create the sequential model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32, input_length=max_seq_length))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# Convert labels to integers
df_test['Label'] = df_test['Label'].astype(int)

# Tokenize the text data
test_sequences = []
for text in df_test['Text']:
    tokens = word_tokenize(text)
    test_sequences.append(tokens)

# Pad sequences to ensure consistent length
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_seq_length)

# Evaluate the model on the test data
test_loss, test_acc = model.evaluate(test_padded_sequences, df_test['Label'])

# Print the test accuracy
print("Model Accuracy:", test_acc)

ValueError: invalid literal for int() with base 10: 'I'

In [None]:
import matplotlib.pyplot as plt

# Creating training and validation loss and accuracy curves
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

## 10.3

In [None]:
# Using listing 6.27 in Deep Learning with Python as a guide, fit the same data with an LSTM layer. 
# Produce the model performance metrics and training and validation accuracy curves within the Jupyter notebook.


## 10.4

In [None]:
# Using listing 6.46 in Deep Learning with Python as a guide, fit the same data with a simple 1D convnet. 
# Produce the model performance metrics and training and validation accuracy curves within the Jupyter notebook.
