In [None]:
# Mount Google Drive at /content/drive so the dataset files stored there can be
# read by later cells. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory on the mounted Drive that holds the dataset files.
# Keep the trailing slash: file names are appended directly to this string
# when the data is loaded.
dataset_path = '/content/drive/My Drive/Difficult Words Data/DHH Adult Data/'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Read the complete dataset from the Excel workbook into a DataFrame.
# NOTE(review): pandas' default NA parsing may turn cell text such as "null" or
# "nan" into missing values -- confirm the 'word' column has no unexpected NaNs.
data = pd.read_excel(f'{dataset_path}complete_adult_data2.xlsx')

In [None]:
# Preview a small random sample of rows to confirm the file loaded correctly.
# A fixed random_state keeps the preview identical across re-runs, and the
# min() guard avoids a ValueError when the frame has fewer than 10 rows.
sampled_data = data.sample(n=min(10, len(data)), random_state=42)
print(sampled_data)

            word  length  syllables  senses  synonyms  hyponyms  hypernyms  \
1074       awake       5          2       3         8         0          1   
2689     college       7          2       3         0         3          3   
2729      comedy       6          3       2         3        10          2   
7716       laura       5          2       0         0         0          0   
1579     boiling       7          2       8         7         9          8   
10203       plot       4          1       8         7        11          8   
14886       whip       4          1      11        16        17         11   
955    attempted       9          3       3         7         8          2   
1575        body       4          2      12        10        52         10   
5060   extracted       9          3       8        14         5          8   

       subtitles  simple  subtlex  average  overall_label  
1074       -14.2   -17.1    -14.0   1.8182              0  
2689       -12.5   -1

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Split data into input (X: the word text) and output (y: the 'average' score)
X = data['word'].astype(str)
y = data['average'].astype(float)

# Character-level tokenization.
# Every sample is a single word, so a word-level tokenizer would encode each
# sample as a one-token sequence: the embedding of a held-out word would never
# be updated during training and its prediction would come from untrained
# weights. Encoding characters instead lets the model learn spelling patterns
# that carry over to words it has not seen.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

# Pad (or truncate) every word to a fixed number of characters.
# Index 0 is the padding value; padding/truncation is applied at the end.
max_seq_length = 20  # Adjust as needed
X_padded = pad_sequences(X_seq, maxlen=max_seq_length, padding='post', truncating='post')

# Hold out 20% of the words as a test set (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Define the model
model = Sequential()

# Add an Embedding layer that maps each token index to a dense vector
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
embedding_dim = 50  # Adjust as needed
# mask_zero=True marks the padded (index 0) timesteps so the recurrent layer
# skips them; without it the RNN keeps stepping over the trailing padding and
# its final state is dominated by padding rather than by the actual tokens.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    input_length=max_seq_length, mask_zero=True))

# Add a SimpleRNN layer that reduces the sequence to a single state vector
rnn_units = 32  # Adjust as needed
model.add(SimpleRNN(units=rnn_units, activation='relu'))

# Add an output layer
model.add(Dense(units=1, activation='linear'))  # Linear activation for regression

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Summary of the model architecture
model.summary()

# Train the model; 20% of the training split is held back for validation.
# The History object is kept so the loss curves can be inspected afterwards.
batch_size = 64
epochs = 10
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 20, 50)            759050    
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                2656      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 761739 (2.91 MB)
Trainable params: 761739 (2.91 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x789acf481240>

In [None]:
from sklearn.metrics import mean_absolute_error

# Score the held-out test split: predict, then report the mean absolute error.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(mae)

0.5085964440291735
