### Opdracht 3


## load imports

In [134]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

## load data

In [135]:
with open("wiki.txt", "r") as file:
    wiki_text = [line.strip() for line in file if len(line.strip().split()) >= 10]

print(f"wiki_text len: {len(wiki_text)}")

with open("stopwoorden.txt", "r") as file:
    stopwoorden = [line.strip() for line in file if line]

wiki_text len: 62


## preprocess sentence

In [136]:
def preprocess_sentence(sentence):
    to_exclude = "/.%-,'\":;()[]0123456789"
    # Verwijder ongewenste tekens
    sentence = "".join([char if char not in to_exclude else " " for char in sentence])
    # Verwijder stopwoorden
    sentence = " ".join([word for word in sentence.split() if word.lower() not in stopwoorden])
    return sentence


In [137]:
corpus  = [preprocess_sentence(sentence) for sentence in wiki_text]
vocab = sorted(set(" ".join(corpus).split()))
print(f"vocab len: {len(vocab)}")
print(corpus)


vocab len: 732
['Kanker medisch Latijn neoplasma malignum kwaadaardig nieuwgevormd weefsel ziekte', 'cellen onbeheerst vermenigvuldigen', 'woekerende cellen omliggend weefsel richten schade invasieve groei infiltratie', 'woekerende cellen verspreiden plaatsen lichaam metastasering uitzaaiing lymfevaten lymfogene metastasering bloed hematogene metastasering verschillende lichaamsholten bijvoorbeeld buikholte', 'Nagenoeg medische specialismen behandeling kanker medisch specialisten oncologie radiotherapie kanker doodsoorzaak', 'Lezing Erasmus kanker Universiteit', 'kanker klassiek Latijnse cancer vanuit Romeinse encyclopedieschrijver Celsus', 'Kanker aandoening gekenmerkt onbeheerste groei abnormaal weefsel neoplasie aanhoudende ongecontroleerde celdeling Gezonde cellen lichaam prolifereren bepaalde organen vernieuwing herstel Tijdens celproliferatie cellen specifieke grootte afhankelijk functie proces celdifferentiatie deling differentiatie invloed extracellulaire cel afkomstige factore

## CBOW pairen maken

In [138]:
def create_pairs(corpus, w_size):
    X = []  # Context_word (input)
    y = []  # Target_word (output)
    
    for sentence in corpus:
        words = sentence.split()  # Verdeel de zin in woorden
        for index, target_word in enumerate(words):
            # Bepaal de context range (links en rechts van target_word)
            start = max(index - w_size, 0)
            end = min(index + w_size + 1, len(words))
            
            # Maak een lijst met alle woorden in het window, behalve target_word zelf
            context_words = [words[i] for i in range(start, end) if i != index]
            
            # Voeg de context en target toe aan de lijsten
            X.append(" ".join(context_words))  # Combineer contextwoorden tot een string
            y.append(target_word)              # Doelwoord
    
    return X, y

In [139]:
# Roep de functie aan met window size 2
X, y = create_pairs(corpus, w_size=2)

# Resultaat tonen
for context, target in zip(X[:5], y[:5]):
    print(f"Context: {context} -> Target: {target}")


Context: medisch Latijn -> Target: Kanker
Context: Kanker Latijn neoplasma -> Target: medisch
Context: Kanker medisch neoplasma malignum -> Target: Latijn
Context: medisch Latijn malignum kwaadaardig -> Target: neoplasma
Context: Latijn neoplasma kwaadaardig nieuwgevormd -> Target: malignum


In [140]:
# Assuming create_pairs function works and provides X (contexts) and y (targets)
# For example, you get these contexts from your create_pairs function
X, y = create_pairs(corpus, w_size=2)  # Run create_pairs function with your corpus

# Initialize CountVectorizer
vectorizer = CountVectorizer(max_features=len(vocab), tokenizer=lambda x: x.split())

# Transform the context sentences (X) into vectors
X_sentences = vectorizer.fit_transform(X).toarray()



In [141]:
# Now let's inspect the first 5 contexts
example_contexts = X[:5]  # First 5 context sentences
example_vectors = vectorizer.transform(example_contexts).toarray()

# Display the vectors and corresponding contexts
for i, (context, vector) in enumerate(zip(example_contexts, example_vectors)):
    print(f"Context {i+1}: '{context}'")
    print(f"Vector: {vector}")
    print()

# Display word-to-index mapping from the vectorizer
vocab_dict = vectorizer.vocabulary_
print("Word-to-index mapping:", vocab_dict)


Context 1: 'medisch Latijn'
Vector: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

### training 


In [145]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_sentences, y, test_size=0.3, random_state=42)

# Create the MLPClassifier model with 100 hidden nodes in one hidden layer
model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

### model evaluation

In [146]:
# Evaluate the model (you can also use other metrics like precision, recall, etc.)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.0351
