In [29]:
!pip install gensim



In [30]:
import os
import json
import re
import random

import pandas as pd

from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils

from torch.utils.data import Dataset, DataLoader

from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [31]:
class Vocabulary:
    def __init__(self, min_freq=1):
        self.word2idx = {'<UNK>': 0}  # Add a special token for unknown words
        self.idx2word = {0: '<UNK>'}
        self.min_freq = min_freq

    def build_vocab(self, texts):
        word_counts = Counter(re.findall(r'\w+', ' '.join(texts).lower()))

        idx = 1 # 0 is reserved for '<UNK>'
        for word, count in word_counts.items():
            if count >= self.min_freq:
                self.word2idx[word] = idx
                self.idx2word[idx] = word
                idx += 1

    def encode(self, text):
        # Return word index or 0 if word is not found in vocab
        return [self.word2idx.get(word, 0) for word in re.findall(r'\w+', text.lower())]

    def vocab_size(self):
        return len(self.word2idx)


class FastTextVocabulary(Vocabulary):
    def __init__(self, min_freq=1, ngram_range=(3, 6)):
        super().__init__(min_freq)
        self.ngram_range = ngram_range
        self.ngram2idx = {}
        self.idx2ngram = {}
        self.ngram_count = 1 # as with the old version, we start at 1, and leave 0 for <UNK>

    def _get_ngrams(self, word):
        ngrams = []
        word = f'<{word}>'
        for n in range(self.ngram_range[0], self.ngram_range[1] + 1):
            ngrams.extend([word[i:i+n] for i in range(len(word) - n + 1)])
        return ngrams

    def build_vocab(self, texts):
        super().build_vocab(texts)

        for word in self.word2idx:
            ngrams = self._get_ngrams(word)
            for ngram in ngrams:
                if ngram not in self.ngram2idx:
                    self.ngram2idx[ngram] = self.ngram_count
                    self.idx2ngram[self.ngram_count] = ngram
                    self.ngram_count += 1

    def encode_word(self, word):
        word_idx = self.word2idx.get(word, 0)
        ngram_idxs = [self.ngram2idx.get(ng, 0) for ng in self._get_ngrams(word)]
        return word_idx, ngram_idxs

    def ngram_vocab_size(self):
        return len(self.ngram2idx)

In [32]:
class CustomColeridgeDataset(Dataset):
    def __init__(self, csv_file, json_dir, vocab, window_size=2, n_samples=1000, random_state=42):
        self.train = pd.read_csv(csv_file)
        self.train_items = self.train.sample(n=n_samples, random_state=random_state)
        self.json_dir = json_dir
        self.vocab = vocab
        self.window_size = window_size

    def __len__(self):
        return len(self.train_items)

    def __getitem__(self, idx):
        train_id = self.train_items.iloc[idx]['Id']
        curr_path = os.path.join(self.json_dir, train_id + '.json')

        with open(curr_path, 'r') as file:
            curr_json = json.load(file)

        text = ''.join([cj['text'] for cj in curr_json])
        word_indices = self.vocab.encode(text)

        center_context_pairs = []
        for i, center_word_idx in enumerate(word_indices):
            for j in range(max(0, i - self.window_size), min(len(word_indices), i + self.window_size + 1)):
                if i != j:
                    context_word_idx = word_indices[j]
                    center_context_pairs.append((center_word_idx, context_word_idx))

        return center_context_pairs



class FastTextColeridgeDataset(CustomColeridgeDataset):
    def __getitem__(self, idx):
        train_id = self.train_items.iloc[idx]['Id']
        curr_path = os.path.join(self.json_dir, train_id + '.json')
        with open(curr_path, 'r') as file:
            curr_json = json.load(file)

        text = ''.join([cj['text'] for cj in curr_json])
        word_indices = self.vocab.encode(text)

        center_context_pairs = []
        for i, center_word_idx in enumerate(word_indices):
            center_word, center_ngrams = self.vocab.encode_word(center_word_idx)
            for j in range(max(0, i - self.window_size), min(len(word_indices), i + self.window_size + 1)):
                if i != j:
                    context_word, context_ngrams = self.vocab.encode_word(word_indices[j])
                    center_context_pairs.append((center_word, center_ngrams, context_word, context_ngrams))

        return center_context_pairs


In [33]:
def skipgram_loss(scores, true_labels):
    loss = nn.BCEWithLogitsLoss()(scores, true_labels)
    return loss

In [34]:
class FastTextSkipGramModel(nn.Module):
    def __init__(self, vocab_size, ngram_vocab_size, embedding_dim, num_negative_samples=5):
        super(FastTextSkipGramModel, self).__init__()
        self.word_embeddings = nn.Parameter(torch.randn(vocab_size, embedding_dim) * 0.01)
        self.ngram_embeddings = nn.Parameter(torch.randn(ngram_vocab_size, embedding_dim) * 0.01)
        self.num_negative_samples = num_negative_samples

    def forward(self, center_word_idx, center_ngram_idxs, context_word_idx, context_ngram_idxs, negative_word_idxs):
        # Positive pair embeddings (center and context)
        center_word_embedding = self.word_embeddings[center_word_idx]
        context_word_embedding = self.word_embeddings[context_word_idx]

        center_ngram_embeddings = self.ngram_embeddings.index_select(0, center_ngram_idxs.view(-1))
        center_ngram_embeddings = center_ngram_embeddings.view(center_ngram_idxs.size(0), center_ngram_idxs.size(1), -1)
        center_ngram_embeddings = torch.sum(center_ngram_embeddings, dim=1)

        context_ngram_embeddings = self.ngram_embeddings.index_select(0, context_ngram_idxs.view(-1))
        context_ngram_embeddings = context_ngram_embeddings.view(context_ngram_idxs.size(0), context_ngram_idxs.size(1), -1)
        context_ngram_embeddings = torch.sum(context_ngram_embeddings, dim=1)

        # Combine word and n-gram embeddings for center and context
        center_embedding = center_word_embedding + center_ngram_embeddings
        context_embedding = context_word_embedding + context_ngram_embeddings

        # Compute positive score (dot product between center and context embeddings)
        pos_score = torch.sum(center_embedding * context_embedding, dim=1)

        # Negative sampling
        neg_word_embeddings = self.word_embeddings[negative_word_idxs]  # (batch_size, num_negative_samples, embedding_dim)
        neg_score = torch.bmm(neg_word_embeddings, center_embedding.unsqueeze(2)).squeeze(2)  # (batch_size, num_negative_samples)

        return pos_score, neg_score

# Negative Sampling Loss function
def negative_sampling_loss(pos_score, neg_score):
    # Labels: 1 for positive samples, 0 for negative samples
    pos_labels = torch.ones_like(pos_score)
    neg_labels = torch.zeros_like(neg_score)

    # Use BCEWithLogitsLoss to calculate loss for both positive and negative pairs
    bce_loss = nn.BCEWithLogitsLoss()

    pos_loss = bce_loss(pos_score, pos_labels)
    neg_loss = bce_loss(neg_score, neg_labels)
    return pos_loss + neg_loss


def generate_negative_samples(batch_size, vocab_size, num_negative_samples, true_context_word_idxs):
    negative_samples = []
    for i in range(batch_size):
        neg_words = []
        while len(neg_words) < num_negative_samples:
            neg_sample = random.randint(0, vocab_size - 1)
            # Make sure the negative sample is not the true context word
            if neg_sample != true_context_word_idxs[i]:
                neg_words.append(neg_sample)
        negative_samples.append(neg_words)

    return torch.tensor(negative_samples, dtype=torch.long)

In [35]:
vocab = FastTextVocabulary(min_freq=5)

train = pd.read_csv('/content/drive/My Drive/Datasets/Coleridge/datasets/train.csv')
train_items = train.sample(n=100, random_state=42)
texts = []

for i in range(len(train_items)):
    curr_path = os.path.join(
        os.getcwd(),
        'drive',
        'My Drive',
        'Datasets',
        'Coleridge',
        'datasets',
        'train',
        train_items.iloc[i]['Id'] + '.json')
    with open(curr_path, 'r') as file:
        curr_json = json.load(file)
        texts.append(''.join([cj['text'] for cj in curr_json]))

vocab.build_vocab(texts)

dataset = FastTextColeridgeDataset(csv_file='/content/drive/My Drive/Datasets/Coleridge/datasets/train.csv', json_dir='/content/drive/My Drive/Datasets/Coleridge/datasets/train/', vocab=vocab, n_samples=100)

dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda x: [item for sublist in x for item in sublist])

In [36]:
# Shared hyperparameters between my FastText and Gensim's FastText
vocab_size = vocab.vocab_size()
ngram_vocab_size = vocab.ngram_vocab_size()
embedding_dim = 100
num_negative_samples = 5
model = FastTextSkipGramModel(vocab_size, ngram_vocab_size, embedding_dim)

optimizer = optim.Adam(model.parameters(), lr=0.01)
import torch.nn.utils.rnn as rnn_utils
num_epochs = 10

for epoch in range(num_epochs):
    for batch in dataloader:
        center_word_idxs, center_ngram_idxs, context_word_idxs, context_ngram_idxs = zip(*batch)

        # Convert word indices to tensors
        center_word_idxs = torch.tensor(center_word_idxs, dtype=torch.long)
        context_word_idxs = torch.tensor(context_word_idxs, dtype=torch.long)

        # Convert n-gram indices to tensors and pad them
        center_ngram_idxs = [torch.tensor(ngrams, dtype=torch.long) for ngrams in center_ngram_idxs]
        context_ngram_idxs = [torch.tensor(ngrams, dtype=torch.long) for ngrams in context_ngram_idxs]

        center_ngram_idxs_padded = rnn_utils.pad_sequence(center_ngram_idxs, batch_first=True, padding_value=0)
        context_ngram_idxs_padded = rnn_utils.pad_sequence(context_ngram_idxs, batch_first=True, padding_value=0)

        # Generate negative samples for the current batch
        negative_word_idxs = generate_negative_samples(center_word_idxs.size(0), vocab_size, num_negative_samples, context_word_idxs)

        optimizer.zero_grad()
        pos_score, neg_score = model(center_word_idxs, center_ngram_idxs_padded, context_word_idxs, context_ngram_idxs_padded, negative_word_idxs)

        loss = negative_sampling_loss(pos_score, neg_score)

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

Epoch 1/10, Loss: 0.3117831349372864
Epoch 2/10, Loss: 0.013311322778463364
Epoch 3/10, Loss: 0.0003396491811145097
Epoch 4/10, Loss: 1.6996295016724616e-05
Epoch 5/10, Loss: 1.747156147757778e-06
Epoch 6/10, Loss: 3.728202813135795e-08
Epoch 7/10, Loss: 3.781463586705058e-09
Epoch 8/10, Loss: 2.821137767838877e-10
Epoch 9/10, Loss: 2.637541851591152e-10
Epoch 10/10, Loss: 0.0


In [37]:
import torch
import torch.nn.functional as F


def cosine_similarity(embedding_1, embedding_2):
    embedding_1_norm = F.normalize(embedding_1, dim=-1)
    embedding_2_norm = F.normalize(embedding_2, dim=-1)
    similarity = torch.sum(embedding_1_norm * embedding_2_norm, dim=-1)
    return similarity

# Function to find the most similar words to a target word and print their vectors
def most_similar_words(model, vocab, target_word, top_n=5):
    if target_word not in vocab.word2idx:
        print(f"'{target_word}' not found in the vocabulary.")
        return

    target_word_idx = vocab.word2idx[target_word]

    target_embedding = model.word_embeddings[target_word_idx].detach()

    similarities = {}

    # Compare with all words in the vocabulary
    for word, idx in vocab.word2idx.items():
        if word == target_word:
            # Skip the target word itself
            continue
        word_embedding = model.word_embeddings[idx].detach()
        similarity = cosine_similarity(target_embedding, word_embedding).item()
        similarities[word] = (similarity, word_embedding.cpu().numpy())

    # Sort the words by similarity score in descending order
    sorted_words = sorted(similarities.items(), key=lambda x: x[1][0], reverse=True)

    # Print the top N most similar words and their embeddings
    print(f"Most similar words to '{target_word}':")
    for word, (score, vector) in sorted_words[:top_n]:
        print(f"{word}: {score:.4f}")
        print(f"Vector: {vector}")


# Sample
target_words = ['classroom', 'literature', 'naep', 'stratified', 'metacognitive']
for target_word in target_words:
  most_similar_words(model, vocab, target_word, top_n=5)

Most similar words to 'classroom':
profile: 0.9970
Vector: [ 0.14292583  0.15032695 -0.15691712 -0.15461126  0.13417019  0.15974529
  0.14951143 -0.15123655 -0.13756464  0.12680325  0.15100892  0.15403791
 -0.13997401 -0.14505512 -0.12443852 -0.13886957  0.15084319  0.15297073
  0.14030854  0.12267955  0.14129247 -0.13995872 -0.16030414 -0.15222405
  0.1283772   0.1391642  -0.14511241 -0.15249312  0.14651631 -0.13311875
  0.14693607  0.14582665 -0.16442762 -0.1405723   0.13802546  0.13265571
 -0.13758187 -0.1451931  -0.134959   -0.15460187  0.13120957 -0.13611846
  0.14173448  0.14498964  0.14205582 -0.14050551 -0.13747962  0.14400977
 -0.14000544  0.14871751 -0.15788633  0.13006899 -0.14627518  0.12838565
  0.15252477 -0.14287914 -0.13961609 -0.14753674  0.13676248  0.14092992
 -0.13796018 -0.13948952  0.15611134 -0.13442467  0.16385546 -0.14241356
  0.14352903  0.15616445  0.1501236   0.14458868  0.13711983  0.14381458
 -0.14136626  0.1428076   0.13763796 -0.14122877 -0.14148627  0.1

In [38]:
from gensim.models import FastText
import json
import os
import re


def preprocess_dataset(texts):
    sentences = []

    for t in texts:
        tokens = re.findall(r'\w+', t.lower())
        sentences.append(tokens)

    return sentences

sentences = preprocess_dataset(texts)


gensim_fasttext_model = FastText(
    sentences,  # Note: Needs to have each sentence tokenized
    vector_size=100,
    window=5, # just window_size * 2 + 1
    min_count=5,
    sg=1,  # SkipGram model enabled
    negative=5,  # Enabled negative sampling
    epochs=10
)

# Todo: Save the model for future use
# gensim_fasttext_model.save('gensim_fasttext.model')

# Test the Gensim FastText model
word = 'research'

if word in gensim_fasttext_model.wv:
    print(f"Vector for the word '{word}':\n{gensim_fasttext_model.wv[word]}")
    similar_words = gensim_fasttext_model.wv.most_similar(word, topn=5)
    print(f"Most similar words to '{word}': {similar_words}")
else:
    print(f"The word '{word}' is not in the vocabulary.")

Vector for the word 'research':
[-0.254942    0.48115784 -0.23892953  0.5244435   0.447271   -0.03902753
  0.2406784  -0.13899249 -0.56561327  0.27930334 -0.00895796 -0.48417687
  0.07516677 -0.03137812 -0.7787445  -0.07024724  0.10193195  0.17443936
 -0.1721264  -0.9180712  -0.5725755   0.1404766   0.3975858  -0.3881248
  0.08583617 -0.36467636 -0.4811044  -0.31734368 -0.5868701   0.4294628
  0.42036515 -0.02370394  0.42574218 -0.02363656 -0.099308   -0.461011
 -0.28177336 -0.5962125   0.23264235 -0.08943418  0.01158202 -0.17402004
 -0.21714856  0.77285016  0.7557269  -0.03600815 -0.12696071 -0.5417225
 -0.4958403   0.10799832  0.24886343 -0.20708002 -0.6195423  -0.01297011
  0.01055861  0.20285103 -0.18965343 -0.40746862 -0.12984957 -0.24093759
 -0.5056721  -0.34879526 -0.316691   -0.04439788  0.04378431 -0.34858185
  0.40497744 -0.334439    0.02527865 -0.00145206  0.14821291 -0.10891996
 -0.16482162 -0.15793417  0.4022524  -0.45172673 -0.07741972 -0.00856412
 -0.36461326 -0.2801441 

In [26]:
# Test the Gensim FastText model
for word in target_words:
  if word in gensim_fasttext_model.wv:
      print(f"Vector for the word '{word}':\n{gensim_fasttext_model.wv[word]}")
      # Find similar words
      similar_words = gensim_fasttext_model.wv.most_similar(word, topn=5)
      print(f"Most similar words to '{word}': {similar_words}")
  else:
      print(f"The word '{word}' is not in the vocabulary.")

Vector for the word 'classroom':
[ 1.14927493e-01  2.18832284e-01  1.93810537e-01  1.45904720e-01
  1.34278953e-01  3.96253824e-01  2.32477620e-01  1.42275214e-01
  5.15785992e-01 -4.86749291e-01  5.34317553e-01 -3.72785151e-01
  5.74113859e-04  5.16398191e-01  1.47810616e-02 -3.47102016e-01
  1.57027408e-01  3.50618094e-01  4.65778857e-02 -2.12632611e-01
 -3.11619073e-01 -6.59646243e-02 -2.10932083e-03  3.61599773e-02
 -2.23949421e-02 -4.31758463e-01 -7.91100442e-01 -3.62352699e-01
 -2.33461082e-01  2.38631651e-01 -2.85542548e-01  1.39363602e-01
 -1.63172230e-01 -1.12537637e-01 -1.97286569e-02 -7.31421486e-02
  3.11175078e-01  2.91510642e-01 -1.63508698e-01  2.20226705e-01
  4.77128357e-01 -1.10348962e-01 -5.34209967e-01  2.65392303e-01
  1.86157092e-01 -1.07151173e-01 -7.06425458e-02 -4.72375870e-01
  8.03169888e-03 -1.61499660e-02  4.61328685e-01 -3.20486486e-01
  8.35429728e-02 -2.75545239e-01 -2.07125753e-01  1.41580507e-01
 -2.91618466e-01 -3.77678901e-01  3.09428990e-01 -1.21381

In [43]:
# Function to get embeddings from both models and compare them
def compare_embeddings(paragraph, model, vocab, gensim_model):
    # Tokenize the paragraph
    tokens = re.findall(r'\w+', paragraph.lower())

    # Store similarities and embeddings for comparison
    results = []

    for word in tokens:
        if word not in vocab.word2idx:
            print(f"'{word}' not found in your FastText vocabulary, skipping...")
            continue
        if word not in gensim_model.wv:
            print(f"'{word}' not found in Gensim FastText vocabulary, skipping...")
            continue

        # Get word embedding from your FastText model
        word_idx = vocab.word2idx[word]
        your_embedding = model.word_embeddings[word_idx].detach()

        # Get word embedding from Gensim's FastText model
        gensim_embedding = torch.tensor(gensim_model.wv[word])

        similarity = cosine_similarity(your_embedding, gensim_embedding).item()
        results.append({
            'word': word,
            'your_embedding': your_embedding.cpu().numpy(),
            'gensim_embedding': gensim_embedding.cpu().numpy(),
            'similarity': similarity
        })

    for result in results:
        print(f"Word: {result['word']}")
        print(f"Your Embedding: {result['your_embedding']}")
        print(f"Gensim Embedding: {result['gensim_embedding']}")
        print(f"Cosine Similarity: {result['similarity']:.4f}")
        print("="*50)

    return results

# This was just arbitrarily chosen
paragraph = """
For the final version of the models, some of the multiple indicator constructs were turned into single indicator constructs to identify which indicator was responsible for the classroom effect of a construct. Thus, integrating reading and writing and reading materials were divided into their constituent indicators. Student background, school characteristics, and student outcomes were measured as per the teacher background models. The path portion of the classroom effects model related the classroom practice constructs to the student outcome, taking into account student background and school characteristics, as well as relating the student background and school characteristics to each of the classroom practices, thus making it possible to gauge the extent to which classroom practices acted as intervening variables between student background, school characteristics and student outcomes. (Note 2)The factor models and goodness-of-fit statistics reveal that the models fit the data well. For all factor models, the constructs loaded substantially and on all of the corresponding indicators, and all loadings were statistically significant at the .05 level. (Factor models are not presented here, but are available upon request.). All ten of the factor and path models also had adequate goodness-of-fit statistics. For the teacher background models, the RMSEAs were at the .03 level, with normed goodness-of-fit indices of .92 and comparative goodness-of-fit indices at .92 and .93, depending upon the plausible value. For the classroom practice models, the RMSEAs were at the .05 level, with both normed and comparative goodness-of-fit indices at .98. These results suggest that the hypothesized models were confirmed by the observed data.
The path models for teacher background reveal only a modest effect of teaching on student reading comprehension ( Table 3 ). The strongest effects come from students, with SES having the largest effect in the model (b=.37) followed by reading background (b =.14). The school control, class size, also had an effect, albeit a modest one (b =.03). Among the five teacher background variables, only one, years of experience, proved statistically significant, with an unstandardized coefficient of .05. This findings differs somewhat from the literature, in which teacher major tend tends to have an effect and teacher experience tends not to have one. This divergence may be attributable to the fact that this study is of fourth graders and their elementary school teachers, whereas most of the studies of teacher major are at the high school level. (Note 3) The classroom practice path models reveal much more substantial teacher effects (Table 4) . As with the teacher background model, the strongest single effect is of SES (b=.43). This is followed, however, by two teacher effects, the positive effect of metacognitive skill instruction (b= .31) and the negative effect of time spent reading in class (b=.30). Student reading background is next in importance, with students with stronger backgrounds scoring higher on the reading comprehension assessment (b=.13). Teachers' having students write about literature they are reading and using trade books as their primary reading materials had modest positive effects (b=.04 for each). Class size, as in the teacher background model, also had a statistically significant effect of that size. In addition, the classroom practice path models indicate that students are exposed to very different practices depending upon their background characteristics and those of their schools. Affluent students are more likely to be exposed to metacognitive instruction (b=.04), writing about literature (b=.07) and reading trade books (b=.07) than their less affluent peers. There is, however, no difference in time spent reading in class or the use of basal readers between the two groups. Schools with smaller classes also differ from those with larger classes, with small class students more likely to be exposed to metacognitive instruction (b =.03) and writing about literature (b =.06) as well as to spend more time reading in class (b=.05). It thus appears that effective classroom practices act as intervening variables between student SES and reading comprehension performance, with higher SES students more likely to be exposed to those practices that are themselves associated with higher NAEP scores. With class size, the pattern is less clear, as the practices associated with smaller class sizes may or may not have a positive relationship to NAEP scores.
These findings answer the first research question in the negative and the second in the affirmative. The large-scale data do seem to confirm some of the findings from small-scale research but not others. Some practices, namely metacognition, using trade books and a measure of integrating reading and writing, did prove positively related to reading comprehension. Other practices, however, such as having students work in groups, increasing parental involvement, and the use of authentic assessment, did not. And time spent reading in class actually had a negative relationship to student performance. The addition of classroom practices to large-scale models seems to make the overall impact of teachers comparable to that of student background. As with typical production functions, the teacher background model revealed only a single modest teacher effect. The classroom practice model, however, revealed multiple teacher effects, some of them quite strong. The total standardized effect for the four teacher variables (.70) is actually somewhat larger than the total standard effect of the two student background measures (.56).These findings have significant methodological implications for research on teacher effects on reading comprehension. The finding that some of the classroom practices proved effective while others did not suggest the need for synergy between small-scale and large-scale research. The findings of small-scale, highly internally valid, studies should serve as the basis for large-scale, highly externally valid, studies. Only in this way can it be known if small-scale findings are applicable to large populations. (This does not rule out the possibility that small-scale research can, by itself, provide information about small populations.) The finding that the introduction of classroom practices leads to substantial teacher effects suggests the need for large-scale research to embrace such variables. Clearly, the failure of previous large-scale research to uncover substantial teacher effects is in large part due to its not including such variables. In addition, the other methodological advances of the current study over traditional production functions proved useful. The use of multiple indicators improved the quality of the measures employed, and the use of path models led to the finding that classroom practices act as intervening variables between student background and reading comprehension performance.
Yet while the current exploratory study does take some steps to improve the large-scale methodology for the study of teacher effects, much remains to be done. One shortcoming of the current study is the ad hoc manner in which it addressed problems with teacher self-reports. Because it relied on pre-existing data, the study made use of interaction effects to increase the likelihood that teachers reporting the use of certain practices were actually using them. Doing so, however, truncated the sample, and is based on the assumption that the more experienced, better prepared teachers are more likely to accurately assess and report what their practices are. This assumption may or may not hold true for a given teacher. A more effective technique for reducing problems with teacher self-reports would be to begin to design questionnaire items that make clearer what the practices are and minimize the social desirability effects. For instance, a questionnaire might include a scenario in a classroom and ask the respondent to describe how he or she would address it. Respondents could also be asked to rank order the effectiveness of the classroom practices of others, or to draw up a time budget for various practices. Such methods, often employed in small-scale research, need to be applied on a larger scale to enhance the reliability and validity of the large-sample teacher reports.
Another shortcoming of the study is its use of cross-sectional data. Because the data are cross-sectional, it is not clear whether particular practices enhance reading comprehension or high performing students are more likely to have teachers engaging in such practices. The study did address this problem in an ad hoc fashion by controlling for measures of student home reading behavior. Indeed, those controls may have resulted in underestimates of teacher effects, in that teachers may positively influence home reading behavior. Whatever the impact of the ad hoc procedure, it is no substitute for longitudinal data that follow student performance over time, and hence it is crucial for subsequent large-scale studies to collect such data. Indeed, the Early Childhood Longitudinal Study (ECLS), which will follow a national sample of students from kindergarten through fifth grade, testing their reading skills and measuring teacher classroom practices, may address this need.
A third shortcoming of this study is its failure to fully take into account the multilevel nature of its data. This study involved multiple levels of analysis in that it related teacher-level inputs to student-level outputs. Yet students are not selected at random, but are clustered within classrooms. The employment of design effects addressed this issue somewhat by increasing standard errors based upon clustering at the level for the school district. But it did not take into account the impact of classroom level non-independence on standard errors. It also did not distinguish between student-level and contextual effects; the influence of student SES, for instance, may be in part due to the average SES of that student's peers.
To fully address all of these issues, multilevel techniques, such as Hierarchical Linear Modeling or multilevel versions of SEM (MSEMs) need to be employed.
"""

results = compare_embeddings(paragraph, model, vocab, gensim_fasttext_model)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  0.05331811 -0.63805944  0.32856107 -0.09945691]
Cosine Similarity: -0.0215
Word: nature
Your Embedding: [ 0.1365809   0.1545667  -0.13112585 -0.15881632  0.1488428   0.12867482
  0.15622972 -0.14107412 -0.14916325  0.14127125  0.15188739  0.14365034
 -0.13179596 -0.12854238 -0.14236216 -0.1406625   0.13897502  0.14344049
  0.14197706  0.13290933  0.15632619 -0.13548897 -0.13977878 -0.15629365
  0.13447392  0.13585685 -0.13337293 -0.14052808  0.15876523 -0.14655009
  0.14278805  0.14231479 -0.15813313 -0.15694655  0.15523593  0.1454945
 -0.13826193 -0.14448476 -0.14348853 -0.15836994  0.14539069 -0.12763403
  0.14428538  0.1524172   0.12905554 -0.14007115 -0.14465335  0.14359625
 -0.14977793  0.1533882  -0.13846754  0.1372867  -0.13129736  0.13685043
  0.14723378 -0.1414178  -0.15504704 -0.14434811  0.13849072  0.1513197
 -0.15901276 -0.16453896  0.14307904 -0.14676765  0.15424591 -0.13396479
  0.13667892  0.14478481  0.

In [46]:
print('Maximum similarity', max(res['similarity'] for res in results))
print('Minimum similarity', min(res['similarity'] for res in results))

Maximum similarity 0.26352447271347046
Minimum similarity -0.32226940989494324


# Results

Some poignant cosine similarity results from the above:
* nature: 0.0079
* data: 0.0245
* clustered: 0.0455
* employment: -0.1768
* design: 0.0673

Generally, there is low similarity between my method and FastText's method. This might be attributable to the fact that the low values you're seeing (around 0.26 to 0.32) suggest that the embeddings produced by this implementation and Gensim’s are not aligned and capture very different representations.


