In [83]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [84]:
!pip install --upgrade  textblob gensim pytorch-nlp swifter



In [85]:
import multiprocessing
import sys

import numpy as np
import random
import os
import pandas as pd
import gensim
import warnings
import nltk
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import re
import warnings
from sklearn.model_selection import train_test_split
from textblob import TextBlob

embedding_dim = 100
epochs=100
batch_size = 50
corpus_size=30000

def set_seeds_and_trace():
  os.environ['PYTHONHASHSEED'] = '0'
  np.random.seed(42)
  random.seed(42)


set_seeds_and_trace()
warnings.filterwarnings('ignore')
nltk.download('punkt')
textblob_tokenizer = lambda x: TextBlob(x).words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [86]:
%%writefile get_data.sh
if [ ! -f news.csv ]; then
  wget -O news.csv https://www.dropbox.com/s/352x7xzivf60zgc/news.csv?dl=0
fi

Overwriting get_data.sh


In [87]:
!bash get_data.sh

In [88]:
path = './news.csv'
news_pre = pd.read_csv(path, header=0).sample(n=corpus_size).reset_index(drop=True)

In [89]:
news_pre

Unnamed: 0,category,title
0,Business,"BBC set for major shake-up, claims newspaper"
1,Business,Marsh averts cash crunch
2,Sports,"Jeter, Yankees Look to Take Control (AP)"
3,Sci/Tech,Flying the Sun to Safety
4,Business,Stocks Seen Flat as Nortel and Oil Weigh
...,...,...
29995,World,Israel #39;s Labour in talks on coalition
29996,World,"Exposed, Japan's Hot Springs Come Clean"
29997,Business,Rajasthan oil takes Cairn into FTSE 100
29998,Sports,"US Open tennis: Tough draw for Agassi, Roddick"


In [90]:
import re
def preprocess_text(text, should_join=True):
    # Use the tokenizer to tokenize into words, lowercase them, remove punctuation, and finally use gensim.utils.simple_preprocess(text)
    text = ' '.join(gensim.utils.tokenize(text, lowercase=True))
    text = re.sub(r"[.,!?]", r" ", text)
    if should_join:
      return ' '.join(gensim.utils.simple_preprocess(text))
    else:
      return gensim.utils.simple_preprocess(text)

In [91]:
preprocess_text('This is the best night of my life! Is it? Well, maybe')

'this is the best night of my life is it well maybe'

In [92]:
import swifter
news = news_pre.title.swifter.apply(preprocess_text)

Pandas Apply:   0%|          | 0/30000 [00:00<?, ?it/s]

In [93]:
news.to_csv('news_pre.csv', header=False, index=False)

In [94]:
!head -n 5 news_pre.csv

bbc set for major shake up claims newspaper
marsh averts cash crunch
jeter yankees look to take control ap
flying the sun to safety
stocks seen flat as nortel and oil weigh


In [95]:

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        corpus_path = 'news_pre.csv'
        for line in open(corpus_path):
            # assume there's one document per line, tokens separated by whitespace
            yield preprocess_text(line, should_join=False)

from gensim.models import Word2Vec
word2vec_model = Word2Vec(sentences=MyCorpus(), vector_size=embedding_dim,min_count=1, workers=multiprocessing.cpu_count())
# Get a word2vec model using gensim.models and passing the sentences using MyCorpus()

In [96]:
wv_model = word2vec_model.wv

In [97]:
weights = torch.Tensor(wv_model.vectors)  # Get the weights of the model (the embedding) and convert to tensor. Hint: Check word2vec_model.wv
vocab_size = len(wv_model.index_to_key)  # get vocab size from index_to_key in word2vec_model.wv

In [98]:
weights.shape

torch.Size([18948, 100])

In [99]:
news_preprocessed = pd.DataFrame()
news_preprocessed['label'] = news_pre.category.map({'Business': 0, 'Sports': 1, 'Sci/Tech': 2, 'World': 3})
news_preprocessed['title'] = news
news_preprocessed

Unnamed: 0,label,title
0,0,bbc set for major shake up claims newspaper
1,0,marsh averts cash crunch
2,1,jeter yankees look to take control ap
3,2,flying the sun to safety
4,0,stocks seen flat as nortel and oil weigh
...,...,...
29995,3,israel labour in talks on coalition
29996,3,exposed japan hot springs come clean
29997,0,rajasthan oil takes cairn into ftse
29998,1,us open tennis tough draw for agassi roddick


In [100]:
def get_maximum_review_length(df):
    maximum = 0
    for ix, row in df.iterrows():
        candidate = len(textblob_tokenizer(row.title))
        if candidate > maximum:
            maximum = candidate
    return maximum


maximum = get_maximum_review_length(news_preprocessed)   # Since 2 titles may have different number of words, we have to find the max length and fill with 0s if a title is shorter

In [101]:
maximum

19

In [102]:
news_preprocessed[:4]

Unnamed: 0,label,title
0,0,bbc set for major shake up claims newspaper
1,0,marsh averts cash crunch
2,1,jeter yankees look to take control ap
3,2,flying the sun to safety


In [103]:
X = np.zeros((len(news_preprocessed), maximum))   # Here we do what we said above
# Iterate through the news df and for every word, if it exists in the word2vec model, put into X for that review and that word the index of the embedding (check index_to_key)
# HINT: to iterate through a column of a pandas dataframe you do:

# for index, value in df.iterrows():
#      #do something

for index, row in news_preprocessed.iterrows():
  word_ix = 0
  for word in textblob_tokenizer(row.title):
    if word in wv_model.key_to_index and word_ix < maximum:  # Check if word is in vocab and within bounds
      X[index, word_ix] = wv_model.key_to_index[word]
    #token = wv_model.key_to_index[word]
    word_ix += 1

# FILL
y = news_preprocessed.label

In [104]:
X[:4]

array([[2.8360e+03, 7.0000e+01, 2.0000e+00, 3.9200e+02, 1.9200e+03,
        1.5000e+01, 1.9000e+02, 3.2540e+03, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [1.1110e+03, 1.0648e+04, 7.3300e+02, 3.8290e+03, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [4.6480e+03, 2.7600e+02, 3.0800e+02, 0.0000e+00, 1.2300e+02,
        6.1700e+02, 5.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [1.8330e+03, 6.0000e+00, 1.4500e+02, 0.0000e+00, 1.2640e+03,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.000

In [105]:
import torch.nn.functional as F

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = torch.Tensor(X_train).to(torch.long)
X_test = torch.Tensor(X_test).to(torch.long)
# Convert y_train and y_test from an array of values between 0-3 to a one hot matrix tensor
y_train = F.one_hot(torch.Tensor(y_train.to_numpy()).to(torch.long))
y_test = F.one_hot(torch.Tensor(y_test.to_numpy()).to(torch.long))


In [106]:
class MeanLayer(nn.Module):

  def forward(self, x):
    return torch.mean(x, dim=1)

In [107]:
model = nn.Sequential(
    nn.Embedding(vocab_size, embedding_dim,_weight=weights),
    nn.Linear(embedding_dim, 100),
    nn.ReLU(),
    nn.Linear(100, 50),
    nn.ReLU(),
    MeanLayer(),
    nn.Linear(50, 4),
    nn.Softmax()
)

In [108]:
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [109]:
def train_cbow(X, y, model, loss_function, optimizer, epochs):
    for epoch in range(epochs):
        total_loss = 0

        # Step 1. Recall that torch *accumulates* gradients. Before passing in a new instance,
        # you need to zero out the gradients from the old instance
        optimizer.zero_grad()

        # Step 2. Run the forward pass, getting log probabilities over next words
        log_probs = model(X)
        # Step 3. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, y.to(torch.float))

        # Step 4. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Print progress
        if (epoch + 1) % 10 == 0:
            print('Epoch: {}, Loss: {:.4f}'.format(epoch + 1, total_loss))
    return model

In [110]:
trained_model = train_cbow(X_train, y_train, model, loss_function, optimizer, epochs=epochs)

Epoch: 10, Loss: 1.3865
Epoch: 20, Loss: 1.3865
Epoch: 30, Loss: 1.3865
Epoch: 40, Loss: 1.3865
Epoch: 50, Loss: 1.3865
Epoch: 60, Loss: 1.3865
Epoch: 70, Loss: 1.3865
Epoch: 80, Loss: 1.3865
Epoch: 90, Loss: 1.3865
Epoch: 100, Loss: 1.3865


## Exercise extra-credit: Make X and y a DataLoader, add batching, and validate the performance with the test set